<a href="https://colab.research.google.com/github/tig1795/predicting_hits_ML_MA/blob/main/code/2_sentimental%20Analyse/Sentimental_Analyse.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports

In [19]:
from google.colab import drive
# Mount Google Drive under /content/gdrive; the dataset CSVs used later in this notebook
# are read from and written to paths below this mount point.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [20]:
import tensorflow as tf

# Ask TensorFlow which GPU device it can see.
device_name = tf.test.gpu_device_name()

# Stop right away unless the reported name is exactly the first GPU;
# otherwise report the device that was found.
if device_name != '/device:GPU:0':
    raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [21]:
import torch

# Choose the PyTorch device: fall back to the CPU when CUDA is unavailable,
# otherwise select the GPU and report how many GPUs exist and the name of GPU 0.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Beispiel mit Sätzen

In [None]:
# Import required packages
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer

# Create class for data preparation
class SimpleDataset:
    """Minimal indexable dataset wrapped around batched tokenizer output.

    ``tokenized_texts`` is a mapping from field name to a per-example sequence.
    It must contain an ``"input_ids"`` entry; the length of that entry defines
    the size of the dataset.
    """

    def __init__(self, tokenized_texts):
        # Only a reference is stored; the tokenizer output is not copied.
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        # One example per row of "input_ids".
        input_ids = self.tokenized_texts["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        # Build a single example by taking position ``idx`` from every field.
        example = {}
        for field, values in self.tokenized_texts.items():
            example[field] = values[idx]
        return example

In [None]:
# Load tokenizer and model, create trainer
# Checkpoint identifier passed to both from_pretrained calls below.
model_name = "siebert/sentiment-roberta-large-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# The Trainer is built from the model alone; this notebook only uses it for trainer.predict.
trainer = Trainer(model=model)

Downloading (…)okenizer_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

In [None]:
# Example inputs for the demo; any list of strings works
# (it could just as well be loaded from a .csv or .xls file).
pred_texts = [
    'I like that',
    'That is annoying',
    'This is great!',
    'Wouldn´t recommend it.',
]

In [None]:
# Encode all texts in a single tokenizer call (truncation and padding enabled),
# then wrap the encoding so the Trainer can fetch one example at a time.
tokenized_texts = tokenizer(
    pred_texts,
    padding=True,
    truncation=True,
)
pred_dataset = SimpleDataset(tokenized_texts)

In [None]:
# Run predictions
# The result exposes the raw model outputs as `predictions.predictions`,
# which the following cell turns into labels and scores.
predictions = trainer.predict(pred_dataset)

In [None]:
# Transform predictions to labels
# `predictions.predictions` (identical to `predictions[0]`) holds one row of class
# logits per input text.
logits = predictions.predictions
# Predicted class id = position of the largest logit; map ids to the model's label names.
preds = logits.argmax(-1)
labels = pd.Series(preds).map(model.config.id2label)
# Score = softmax probability of the predicted class.
# The row-wise maximum is subtracted before exponentiating: this leaves the softmax
# unchanged mathematically but keeps np.exp from overflowing (inf / inf = nan) when
# logits are large.
probs = np.exp(logits - logits.max(-1, keepdims=True))
probs /= probs.sum(-1, keepdims=True)
scores = probs.max(1)

In [None]:
# Assemble one row per input text: the text itself, the predicted class id,
# the label name, and the confidence score. Show the first rows.
df = pd.DataFrame(
    list(zip(pred_texts, preds, labels, scores)),
    columns=['text', 'pred', 'label', 'score'],
)
df.head()

Unnamed: 0,text,pred,label,score
0,I like that,1,POSITIVE,0.998657
1,That is annoying,0,NEGATIVE,0.999409
2,This is great!,1,POSITIVE,0.998727
3,Wouldn´t recommend it.,0,NEGATIVE,0.999486


# Anwendung auf den ganzen Datensatz

In [22]:
# Install the transformers library
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [23]:
# Import required packages
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer

# Create class for data preparation
class SimpleDataset:
    """Minimal indexable dataset wrapped around batched tokenizer output.

    ``tokenized_texts`` is a mapping from field name to a per-example sequence.
    It must contain an ``"input_ids"`` entry; the length of that entry defines
    the size of the dataset.
    """

    def __init__(self, tokenized_texts):
        # Only a reference is stored; the tokenizer output is not copied.
        self.tokenized_texts = tokenized_texts

    def __len__(self):
        # One example per row of "input_ids".
        input_ids = self.tokenized_texts["input_ids"]
        return len(input_ids)

    def __getitem__(self, idx):
        # Build a single example by taking position ``idx`` from every field.
        example = {}
        for field, values in self.tokenized_texts.items():
            example[field] = values[idx]
        return example

In [24]:
# Load tokenizer and model, create trainer
# Checkpoint identifier passed to both from_pretrained calls below.
model_name = "siebert/sentiment-roberta-large-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# The Trainer is built from the model alone; this notebook only uses it for trainer.predict.
trainer = Trainer(model=model)

In [1]:
from google.colab import drive
# Mount Google Drive under /content/gdrive; the CSV files read and written by the
# following cells live below this mount point.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [28]:
# Example: Import data from csv-file stored on Google Drive
# The path assumes Google Drive is mounted at /content/gdrive.

df = pd.read_csv('/content/gdrive/My Drive/Colab Datasets/dataset_added_values_no_duplicates.csv')

In [30]:
# Collect the lyrics as a plain list of Python strings for the tokenizer.
# NOTE(review): dropna() removes rows whose lyrics are missing, so list positions match
# df's row index only when no lyrics are missing. The index-based merge further down
# relies on that alignment.
pred_texts = (
    df['lyrics']
    .dropna()
    .astype('str')
    .tolist()
)

# Analyse

In [31]:
# Encode all lyrics in a single tokenizer call (truncation and padding enabled),
# then wrap the encoding so the Trainer can fetch one example at a time.
tokenized_texts = tokenizer(
    pred_texts,
    padding=True,
    truncation=True,
)
pred_dataset = SimpleDataset(tokenized_texts)

In [32]:
# Run predictions
# The result exposes the raw model outputs as `predictions.predictions`,
# which the following cell turns into labels and scores.
predictions = trainer.predict(pred_dataset)

In [33]:
# Transform predictions to labels
# `predictions.predictions` (identical to `predictions[0]`) holds one row of class
# logits per input text.
logits = predictions.predictions
# Predicted class id = position of the largest logit; map ids to the model's label names.
preds = logits.argmax(-1)
labels = pd.Series(preds).map(model.config.id2label)
# Score = softmax probability of the predicted class.
# The row-wise maximum is subtracted before exponentiating: this leaves the softmax
# unchanged mathematically but keeps np.exp from overflowing (inf / inf = nan) when
# logits are large.
probs = np.exp(logits - logits.max(-1, keepdims=True))
probs /= probs.sum(-1, keepdims=True)
scores = probs.max(1)

In [34]:
# Assemble one row per lyric: the text itself, the predicted class id,
# the label name, and the confidence score. Show the first five rows.
sentiment_df = pd.DataFrame(
    list(zip(pred_texts, preds, labels, scores)),
    columns=['text', 'pred', 'label', 'score'],
)
sentiment_df.head(5)

Unnamed: 0,text,pred,label,score
0,here she comes mmm just like an angelseems ...,1,POSITIVE,0.988107
1,look into my eyes you will seewhat you mean t...,1,POSITIVE,0.998717
2,i can't get no satisfactioni can't get no sati...,0,NEGATIVE,0.999485
3,oh i i just died in your arms tonightit must...,0,NEGATIVE,0.99831
4,your love is fadin' i feel it fadeah your lo...,0,NEGATIVE,0.997664


In [36]:
# Sanity check: one row per analysed lyric and four columns (text, pred, label, score).
sentiment_df.shape

(3669, 4)

In [38]:
# Inspect the complete text of one row (index label 2) to see the exact input the model received.
sentiment_df.text[2]

'i can\'t get no satisfactioni can\'t get no satisfaction\'cause i try and i try and i try and i tryi can\'t get no  i can\'t get nowhen i\'m drivin\' in my carand a man comes on the radiohe\'s tellin\' me more and moreabout some useless informationsupposed to fire my imaginationi can\'t get no  oh no  no  nohey  hey  hey  that\'s what i sayi can\'t get no satisfactioni can\'t get no satisfaction\'cause i try and i try and i try and i tryi can\'t get no  i can\'t get nowhen i\'m watchin\' my tvand a man comes on and tells mehow white my shirts can bebut he can\'t be a man \'cause he doesn\'t smokethe same cigarettes as mei can\'t get no  oh no  no  nohey  hey  hey  that\'s what i sayi can\'t get no satisfactioni can\'t get no girl reaction\'cause i try and i try and i try and i tryi can\'t get no  i can\'t get nowhen i\'m ridin\' \'round the worldand i\'m doin\' this and i\'m signin\' thatand i\'m tryin\' to make some girlwho tells me  "baby better come back maybe next week\'cause you 

In [40]:
from collections import Counter

# Count how many lyrics were assigned each sentiment label.
Counter(sentiment_df.label)

Counter({'POSITIVE': 2723, 'NEGATIVE': 946})

In [41]:
# Persist the sentiment results to Google Drive. The row index is written as the first
# CSV column (index=False is not passed); the later read_csv uses index_col=0 to read it back.
sentiment_df.to_csv('/content/gdrive/My Drive/Colab Datasets/dataset_sentiments.csv')

# Zusammenführen der beiden Datensätze

In [2]:
import pandas as pd 

# Reload the original dataset (the same file the sentiment analysis above was run on).
df1 = pd.read_csv('/content/gdrive/My Drive/Colab Datasets/dataset_added_values_no_duplicates.csv')

In [11]:
# Reload the saved sentiment results; index_col=0 uses the first CSV column as the row index.
df2 = pd.read_csv('/content/gdrive/My Drive/Colab Datasets/dataset_sentiments.csv', index_col=0)

In [4]:
# Preview the first five rows of the original dataset.
df1.head(5)

Unnamed: 0,lyrics,genre,num_syllables,pos,year,fog_index,flesch_index,num_lines,sentiment,title,...,artist,difficult_words,num_dupes,number_of_tokens,number_of_types,decades,stemmed_text,POS_tags,POS,values
0,here she comes mmm just like an angelseems ...,"['alternative rock', 'glam metal', 'hard rock']",405.0,27,1990,4.8,89.75,43,"{'neg': 0.156, 'neu': 0.714, 'pos': 0.131, 'co...",(Can't Live Without Your) Love And Affection,...,Nelson,21,23,323.0,133.0,1990s,here she come mmm just like an angelseem like...,"['_SP', 'RB', 'PRP', 'VBZ', '_SP', 'FW', '_SP'...",missingcan't outside hard mindbut nothing chan...,0.037037
1,look into my eyes you will seewhat you mean t...,"['acoustic rock', 'adult contemporary', 'album...",258.3,1,1991,4.4,107.69,36,"{'neg': 0.14100000000000001, 'neu': 0.708, 'po...",(Everything I Do) I Do It For You,...,Bryan Adams,6,12,233.0,95.0,1990s,look into my eye you will seewhat you mean to...,"['VB', 'IN', 'PRP$', 'NNS', '_SP', 'PRP', 'MD'...",worth worth worth other worth worth heart soul...,1.0
2,i can't get no satisfactioni can't get no sati...,"['blues', 'blues rock', 'britannique', 'britis...",296.1,3,1965,4.8,106.67,37,"{'neg': 0.126, 'neu': 0.667, 'pos': 0.20800000...",(I Can't Get No) Satisfaction,...,Rolling Stones,18,22,260.0,83.0,1960s,i ca n't get no satisfactioni ca n't get no s...,"['PRP', 'MD', 'RB', 'VB', 'DT', 'NN', 'MD', 'R...",satisfaction'cause nowhen drivin useless satis...,0.333333
3,oh i i just died in your arms tonightit must...,['classic pop and rock'],372.6,32,1987,4.4,99.23,45,"{'neg': 0.164, 'neu': 0.766, 'pos': 0.07, 'com...",(I Just) Died In Your Arms,...,Cutting Crew,26,31,310.0,130.0,1980s,oh i i just die in your arm tonightit must 'v...,"['UH', '_SP', 'PRP', '_SP', 'PRP', 'RB', 'VBD'...",easy thisher final many long hot easy gonei to...,0.03125
4,your love is fadin' i feel it fadeah your lo...,"['american', 'psychedelic rock', 'psychedelic ...",251.1,55,1970,4.4,99.23,31,"{'neg': 0.148, 'neu': 0.795, 'pos': 0.057, 'co...",(I Know) I'm Losing You,...,Rare Earth,18,3,203.0,104.0,1970s,your love is fadin ' i feel it fadeah your lo...,"['PRP$', 'NN', 'VBZ', 'VBG', ""''"", '_SP', 'PRP...",cancan eyesa hearted worried love love love wo...,0.018182


In [5]:
# Row/column count of the original dataset; the row count must match df2 for the index-based merge.
df1.shape

(3669, 21)

In [12]:
# Preview the first five rows of the sentiment results.
df2.head(5)

Unnamed: 0,text,pred,label,score
0,here she comes mmm just like an angelseems ...,1,POSITIVE,0.988107
1,look into my eyes you will seewhat you mean t...,1,POSITIVE,0.998717
2,i can't get no satisfactioni can't get no sati...,0,NEGATIVE,0.999485
3,oh i i just died in your arms tonightit must...,0,NEGATIVE,0.99831
4,your love is fadin' i feel it fadeah your lo...,0,NEGATIVE,0.997664


In [14]:
# Row/column count of the sentiment results; compare the row count with df1.shape.
df2.shape

(3669, 4)

In [19]:
# Merge the sentiment columns of df2 into df1 based on their row indexes.
#
# df2's rows were computed from the lyrics of the same CSV in file order (after dropna),
# so the two indexes only line up when no row was dropped. Fail loudly instead of
# silently attaching sentiments to the wrong songs if the frames differ in length.
if len(df1) != len(df2):
    raise ValueError(
        'df1 has %d rows but df2 has %d; an index-based merge would misalign the sentiments'
        % (len(df1), len(df2))
    )
# df2's `text` column duplicates df1's `lyrics`, and the recorded result (24 columns,
# no `text`) was produced without it, so drop it here explicitly rather than depending
# on kernel state. errors='ignore' keeps the cell re-runnable if `text` is already gone.
result = df1.merge(
    df2.drop(columns=['text'], errors='ignore'),
    left_index=True,
    right_index=True,
)

In [20]:
# Preview the merged frame: original columns followed by the sentiment columns.
result.head(5)

Unnamed: 0,lyrics,genre,num_syllables,pos,year,fog_index,flesch_index,num_lines,sentiment,title,...,number_of_tokens,number_of_types,decades,stemmed_text,POS_tags,POS,values,pred,label,score
0,here she comes mmm just like an angelseems ...,"['alternative rock', 'glam metal', 'hard rock']",405.0,27,1990,4.8,89.75,43,"{'neg': 0.156, 'neu': 0.714, 'pos': 0.131, 'co...",(Can't Live Without Your) Love And Affection,...,323.0,133.0,1990s,here she come mmm just like an angelseem like...,"['_SP', 'RB', 'PRP', 'VBZ', '_SP', 'FW', '_SP'...",missingcan't outside hard mindbut nothing chan...,0.037037,1,POSITIVE,0.988107
1,look into my eyes you will seewhat you mean t...,"['acoustic rock', 'adult contemporary', 'album...",258.3,1,1991,4.4,107.69,36,"{'neg': 0.14100000000000001, 'neu': 0.708, 'po...",(Everything I Do) I Do It For You,...,233.0,95.0,1990s,look into my eye you will seewhat you mean to...,"['VB', 'IN', 'PRP$', 'NNS', '_SP', 'PRP', 'MD'...",worth worth worth other worth worth heart soul...,1.0,1,POSITIVE,0.998717
2,i can't get no satisfactioni can't get no sati...,"['blues', 'blues rock', 'britannique', 'britis...",296.1,3,1965,4.8,106.67,37,"{'neg': 0.126, 'neu': 0.667, 'pos': 0.20800000...",(I Can't Get No) Satisfaction,...,260.0,83.0,1960s,i ca n't get no satisfactioni ca n't get no s...,"['PRP', 'MD', 'RB', 'VB', 'DT', 'NN', 'MD', 'R...",satisfaction'cause nowhen drivin useless satis...,0.333333,0,NEGATIVE,0.999485
3,oh i i just died in your arms tonightit must...,['classic pop and rock'],372.6,32,1987,4.4,99.23,45,"{'neg': 0.164, 'neu': 0.766, 'pos': 0.07, 'com...",(I Just) Died In Your Arms,...,310.0,130.0,1980s,oh i i just die in your arm tonightit must 'v...,"['UH', '_SP', 'PRP', '_SP', 'PRP', 'RB', 'VBD'...",easy thisher final many long hot easy gonei to...,0.03125,0,NEGATIVE,0.99831
4,your love is fadin' i feel it fadeah your lo...,"['american', 'psychedelic rock', 'psychedelic ...",251.1,55,1970,4.4,99.23,31,"{'neg': 0.148, 'neu': 0.795, 'pos': 0.057, 'co...",(I Know) I'm Losing You,...,203.0,104.0,1970s,your love is fadin ' i feel it fadeah your lo...,"['PRP$', 'NN', 'VBZ', 'VBG', ""''"", '_SP', 'PRP...",cancan eyesa hearted worried love love love wo...,0.018182,0,NEGATIVE,0.997664


In [21]:
# The merged frame should keep every row of df1.
result.shape

(3669, 24)

In [23]:
from collections import Counter

# Label distribution after the merge; it should equal the distribution counted on sentiment_df.
Counter(result.label)

Counter({'POSITIVE': 2723, 'NEGATIVE': 946})

In [26]:
# Spot check: score of the row with index label 1435 in the sentiment results ...
df2.score[1435]

0.99879414

In [27]:
# ... which must be identical for the same index label in the merged frame.
result.score[1435]

0.99879414

In [28]:
# Save the merged dataset to Google Drive; index=False leaves the row index out of the file.
result.to_csv('/content/gdrive/My Drive/Colab Datasets/dataset_new_sa_values.csv', index=False)