In [99]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from scipy.spatial.distance import cosine
from string import punctuation
import re
from collections import Counter

In [100]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 

In [101]:
df = pd.read_csv('14k_words_VAD.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Word,V.Mean.Sum,V.SD.Sum,V.Rat.Sum,A.Mean.Sum,A.SD.Sum,A.Rat.Sum,D.Mean.Sum,D.SD.Sum,...,A.Rat.L,A.Mean.H,A.SD.H,A.Rat.H,D.Mean.L,D.SD.L,D.Rat.L,D.Mean.H,D.SD.H,D.Rat.H
0,1,aardvark,6.26,2.21,19,2.41,1.4,22,4.27,1.75,...,11,2.55,1.29,11,4.12,1.64,8,4.43,1.99,7
1,2,abalone,5.3,1.59,20,2.65,1.9,20,4.95,1.79,...,12,2.38,1.92,8,5.55,2.21,11,4.36,1.03,11
2,3,abandon,2.84,1.54,19,3.73,2.43,22,3.32,2.5,...,11,3.82,2.14,11,2.77,2.09,13,4.11,2.93,9
3,4,abandonment,2.63,1.74,19,4.95,2.64,21,2.64,1.81,...,14,5.29,2.63,7,2.31,1.45,16,3.08,2.19,12
4,5,abbey,5.85,1.69,20,2.2,1.7,20,5.0,2.02,...,9,2.55,1.92,11,4.83,2.18,18,5.43,1.62,7


In [4]:
print('Valence max value:', max(df['V.Mean.Sum']), 'and min value:', min(df['V.Mean.Sum']))
print('Arousal max value:', max(df['A.Mean.Sum']), 'and min value:', min(df['A.Mean.Sum']))

Valence max value: 8.53 and min value: 1.26
Arousal max value: 7.79 and min value: 1.6


In [9]:
high_valence_words = df[df['V.Mean.Sum'] > max(df['V.Mean.Sum']) - 1]
print('Numer of happy words:', len(high_valence_words))
print('Some examples: ' + high_valence_words.iloc[1]['Word'], high_valence_words.iloc[3]['Word'], high_valence_words.iloc[6]['Word'], sep=', ')

Numer of happy words: 210
Some examples: accomplishment, admiration, amazing


In [10]:
low_valence_words = df[df['V.Mean.Sum'] < min(df['V.Mean.Sum']) + 1]
print('Numer of happy words:', len(low_valence_words))
print('Some examples: ' + low_valence_words.iloc[0]['Word'], low_valence_words.iloc[3]['Word'], low_valence_words.iloc[6]['Word'], sep=', ')

Numer of happy words: 215
Some examples: abduction, acne, alcoholism


In [120]:
# print(np.mean(low_valence_words['A.Mean.Sum']))
sad_from_val = low_valence_words[low_valence_words['A.Mean.Sum'] < 5.35]
angry_from_val = low_valence_words[low_valence_words['A.Mean.Sum'] > 5.35]

In [19]:
# print(np.mean(high_valence_words['A.Mean.Sum']))
relaxed_from_val = high_valence_words[high_valence_words['A.Mean.Sum'] < 4.98]
happy_from_val = high_valence_words[high_valence_words['A.Mean.Sum'] > 4.98]

In [22]:
high_arousal_words = df[df['A.Mean.Sum'] > max(df['A.Mean.Sum']) - 1]
print('Numer of happy words:', len(high_arousal_words))
print('Some examples: ' + high_arousal_words.iloc[0]['Word'], high_arousal_words.iloc[3]['Word'], high_arousal_words.iloc[6]['Word'], sep=', ')

Numer of happy words: 74
Some examples: alarm, arsonist, attack


In [24]:
low_arousal_words = df[df['A.Mean.Sum'] < min(df['A.Mean.Sum']) + 1]
print('Numer of happy words:', len(low_arousal_words))
print('Some examples: ' + low_arousal_words.iloc[1]['Word'], low_arousal_words.iloc[3]['Word'], low_arousal_words.iloc[6]['Word'], sep=', ')

Numer of happy words: 204
Some examples: abbey, area, axle


In [26]:
# print(np.mean(low_arousal_words['V.Mean.Sum']))
sad_from_aro = low_arousal_words[low_arousal_words['V.Mean.Sum'] < 5.55]
relaxed_from_aro = low_arousal_words[low_arousal_words['V.Mean.Sum'] > 5.55]

In [30]:
# print(np.mean(high_arousal_words['V.Mean.Sum']))
angry_from_aro = high_arousal_words[high_arousal_words['V.Mean.Sum'] < 4.43]
happy_from_aro = high_arousal_words[high_arousal_words['V.Mean.Sum'] > 4.43]

In [103]:
# можно разбивать сильнее! например особенно эта группа!))
# happy_from_aro 
# и angry я бы назвала agressive

In [35]:
sad_words = sad_from_val['Word'].tolist()
sad_words.extend(sad_from_aro['Word'].tolist())
print('Numer of SAD:', len(sad_words))

Numer of SAD: 211


In [36]:
happy_words = happy_from_val['Word'].tolist()
happy_words.extend(happy_from_aro['Word'].tolist())
print('Numer of HAPPY:', len(happy_words))

Numer of HAPPY: 144


In [37]:
angry_words = angry_from_val['Word'].tolist()
angry_words.extend(angry_from_aro['Word'].tolist())
angry_words = list(set(angry_words))
print('Numer of ANGRY:', len(angry_words))

Numer of ANGRY: 123


In [38]:
relaxed_words = relaxed_from_val['Word'].tolist()
relaxed_words.extend(relaxed_from_aro['Word'].tolist())
relaxed_words = list(set(relaxed_words))
print('Numer of RELAXED:', len(relaxed_words))

Numer of RELAXED: 202


In [45]:
# еще хорошо бы убедиться, что одни и те же слова не попали в несколько групп

-------

In [2]:
model_name = 'bert-base-uncased'
emb_model = BertModel.from_pretrained(model_name, output_hidden_states=True)
tokenizer = BertTokenizer.from_pretrained(model_name)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [39]:
def get_subtoken_vectors(sent):
    sent_marked = "[CLS] " + sent + " [SEP]"
    tokenized = tokenizer.tokenize(sent_marked)
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized)
    segments_ids = [1] * len(tokenized)

    tokens_tensor = torch.tensor([indexed_tokens])
    segments_tensors = torch.tensor([segments_ids])

    with torch.no_grad():
        outputs = emb_model(tokens_tensor, segments_tensors)
        hidden_states = outputs[2]

    token_embeddings = torch.stack(hidden_states, dim=0)
    # delete batch dimension
    token_embeddings = torch.squeeze(token_embeddings, dim=1)
    # move tokens to first dimension
    token_embeddings = token_embeddings.permute(1,0,2)

    token_vecs_cat = []
    for token in token_embeddings:
        cat_vec = torch.cat((token[-1], token[-2], token[-3], token[-4]), dim=0)
        token_vecs_cat.append(cat_vec)
    return tokenized, token_vecs_cat

def get_subtokens_to_join(tokenized):
    to_join = []
    join = []
    for i, token in enumerate(tokenized):
        if token.startswith('##'):
            join.append(i)
        else:
            if join:
                join.insert(0, join[0]-1)
                to_join.append(join)
            join = []
    return to_join

def get_full_token(part_for_join, tokenized):
    full_token = ''
    token_vecs = []
    for sub_token in part_for_join:
        full_token += tokenized[sub_token].replace('##', '')
    return full_token

def get_full_token_vectors(sent):
    tokenized_sent, subtoken_vecs = get_subtoken_vectors(sent)
    to_join = get_subtokens_to_join(tokenized_sent)
    merged_join_list = []
    for seq in to_join:
        merged_join_list.extend(seq)
        
    join_list_ind = 0
    token_vectors = []
    token_words = []
    skip = []
    for i in range(len(subtoken_vecs)):
        if i in skip:
            continue
        if i in merged_join_list:
            join_indices = to_join[join_list_ind]
            token_vec = [np.array(subtoken_vecs[ind]) for ind in join_indices]
            full_token = get_full_token(join_indices, tokenized_sent)
            token_vec_av = np.average(np.array(token_vec), axis=0)
            skip.extend(join_indices)
            join_list_ind += 1
            token_vectors.append(token_vec_av)
            token_words.append(full_token)
        else:
            token_vectors.append(subtoken_vecs[i].numpy())
            token_words.append(tokenized_sent[i])
    return token_words[1:-1], token_vectors[1:-1]

In [40]:
word, vector = get_full_token_vectors('sadness')

In [43]:
def get_emotion_vectors(emo_list):
    emo_vectors = []
    for word in emo_list:
        tokens, vector = get_full_token_vectors(word)
        if len(tokens) > 1:
            print(tokens)
            mean_vector = np.average(np.array(vector), axis=0)
            emo_vectors.append(mean_vector)
        else:
            emo_vectors.append(vector[0])
    return emo_vectors

In [44]:
angry_vecs = get_emotion_vectors(angry_words)
sad_vecs = get_emotion_vectors(sad_words)
happy_vecs = get_emotion_vectors(happy_words)
relaxed_vecs = get_emotion_vectors(relaxed_words)

['heart', 'disease']
['breaking', 'and', 'entering']
['drunk', 'driving']
['measuring', 'tape']
['t', '-', 'shirt']


будет ли какая-то разница, если считать "средний" вектор всех грустных слов или посчитать отдельные и потом среднее посчитать уже у расстояний?

In [47]:
songs_df = pd.read_csv('lyrics.csv')
songs_df.head()

Unnamed: 0,Artist,Title,Mood,Text
0,George Michael,I Want Your Sex,happy,[Verse 1]\nIt’s late\nTime for bed\nSo I sit a...
1,Rob Zombie,Pussy Liquor,angry,Baby :\nGimme a B\nGimme a A\nGimme a B\nGimme...
2,Katatonia,12,sad,[Chorus]\nBlack theatre of love\nViolet dancer...
3,Bing Crosby,Swinging On A Star,happy,Would you like to swing on a star\nCarry moonb...
4,Ludacris,Get Back,angry,"[Intro]\nHeads up! (woop, woop) Heads up! (woo..."


In [48]:
example_text_1 = songs_df.iloc[0].Text
# print(example_text_1)

In [49]:
# удаляю пунктуацию - спорное решение учитывая что берт контекстный
lines = example_text_1.split('\n')
lines = [re.sub('[{}]'.format(re.escape(punctuation)), '', line.lower()) for line in lines if not line.startswith('[') and line != '']

In [52]:
stop_words = set(stopwords.words('english')) 
clean_lines = []
for line in lines:
    tokens = word_tokenize(line)
    tokens = [token for token in tokens if token not in stop_words]
    clean_lines.append(' '.join(tokens))

In [54]:
def get_mean_emo_dist(emo_list, vec):
    dists = []
    for emo_vec in emo_list:
        dist = 1 - cosine(vec, emo_vec)
        dists.append(dist)
    return np.mean(dists)

In [55]:
# тут не дистансы а симилярити......?
def get_line_mood(line):
    tokens, vectors = get_full_token_vectors(line)
    sad, happy, angry, relaxed = [], [], [], []
    for vec in vectors:
        sad_mean = get_mean_emo_dist(sad_vecs, vec)
        happy_mean = get_mean_emo_dist(happy_vecs, vec)
        angry_mean = get_mean_emo_dist(angry_vecs, vec)
        relaxed_mean = get_mean_emo_dist(relaxed_vecs, vec)
        sad.append(sad_mean)
        happy.append(happy_mean)
        angry.append(angry_mean)
        relaxed.append(relaxed_mean)
    emos = ['sad', 'relaxed', 'happy', 'angry']
    means = [np.mean(sad), np.mean(relaxed), np.mean(happy), np.mean(angry)]
    return emos[np.argmax(means)]

In [56]:
moods = []
for line in clean_lines:
    moods.append(get_line_mood(line))

In [57]:
Counter(moods).most_common()

[('happy', 15), ('relaxed', 8), ('angry', 1)]

In [59]:
example_text_2 = songs_df.iloc[14].Text
# print(example_text_2)
lines = example_text_2.split('\n')
lines = [re.sub('[{}]'.format(re.escape(punctuation)), '', line.lower()) for line in lines if not line.startswith('[') and line != '']
clean_lines = []
for line in lines:
    tokens = word_tokenize(line)
    tokens = [token for token in tokens if token not in stop_words]
    clean_lines.append(' '.join(tokens))

In [60]:
moods = []
for line in clean_lines:
    moods.append(get_line_mood(line))

In [61]:
Counter(moods).most_common()

[('angry', 47), ('happy', 10), ('relaxed', 2)]

In [88]:
example_text_3 = songs_df.iloc[39].Text
lines = example_text_3.split('\n')
lines = [re.sub('[{}]'.format(re.escape(punctuation)), '', line.lower()) for line in lines if not line.startswith('[') and line != '']
clean_lines = []
for line in lines:
    tokens = word_tokenize(line)
    tokens = [token for token in tokens if token not in stop_words]
    clean_lines.append(' '.join(tokens))

In [89]:
moods = []
for line in clean_lines:
    moods.append(get_line_mood(line))

In [90]:
Counter(moods).most_common()

[('relaxed', 5), ('happy', 4), ('angry', 3)]

In [95]:
example_text_4 = songs_df.iloc[78].Text
lines = example_text_4.split('\n')
lines = [re.sub('[{}]'.format(re.escape(punctuation)), '', line.lower()) for line in lines if not line.startswith('[') and line != '']
clean_lines = []
for line in lines:
    tokens = word_tokenize(line)
    tokens = [token for token in tokens if token not in stop_words]
    clean_lines.append(' '.join(tokens))

In [96]:
moods = []
for line in clean_lines:
    moods.append(get_line_mood(line))

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [97]:
Counter(moods).most_common()

[('happy', 15), ('angry', 15), ('relaxed', 5), ('sad', 2)]

In [118]:
sad_words

['abduction',
 'abusive',
 'acne',
 'afraid',
 'AIDS',
 'alcoholism',
 'amputation',
 'appendicitis',
 'asbestos',
 'blackmailer',
 'bloodbath',
 'cancer',
 'castrate',
 'castration',
 'cellmate',
 'chemo',
 'chemotherapy',
 'cirrhosis',
 'coma',
 'comatose',
 'criminal',
 'dead',
 'deadly',
 'deathbed',
 'destitute',
 'disastrous',
 'dreaded',
 'embolism',
 'enemy',
 'excrement',
 'excruciating',
 'extortion',
 'fag',
 'failure',
 'feces',
 'felon',
 'flunk',
 'fraud',
 'funeral',
 'gonorrhea',
 'greedy',
 'headache',
 'helpless',
 'HIV',
 'homeless',
 'hopeless',
 'illness',
 'incarceration',
 'incest',
 'incurable',
 'infection',
 'insecurity',
 'insulting',
 'jail',
 'leprosy',
 'lice',
 'lifeless',
 'lynching',
 'menopause',
 'misery',
 'mistrust',
 'moldy',
 'molester',
 'morgue',
 'mortuary',
 'motherless',
 'mourning',
 'mutilate',
 'nausea',
 'nauseous',
 'neglect',
 'negligent',
 'overworked',
 'parasitic',
 'parkinsons',
 'pedophile',
 'perish',
 'pimple',
 'pollute',
 'poll

Конечно работает далеко не идеально - часто выдает happy для песен, в которых я бы сама не смогла определить эмоцию. Мб уменьшать вес для happy? С сэд вообще очень печально - почти невозможно получить грустную песню. Мб связано со списком грустных слов?

In [98]:
sad_words

['abduction',
 'abusive',
 'acne',
 'afraid',
 'AIDS',
 'alcoholism',
 'amputation',
 'appendicitis',
 'asbestos',
 'blackmailer',
 'bloodbath',
 'cancer',
 'castrate',
 'castration',
 'cellmate',
 'chemo',
 'chemotherapy',
 'cirrhosis',
 'coma',
 'comatose',
 'criminal',
 'dead',
 'deadly',
 'deathbed',
 'destitute',
 'disastrous',
 'dreaded',
 'embolism',
 'enemy',
 'excrement',
 'excruciating',
 'extortion',
 'fag',
 'failure',
 'feces',
 'felon',
 'flunk',
 'fraud',
 'funeral',
 'gonorrhea',
 'greedy',
 'headache',
 'helpless',
 'HIV',
 'homeless',
 'hopeless',
 'illness',
 'incarceration',
 'incest',
 'incurable',
 'infection',
 'insecurity',
 'insulting',
 'jail',
 'leprosy',
 'lice',
 'lifeless',
 'lynching',
 'menopause',
 'misery',
 'mistrust',
 'moldy',
 'molester',
 'morgue',
 'mortuary',
 'motherless',
 'mourning',
 'mutilate',
 'nausea',
 'nauseous',
 'neglect',
 'negligent',
 'overworked',
 'parasitic',
 'parkinsons',
 'pedophile',
 'perish',
 'pimple',
 'pollute',
 'poll

In [94]:
songs_df.iloc[78].Text

"I used to laugh it off\nI used to look the other way\nI used to save them troubles for another day\nI kept my fingers crossed\nI used to never take the blame\nI'd pull a sunshine story in the pouring rain\nThe more I had to change, I just stayed the same\nThe same, the same, I stayed the same\nYou don't know what you've got, 'til you're missing it a lot\nI had to go throw it away\nI was wrong from the start\nFrom the bottom of my heart, I apologize\nWhat I did to you was hurtful\nWhat I'm going through is hurtful\nI used to buy my time\nI used to beat around the bush\nI'd rather give my ego another push\nI used to be a fool\nIt was a foolish game I played\nAnd it's a fool's faith, common mistakes I made\nOnce I had it right, it was all too late\nToo late, too late, too little too late\nYou don't know what you've got, 'til you're missing it a lot\nI had to go throw it away\nI was wrong from the start\nFrom the bottom of my heart, I apologize\nWhat I did to you was hurtful\nWhat I'm goi

In [73]:
# ваще-то должно быть грустным

In [93]:
songs_df.head(79)

Unnamed: 0,Artist,Title,Mood,Text
0,George Michael,I Want Your Sex,happy,[Verse 1]\nIt’s late\nTime for bed\nSo I sit a...
1,Rob Zombie,Pussy Liquor,angry,Baby :\nGimme a B\nGimme a A\nGimme a B\nGimme...
2,Katatonia,12,sad,[Chorus]\nBlack theatre of love\nViolet dancer...
3,Bing Crosby,Swinging On A Star,happy,Would you like to swing on a star\nCarry moonb...
4,Ludacris,Get Back,angry,"[Intro]\nHeads up! (woop, woop) Heads up! (woo..."
...,...,...,...,...
74,Reel Big Fish,"New York, New York",happy,Start spreadin' the news\nI'm leaving today\nI...
75,Loverboy,Hot Girls In Love,happy,She's so young at heart\nShe likes the pleasur...
76,Judas Priest,Bullet Train,angry,[Verse 1]\nSunrise showing every flaw\nPaying ...
77,The Foreshadowing,The Wandering,sad,Our souls are not ready for life\nWe're gonna ...
