In [1]:
import pandas as pd
import numpy as np
import pickle

# Data Primary

Task: classify the tweet as 'neutral or no emotion' or as one, or more, of eleven given emotions that best represent the mental state of the tweeter:

- anger (also includes annoyance and rage) 
- anticipation (also includes interest and vigilance)
- disgust (also includes disinterest, dislike and loathing)
- fear (also includes apprehension, anxiety, concern, and terror)
- joy (also includes serenity and ecstasy) 
- love (also includes affection)
- optimism (also includes hopefulness and confidence) 
- pessimism (also includes cynicism and lack of confidence) 
- sadness (also includes pensiveness and grief) 
- suprise (also includes distraction and amazement) 
- trust (also includes acceptance, liking, and admiration) 



In [2]:
pd.set_option('display.max_colwidth', 500)

In [4]:
folder_path = "/Users/macpro/Desktop/data_nlp"
file_name = folder_path +  "/data/primary_data/2018-E-c-En-dev.txt"
dev_data = pd.read_csv(file_name, sep='\t',  header='infer')
len(dev_data)

886

In [28]:
# dev_data.head()

In [7]:
label_list = list(dev_data.drop(['ID', 'Tweet'], axis=1).columns)

In [29]:
# dev_data[dev_data['sadness']==1]

In [16]:
file_name = folder_path + "/data/primary_data/2018-E-c-En-train.txt"
train_data = pd.read_csv(file_name, sep='\t',  header='infer')
len(train_data)

6838

In [17]:
train_data.head()

Unnamed: 0,ID,Tweet,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,2017-En-21441,“Worry is a down payment on a problem you may never have'. Joyce Meyer. #motivation #leadership #worry,0,1,0,0,0,0,1,0,0,0,1
1,2017-En-31535,Whatever you decide to do make sure it makes you #happy.,0,0,0,0,1,1,1,0,0,0,0
2,2017-En-21068,"@Max_Kellerman it also helps that the majority of NFL coaching is inept. Some of Bill O'Brien's play calling was wow, ! #GOPATS",1,0,1,0,1,0,1,0,0,0,0
3,2017-En-31436,Accept the challenges so that you can literally even feel the exhilaration of victory.' -- George S. Patton 🐶,0,0,0,0,1,0,1,0,0,0,0
4,2017-En-22195,My roommate: it's okay that we can't spell because we have autocorrect. #terrible #firstworldprobs,1,0,1,0,0,0,0,0,0,0,0


In [91]:
file_name = folder_path + "/data/primary_data/2018-E-c-En-test-gold.txt"
test_data = pd.read_csv(file_name, sep='\t',  header='infer')
len(test_data)

3259

In [92]:
len(dev_data) + len(train_data) + len(test_data)

10983

**У початковому(головному) маємо 11 тис висловлювань**

# Baseline

In [30]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import hamming_loss

In [19]:
df_dataset = train_data
df_dataset['Tags'] = df_dataset[label_list].apply(lambda row: [col for col, b in zip(dev_data[label_list].columns, row) if b],
                   axis=1)
df_dataset['Text'] = df_dataset['Tweet']

### Creating tf-idf feature
1. encode labels

2. конвертує колекцію документів(висловлювань у нашому випадку) до матриці з token counts
результат - матриця, де рядок - документ, стовпчик - слово. Матриця заповнена числами - кількість вживання кожного слова з корпусу у кожному документі.

3. обчислює tf-idf
трансформує попередню матрицю, таким чином знаходить суттєві для документу слова

4. виконуємо random over-sampling, компенсуючи незбалансування класів у вибірці

5. Класифікатор One-vs-rest (or one-vs-all)
Тренуємо бінарні класфікатори для кожного класу, обираємо те передбачення, що має найвижчий score

6. Обчислюємо label-based метрики (Hamming score and loss)

In [20]:
multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(df_dataset.Tags)
Y = multilabel_binarizer.transform(df_dataset.Tags)

count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(df_dataset.Text)

tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_counts)

In [21]:
ros = RandomOverSampler(random_state=9000)
X_tfidf_resampled, Y_tfidf_resampled = ros.fit_sample(X_tfidf, Y)

In [22]:
x_train_tfidf, x_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_tfidf_resampled, Y_tfidf_resampled, test_size=0.2, random_state=9000)

### OneVsRest with different classifiers
use OneVsRest strategy to have one classifier for each class/label

In [25]:
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    '''
    Compute the Hamming score (a.k.a. label-based accuracy) for the multi-label case
    '''
    acc_list = []
    for i in range(y_true.shape[0]):
        set_true = set( np.where(y_true[i])[0] )
        set_pred = set( np.where(y_pred[i])[0] )
        tmp_a = None
        if len(set_true) == 0 and len(set_pred) == 0:
            tmp_a = 1
        else:
            tmp_a = len(set_true.intersection(set_pred))/float(len(set_true.union(set_pred)) )
        acc_list.append(tmp_a)
    return np.mean(acc_list)

def print_score(y_pred, clf):
    print("Clf: ", clf.__class__.__name__)
    print("Hamming loss: {}".format(hamming_loss(y_pred, y_test_tfidf)))
    print("Hamming score: {}".format(hamming_score(y_pred, y_test_tfidf)))
    print("---")   

In [26]:
nb_clf = MultinomialNB()
sgd = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=6, tol=None)
lr = LogisticRegression()
mn = MultinomialNB()

for classifier in [nb_clf, sgd, lr, mn]:
    clf = OneVsRestClassifier(classifier)
    clf.fit(x_train_tfidf, y_train_tfidf)
    y_pred = clf.predict(x_test_tfidf)
    print_score(y_pred, classifier)

Clf:  MultinomialNB
Hamming loss: 0.05760553129548763
Hamming score: 0.4586248873639704
---
Clf:  SGDClassifier
Hamming loss: 0.07878457059679767
Hamming score: 0.21397379912663755
---




Clf:  LogisticRegression
Hamming loss: 0.04052037845705968
Hamming score: 0.6135371179039302
---
Clf:  MultinomialNB
Hamming loss: 0.05760553129548763
Hamming score: 0.4586248873639704
---


# Data additional
Щоб збільшити вибірку знаходжу кілька наборів даних, які мають схожі класи і пробую поставити у відповідність новим класам наявні основні

# 1 Data 18 classes

In [281]:
file_name = folder_path + "/data/18_emotions/primary-plutchik-wheel-DFE.csv"
ei_data = pd.read_csv(file_name, sep=',',  header='infer')
ei_data = ei_data[ei_data['emotion'] != 'Awe']
len(ei_data)

2492

In [282]:
ei_data.head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,emotion,emotion:confidence,emotion_gold,id,idiom_id,sentence
0,731671736,False,finalized,5,6/8/15 16:10,Neutral,0.3333,,1,1,"How much of the forecast was genuine and how much was fixed, it is a moot point."
1,731671737,False,finalized,5,6/12/15 14:33,Neutral,0.3885,,2,2,"I did touch them one time you see but of course there was nothing doing, he wanted me."
2,731671738,False,finalized,5,6/8/15 16:10,Neutral,0.3333,,3,3,We find that choice theorists admit that they introduce a style of moral paternalism at odds with liberal values.
3,731671739,False,finalized,5,6/8/15 16:30,Neutral,0.369,,4,4,"Well, here I am with an olive branch."
4,731671740,False,finalized,5,6/8/15 16:30,Neutral,0.5572,,5,5,"Its rudder and fin were both knocked out, and a four-foot-long gash in the shell meant even repairs on the bank were out of the question."


In [283]:
ei_data[['emotion', 'emotion_gold', 'sentence']].head()

Unnamed: 0,emotion,emotion_gold,sentence
0,Neutral,,"How much of the forecast was genuine and how much was fixed, it is a moot point."
1,Neutral,,"I did touch them one time you see but of course there was nothing doing, he wanted me."
2,Neutral,,We find that choice theorists admit that they introduce a style of moral paternalism at odds with liberal values.
3,Neutral,,"Well, here I am with an olive branch."
4,Neutral,,"Its rudder and fin were both knocked out, and a four-foot-long gash in the shell meant even repairs on the bank were out of the question."


In [284]:
dict_18_classes_lables_2 = {'Anger':['anger'],
'Anticipation': ['anticipation'],
'Disgust': ['disgust'],
'Disapproval' : ['disgust'],
'Fear' : ['fear'],
'Joy' : ['joy'],
'Love' : ['love'],    
'Optimism' : ['optimism'],
'Sadness' : ['sadness'],
'Surprise' : ['surprise'],
'Trust' : ['trust'],
'Contempt': ['disgust'],
'Neutral': ['neutral'],
'Aggression': ['anger','disgust'],
'Remorse':  ['sadness', 'pessimism'],
'Submission': ['pessimism'],
'Awe': [],
'Ambiguous': ['neutral']}
len(dict_18_classes_lables_2)

18

In [194]:
# 18
# ei_data[ei_data['emotion']== 'Submission']

In [285]:
ei_data['emotion_gold_list'] = ei_data[ei_data['emotion_gold'].notna()]['emotion_gold']\
    .map(lambda x: list(set(np.concatenate( [dict_18_classes_lables_2[i] for i in x.split('\n')]))))


In [273]:
ei_data[ei_data['emotion_gold'].notna()].head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,emotion,emotion:confidence,emotion_gold,id,idiom_id,sentence,emotion_gold_list
23,731671759,True,golden,22,6/8/15 16:10,"[neutral, disgust, love, anger]",0.6284,Aggression\nAnger\nAwe\nContempt\nDisapproval\nDisgust\nLove\nNeutral,29,29,"Here we are at loggerheads with the rival practice, and I've gone and fallen in love with Robert's assistant.","[neutral, disgust, love, anger]"
33,731671769,True,golden,24,6/8/15 15:47,"[disgust, anger]",0.5302,Aggression\nAnger\nContempt\nDisapproval\nDisgust,40,40,I've got a bone to pick with one club.,"[disgust, anger]"
43,731671779,True,golden,19,6/8/15 16:10,"[disgust, sadness, anger, pessimism]",0.6148,Aggression\nAnger\nContempt\nDisapproval\nDisgust\nRemorse\nSadness,52,40,"Dear Father Christmas, I have a bone to pick with you.","[disgust, sadness, anger, pessimism]"
75,731671811,True,golden,19,6/8/15 16:16,"[disgust, sadness, neutral, anger, love]",0.7671,Anger\nDisapproval\nLove\nNeutral\nSadness,107,107,"Because I like you, Breeze, and it makes my blood boil to think of you slaving away as you do.","[disgust, sadness, neutral, anger, love]"
137,731671873,True,golden,16,,"[optimism, joy, surprise]",0.8109,Joy\nOptimism\nSurprise,203,203,"Yeah, pleased as punch.","[optimism, joy, surprise]"


In [290]:
ei_data.loc[:, 'emotion'] = ei_data['emotion'].map(dict_18_classes_lables_2)

In [292]:
ei_data.loc[ei_data['emotion_gold'].notna(), 'emotion'] = ei_data['emotion_gold_list']

In [293]:
ei_data.loc[ei_data['emotion_gold'].notna()].head()

Unnamed: 0,_unit_id,_golden,_unit_state,_trusted_judgments,_last_judgment_at,emotion,emotion:confidence,emotion_gold,id,idiom_id,sentence,emotion_gold_list
23,731671759,True,golden,22,6/8/15 16:10,"[neutral, disgust, love, anger]",0.6284,Aggression\nAnger\nAwe\nContempt\nDisapproval\nDisgust\nLove\nNeutral,29,29,"Here we are at loggerheads with the rival practice, and I've gone and fallen in love with Robert's assistant.","[neutral, disgust, love, anger]"
33,731671769,True,golden,24,6/8/15 15:47,"[disgust, anger]",0.5302,Aggression\nAnger\nContempt\nDisapproval\nDisgust,40,40,I've got a bone to pick with one club.,"[disgust, anger]"
43,731671779,True,golden,19,6/8/15 16:10,"[disgust, sadness, anger, pessimism]",0.6148,Aggression\nAnger\nContempt\nDisapproval\nDisgust\nRemorse\nSadness,52,40,"Dear Father Christmas, I have a bone to pick with you.","[disgust, sadness, anger, pessimism]"
75,731671811,True,golden,19,6/8/15 16:16,"[disgust, sadness, neutral, anger, love]",0.7671,Anger\nDisapproval\nLove\nNeutral\nSadness,107,107,"Because I like you, Breeze, and it makes my blood boil to think of you slaving away as you do.","[disgust, sadness, neutral, anger, love]"
137,731671873,True,golden,16,,"[optimism, joy, surprise]",0.8109,Joy\nOptimism\nSurprise,203,203,"Yeah, pleased as punch.","[optimism, joy, surprise]"


In [295]:
ei_data_1 = ei_data['emotion'].str.join(sep='*').str.get_dummies(sep='*')

In [297]:
ei_data_2 = addZeroLabels(ei_data_1, label_list)

In [298]:
ei_data_2.iloc[23]

anger           1
anticipation    0
disgust         1
fear            0
joy             0
love            1
optimism        0
pessimism       0
sadness         0
surprise        0
trust           0
Name: 23, dtype: int64

In [300]:
ei_data_2['sentence'] = ei_data['sentence']

In [303]:
class_18_df = ei_data_2

**Результат:**

In [305]:
class_18_df[22:33]

Unnamed: 0,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust,sentence
22,0,0,0,0,0,0,0,1,0,0,0,"For Lisa's sake, for Celia's sake, perhaps it would be better to let sleeping dogs lie."
23,1,0,1,0,0,1,0,0,0,0,0,"Here we are at loggerheads with the rival practice, and I've gone and fallen in love with Robert's assistant."
24,0,1,0,0,0,0,0,0,0,0,0,Rugby league's deadliest rivals lock horns in the season's big curtain-raiser at Gateshead tomorrow.
25,0,0,0,0,0,0,0,0,0,1,0,"For a brother and sister act, there was certainly no love lost between them."
26,0,0,1,0,0,0,0,0,0,0,0,Trust a woman to make a mountain out of a molehill.
27,0,0,0,0,0,0,0,0,0,0,0,Is that much ado about nothing really?
28,0,0,0,0,0,0,0,0,0,0,0,"Well, you weren't gonna argue the toss."
29,0,0,0,0,0,0,0,0,1,0,0,"Then, in 1981, all hell broke loose in the Cambridge English Faculty."
30,1,0,1,0,0,0,0,0,0,0,0,Battle lines are drawn for control of the Norfolk Capital hotels group.
31,0,0,0,0,0,0,0,0,0,0,0,"Chantal hissed, her eyes locked on his in a battle of wills."


In [307]:
len(class_18_df)

2492

# 2 Data Twitter 

Twitter <br>
twitter status id  - annotaion <br>
вибірка з університету, може бути якісною, але треба 
- завантажити собі текст твітів за id (використати Twitter API?)
- смайлам поставити у відповідність мітки, попереньо вирішити, які смайли цікаві(знайшла всі emojies з описом), частоково в ручну треба буде обробити 

In [310]:
file_name = 'full_train_plaintext.txt'
data = pd.read_csv(file_name, sep='\t',  header='infer')

In [311]:
data.head()

Unnamed: 0,id,annotations
0,744014442837454848,47883
1,742407819496919041,1381
2,744101567981359105,1421
3,744692442033577984,1056
4,746426733834944512,10561381


**12 997 219 висловлювань**

In [312]:
# 12 997 219
len(data)

12997219

In [314]:
#  mentions 
data.groupby(data['id'].map(lambda s: s[-2:])=='_q').agg({'id':'count'})

Unnamed: 0_level_0,id
id,Unnamed: 1_level_1
False,12306695
True,690524


In [315]:
100*12306695/(12306695 + 690524)
# 94% записів без mentions

94.68714037979971

In [317]:
file_name = 'emoji_map_1791.csv'
emoji_data = pd.read_csv(file_name, sep=',',  header='infer')

In [318]:
emoji_data = emoji_data.reset_index().rename(columns={'index':'id'})

In [323]:
emoji_data[1397:1407].head()

Unnamed: 0.1,id,Unnamed: 0,category,title,shorts,unicode_alternates,keywords,ucode_short,ucode,aliases,alt_title,aliases_ascii
1397,1397,😒,people,unamused face,[unamused],,"[emotion, unamused, tired, smiley, unhappy, sad, mad, face]",1f612,😒,,,
1398,1398,😓,people,face with cold sweat,[sweat],,"[emotion, smiley, sad, sweat, stressed, cold, face]",1f613,😓,,,"[':(, ':-(, '=(]"
1399,1399,😔,people,pensive face,[pensive],,"[emotion, pensive, smiley, dejected, rip, sad, face]",1f614,😔,,,
1400,1400,😕,people,confused face,[confused],,"[emotion, smiley, confused, surprised, face]",1f615,😕,,,"[>:\, >:/, :-/, :-., :/, :\, =/, =\, :L, =L]"
1401,1401,😖,people,confounded face,[confounded],,"[emotion, confounded, angry, smiley, sad, face]",1f616,😖,,,


In [327]:
emoji_data.iloc[1397,:][['category', 'title', 'keywords', 'ucode']]

category                                                         people
title                                                     unamused face
keywords    [emotion, unamused, tired, smiley, unhappy, sad, mad, face]
ucode                                                                 😒
Name: 1397, dtype: object

In [328]:
emoji_dict = pd.Series(emoji_data['ucode'].values, index=emoji_data['id']).to_dict()

In [329]:
data['tags'] = data['annotations'].map(lambda x: x.split(','))

In [330]:
data.head()

Unnamed: 0,id,annotations,tags
0,744014442837454848,47883,"[47, 883]"
1,742407819496919041,1381,[1381]
2,744101567981359105,1421,[1421]
3,744692442033577984,1056,[1056]
4,746426733834944512,10561381,"[1056, 1381]"


In [331]:
data_top_tags =  data.groupby('annotations').agg({'id':'count'}).sort_values('id', ascending=False).reset_index()
# data_top_tags['annotations'].map(emoji_dict)

In [332]:
data_top_tags['em'] = data_top_tags['annotations'].map(lambda x: int(x) if (','not in x) else x).map(emoji_dict)

Топ 10 emojies в даних

In [333]:
data_top_tags.head(10)

Unnamed: 0,annotations,id,em
0,1381,1046494,😂
1,1447,393323,🙄
2,1392,379555,😍
3,1424,375040,😭
4,186,310493,❤
5,1389,297798,😊
6,1620,296932,🤔
7,1420,282446,😩
8,1446,203802,🙃
9,1403,182672,😘


# 3 Twitter raw data

In [211]:
file_name = 'twitter_raw_data.txt'

# 4 Emotions dataset 

1. PsychExp

In [32]:
map_PsychExp = {'joy':  'joy',
                'fear': 'fear',
                'anger': 'anger', 
                'sadness': 'sadness', 
                'disgust': 'disgust', 
                'shame': ['pessimism', 'sadness'], 
                'guilt': ['pessimism', 'sadness']}

In [33]:
file_name = folder_path + '/data/DeepMoji_project/data/PsychExp/raw.pickle'
em_data = pd.read_pickle(file_name)

In [94]:
lst =[dict['label'] for dict in em_data['info']]
values= np.array(lst).reshape(len(lst), 7)
em_df = pd.DataFrame(values, columns= ['joy', 'fear', 'anger', 'sadness', 'disgust', 'shame', 'guilt'])
em_df['text'] = em_data['texts']
print(len(em_df))
em_df[['joy', 'fear', 'anger', 'sadness', 'disgust', 'shame', 'guilt']]\
    = em_df[['joy', 'fear', 'anger', 'sadness', 'disgust', 'shame', 'guilt']].astype(int)

7480


In [100]:
em_df_1 = em_df.copy()

In [127]:
em_df_1['sadness'] = (em_df['guilt'] | em_df['sadness'] | em_df['shame'])
em_df_1['pessimism'] = (em_df['guilt'] | em_df['sadness'] | em_df['shame'])

In [129]:
class_7_df = addZeroLabels(em_df_1, label_list)
class_7_df.head()

Unnamed: 0,joy,fear,anger,sadness,disgust,text,pessimism,anticipation,love,optimism,surprise,trust
0,1,0,0,0,0,"During the period of falling in love, each time that we met and especially when we had not met for a long time.",0,0,0,0,0,0
1,0,1,0,0,0,When I was involved in a traffic accident.,0,0,0,0,0,0
2,0,0,1,0,0,"When I was driving home after several days of hard work, there was a motorist ahead of me who was driving at 50 km/hour and refused, despite his low speeed to let me overtake.",0,0,0,0,0,0
3,0,0,0,1,0,When I lost the person who meant the most to me.,1,0,0,0,0,0
4,0,0,0,0,1,"The time I knocked a deer down - the sight of the animal's injuries and helplessness. The realization that the animal was so badly hurt that it had to be put down, and when the animal screamed at the moment of death.",0,0,0,0,0,0


2. SE0714

In [107]:
file_name = folder_path + '/data/DeepMoji_project/data/SE0714/raw.pickle'
em_sf_data = pd.read_pickle(file_name)

In [108]:
lst =[dict['label'] for dict in em_sf_data['info']]
values= np.array(lst).reshape(len(lst), 3)
em_sf_df = pd.DataFrame(values, columns= [ 'fear', 'joy', 'sadness'])
em_sf_df['text'] = em_sf_data['texts']
print(len(em_sf_df))
em_sf_df[[ 'fear', 'joy', 'sadness']]\
    = em_sf_df[[ 'fear', 'joy', 'sadness']].astype(int)
em_sf_df.head()

1250


Unnamed: 0,fear,joy,sadness,text
0,1,0,1,Mortar assault leaves at least 18 dead
1,0,1,0,Goal delight for Sheva
2,1,1,0,Nigeria hostage feared dead is freed
3,1,0,1,Bombers kill shoppers
4,0,0,0,"Vegetables, not fruit, slow brain decline"


In [123]:
def addZeroLabels(df, label_list):
    for label in label_list:
        if label not in list(df.columns):
            df[label] = 0
    columns_to_drop = list(set(df.columns) -  set(label_list + ['text']))   
    df = df.drop(columns_to_drop, axis=1)
    return df        
    

In [124]:
class_3_df = addZeroLabels(em_sf_df, label_list)

In [306]:
class_3_df.head()

Unnamed: 0,fear,joy,sadness,text,anger,anticipation,disgust,love,optimism,pessimism,surprise,trust
0,1,0,1,Mortar assault leaves at least 18 dead,0,0,0,0,0,0,0,0
1,0,1,0,Goal delight for Sheva,0,0,0,0,0,0,0,0
2,1,1,0,Nigeria hostage feared dead is freed,0,0,0,0,0,0,0,0
3,1,0,1,Bombers kill shoppers,0,0,0,0,0,0,0,0
4,0,0,0,"Vegetables, not fruit, slow brain decline",0,0,0,0,0,0,0,0
