To use this notebook with a new dataset, modifying only the following should be sufficient:
* Import Data
    * directory
    * file name
    * the `pd.read_csv()` call, if the file type is different
    * data_name
    * label_name
* Prepare Data
    * In get_train_data function, modify the way binary labels are defined.
    * In the get_word_data function, adjust the number of posts (iterations) to process.
* Results
    * Modify the way 'keyword' is used based on your binary labels.

# Import Data

In [1]:
import os

import pandas as pd

In [2]:
# Location of the dataset; edit these two values for a new dataset.
directory = '/data1/link10/nrc/'
file_name = 'msgs_tec.csv'
# os.path.join inserts a separator only when one is needed, so `directory`
# works with or without a trailing slash.
raw_df = pd.read_csv(os.path.join(directory, file_name))
# remove rows with missing values
df = raw_df.dropna()
df.head()

Unnamed: 0,message_id,message,emotion
0,145353048817012736,Thinks that @melbahughes had a great 50th birt...,surprise
1,144279638024257536,"Como una expresiÃ³n tan simple, una sola oraci...",sadness
2,140499585285111809,the moment when you get another follower and y...,joy
3,145207578270507009,Be the greatest dancer of your life! practice ...,joy
4,139502146390470656,eww.. my moms starting to make her annual rum ...,disgust


In [3]:
# Column holding the message text, and column holding its label.
data_name, label_name = 'message', 'emotion'

In [4]:
# Report the number of rows and the distinct labels, then show how many
# rows carry each label.
n_rows = df.shape[0]
distinct_labels = df[label_name].unique()
print('There are {} data.'.format(n_rows))
print('Labels are: {}'.format(distinct_labels))
df[label_name].value_counts()

There are 21049 data.
Labels are: ['surprise' 'sadness' 'joy' 'disgust' 'fear' 'anger']


joy         8239
surprise    3849
sadness     3829
fear        2816
anger       1555
disgust      761
Name: emotion, dtype: int64

# Embedding

In [5]:
# !python -m spacy download en_core_web_lg
import numpy as np
import spacy

In [6]:
# # load the language model
# nlp = spacy.load('en_core_web_lg')

In [7]:
# with nlp.disable_pipes():
#     msg_vectors = np.array([nlp(msg.lower()).vector for msg in df[data_name]])
# msg_vectors.shape

In [8]:
# nrc_glove_vectors = msg_vectors
# %store nrc_glove_vectors

In [9]:
# Computing the embeddings takes a very long time, so the result that was
# saved earlier with `%store nrc_glove_vectors` is reloaded here instead.
# NOTE(review): the cached array is assumed to have one row per row of `df`,
# in the same order; nothing here checks that -- confirm after changing the data.
%store -r nrc_glove_vectors
msg_vectors = nrc_glove_vectors
# Display the array shape as the cell output.
msg_vectors.shape

(21049, 300)

# Prepare data

In [11]:
from sklearn.model_selection import train_test_split
def get_train_data(keyword):
    """Build a one-vs-rest train/test split for the label ``keyword``.

    Rows of ``df`` whose ``label_name`` value equals ``keyword`` get the
    binary label 1 and every other row gets 0.  The module-level
    ``msg_vectors`` are used as features, and 20% of the rows are held out
    with a fixed ``random_state`` of 1.

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test)``.
    """
    is_target = (df[label_name] == keyword).to_numpy()
    binary_labels = np.where(is_target, 1, 0)
    return tuple(
        train_test_split(msg_vectors, binary_labels, test_size=0.2, random_state=1)
    )

In [12]:
from tqdm import tqdm
def get_word_data(npl, df, n_posts=None):
    """Collect the vocabulary of the messages with a vector and a count per word.

    Every message in column ``data_name`` of ``df`` is lower-cased and run
    through the pipeline.  Each distinct token text is recorded once, in order
    of first appearance, together with the vector of its first occurrence and
    the total number of times it was seen.

    Parameters
    ----------
    npl : callable
        The language pipeline.  The parameter name is kept unchanged so
        existing calls keep working.  It must offer ``disable_pipes()`` as a
        context manager and, when called on a string, return an iterable of
        tokens exposing ``.text`` and ``.vector``.
    df : pandas.DataFrame
        Frame containing the ``data_name`` column.
    n_posts : int, optional
        Process only the first ``n_posts`` rows.  Defaults to every row.

    Returns
    -------
    word_list : list of str
        Distinct token texts in order of first appearance.
    word_array : numpy.ndarray
        One row per entry of ``word_list`` holding that token's vector.
    word_occr : list of int
        Occurrence count for each entry of ``word_list``.

    Raises
    ------
    ValueError
        If no token is produced at all, since there is nothing to stack.
    """
    texts = df[data_name]
    if n_posts is not None:
        texts = texts.iloc[:n_posts]

    # Dicts keep insertion order and give constant-time membership tests,
    # replacing a list scan for every token.
    counts = {}
    first_vector = {}
    # Use the pipeline that was passed in rather than a global of similar name.
    with npl.disable_pipes():
        for text in tqdm(texts):
            for token in npl(text.lower()):
                word = token.text
                if word in counts:
                    counts[word] += 1
                else:
                    counts[word] = 1
                    first_vector[word] = token.vector

    word_list = list(counts)
    word_array = np.stack([first_vector[word] for word in word_list])
    word_occr = [counts[word] for word in word_list]
    return word_list, word_array, word_occr

# Linear SVC


In [13]:
from sklearn.svm import LinearSVC
def top_words_SVC(X_train, X_test, Y_train, Y_test, word_list, word_array, word_occr):
    """Fit a linear SVC and rank words by its decision-function score.

    The classifier is trained on ``X_train``/``Y_train`` and evaluated on
    ``X_test``/``Y_test``.  Each row of ``word_array`` is then scored with the
    fitted model's ``decision_function``.

    Returns
    -------
    accuracy
        Value of ``score`` on the test split.
    ranking : pandas.DataFrame
        Columns ``word``, ``scores`` and ``occurrence``, sorted by ``scores``
        in descending order.  No occurrence filtering is applied here.
    """
    # dual=False is chosen to speed up training.
    classifier = LinearSVC(random_state=1, dual=False, max_iter=10000)
    classifier.fit(X_train, Y_train)
    accuracy = classifier.score(X_test, Y_test)
    ranking = pd.DataFrame(
        {
            'word': word_list,
            'scores': classifier.decision_function(word_array),
            'occurrence': word_occr,
        }
    ).sort_values('scores', ascending=False)
    return accuracy, ranking

In [14]:
def get_top_words(keyword, word_list, word_array, word_occr):
    """Train the one-vs-rest classifier for ``keyword`` and return its word ranking.

    The split comes from ``get_train_data(keyword)`` and the ranking from
    ``top_words_SVC``.  The label and the test accuracy are printed on two
    lines; only the ranked word table is returned.
    """
    split = get_train_data(keyword)
    accuracy, ranked_words = top_words_SVC(*split, word_list, word_array, word_occr)
    print('Label: {}'.format(keyword))
    print('Accuracy: {}'.format(accuracy))
    return ranked_words

# Results

In [15]:
# word_list, word_array, word_occr = get_word_data(nlp,df)

In [16]:
# nrc_glove_word = [word_list, word_array, word_occr]
# %store nrc_glove_word

In [17]:
# use stored result
%store -r nrc_glove_word
word_list = nrc_glove_word[0]
word_array = nrc_glove_word[1]
word_occr = nrc_glove_word[2]

In [18]:
df[label_name].value_counts()

joy         8239
surprise    3849
sadness     3829
fear        2816
anger       1555
disgust      761
Name: emotion, dtype: int64

In [19]:
# One ranking per emotion, computed in the listed order; each call prints the
# label and its test accuracy.
(joy_words, surprise_words, sadness_words,
 fear_words, anger_words, disgust_words) = (
    get_top_words(emotion, word_list, word_array, word_occr)
    for emotion in ('joy', 'surprise', 'sadness', 'fear', 'anger', 'disgust')
)

Label: joy
Accuracy: 0.7684085510688836
Label: surprise
Accuracy: 0.8441805225653206
Label: sadness
Accuracy: 0.8342042755344418
Label: fear
Accuracy: 0.903562945368171
Label: anger
Accuracy: 0.9261282660332542
Label: disgust
Accuracy: 0.9624703087885986


In [20]:
# Keep only the words seen more than `thresh` times in each ranking.
thresh = 10
(joy_f, surprise_f, sadness_f,
 fear_f, anger_f, disgust_f) = (
    ranking[ranking['occurrence'] > thresh]
    for ranking in (joy_words, surprise_words, sadness_words,
                    fear_words, anger_words, disgust_words)
)

In [21]:
joy_f.head(10)

Unnamed: 0,word,scores,occurrence
233,joy,7.818087,323
6244,joyful,7.789048,25
5267,psalm,7.504107,14
3150,merry,6.96045,18
7388,blessings,6.591321,21
2389,goodness,6.464386,15
9213,holidays,6.439917,37
7243,joyous,6.330112,13
5344,sunshine,6.271446,17
795,happiness,6.21843,139


In [22]:
surprise_f.head(10)

Unnamed: 0,word,scores,occurrence
1810,surprise,4.944103,186
1517,bday,4.89465,31
2342,vandaag,4.880156,14
5842,surprises,4.86172,12
3616,lekker,4.798536,25
4016,wachten,4.544061,12
1119,cadeau,4.457628,16
6858,congrats,4.262648,11
4599,wordt,4.079687,15
4105,hebben,4.041126,13


In [23]:
sadness_f.head(10)

Unnamed: 0,word,scores,occurrence
976,sad,6.721835,178
6672,lonely,6.495766,24
2096,missing,6.107835,75
2839,sadly,5.197627,16
4086,dies,5.150065,15
4736,goodbye,5.080637,19
2088,broken,4.926419,47
283,lost,4.910908,128
328,dying,4.816007,25
977,miss,4.699303,275


In [24]:
fear_f.head(10)

Unnamed: 0,word,scores,occurrence
385,fear,11.777669,365
7591,fears,10.91713,14
696,scared,10.593725,124
2715,terror,9.85887,19
2485,scary,9.284063,21
3590,scare,9.073646,11
7673,frightening,8.962389,18
381,danger,8.74765,13
1503,afraid,8.661831,422
7552,horror,7.905677,18


In [25]:
anger_f.head(10)

Unnamed: 0,word,scores,occurrence
1912,anger,8.688359,103
2438,rage,7.711628,28
623,angry,6.901929,76
3958,hatred,6.030928,12
5497,frustration,4.906374,23
2071,rude,4.856652,12
3863,selfish,4.032567,11
3754,pissed,3.836701,43
4810,throwing,3.718978,17
180,bitch,3.540508,90


In [26]:
disgust_f.head(10)

Unnamed: 0,word,scores,occurrence
6516,disgusting,7.383015,19
1200,smell,4.831673,36
3325,ew,4.811148,20
9453,hoes,4.448362,13
2924,smells,4.199117,25
8255,gross,3.747821,13
417,dirty,3.318123,28
7291,poop,3.298234,12
7968,ugly,2.934254,29
691,smh,2.777972,70
