To use this notebook with a new dataset, it should be sufficient to modify only the following:
* Import Data
    * directory
    * file name
    * the `pd.read_csv()` call, if the file is not a CSV
    * data_name
    * label_name
* Prepare Data
    * In get_train_data function, modify the way binary labels are defined.
    * In the get_word_data function, adjust the number of posts (loop iterations) to process, depending on how much of the data you want to cover.
* Results
    * Modify the way 'keyword' is used based on your binary labels.

# Import Data

In [1]:
import pandas as pd

In [2]:
# Location of the raw CSV; change these two values when switching datasets.
directory = '/data1/link10/nrc/'
file_name = 'msgs_tec.csv'
csv_path = directory + file_name

raw_df = pd.read_csv(csv_path)
# Work only with complete rows: drop every row that contains a missing value.
df = raw_df.dropna()
df.head()

Unnamed: 0,message_id,message,emotion
0,145353048817012736,Thinks that @melbahughes had a great 50th birt...,surprise
1,144279638024257536,"Como una expresiÃ³n tan simple, una sola oraci...",sadness
2,140499585285111809,the moment when you get another follower and y...,joy
3,145207578270507009,Be the greatest dancer of your life! practice ...,joy
4,139502146390470656,eww.. my moms starting to make her annual rum ...,disgust


In [3]:
# Name of the `df` column that holds the text of each post.
data_name = 'message'
# Name of the `df` column that holds the class label of each post.
label_name = 'emotion'

In [4]:
# Report the dataset size and the distinct labels, then show the class counts.
n_rows = df.shape[0]
unique_labels = df[label_name].unique()
summary_lines = [
    'There are {} data.'.format(n_rows),
    'Labels are: {}'.format(unique_labels),
]
print('\n'.join(summary_lines))
df[label_name].value_counts()

There are 21049 data.
Labels are: ['surprise' 'sadness' 'joy' 'disgust' 'fear' 'anger']


joy         8239
surprise    3849
sadness     3829
fear        2816
anger       1555
disgust      761
Name: emotion, dtype: int64

# Embedding

In [5]:
# !python -m spacy download en_core_web_lg
import numpy as np
import spacy

In [6]:
# # load the language model
# nlp = spacy.load('/data2/link10/models/fasttext/en_fasttext_crawl')

In [7]:
# with nlp.disable_pipes():
#     msg_vectors = np.array([nlp(msg.lower()).vector for msg in df[data_name]])
# msg_vectors.shape

(21049, 300)

In [8]:
# nrc_fasttext_vectors = msg_vectors
# %store nrc_fasttext_vectors

Stored 'nrc_fasttext_vectors' (ndarray)


In [6]:
# Computing the embeddings takes a very long time, so reload the array that
# was previously saved with `%store nrc_fasttext_vectors` instead of
# recomputing it.
%store -r nrc_fasttext_vectors
msg_vectors = nrc_fasttext_vectors
msg_vectors.shape

(21049, 300)

# Prepare data

In [7]:
from sklearn.model_selection import train_test_split
def get_train_data(keyword):
    """Build a one-vs-rest train/test split for a single label.

    Rows of the global `df` whose `label_name` column equals `keyword` get
    label 1 and every other row gets label 0. The global `msg_vectors` are
    used as the features.

    Returns the tuple (X_train, X_test, Y_train, Y_test) of an 80/20 split
    made with random_state=1.
    """
    is_keyword = (df[label_name] == keyword).to_numpy()
    labels = is_keyword.astype(int)
    split = train_test_split(msg_vectors, labels, test_size=0.2, random_state=1)
    return tuple(split)

In [8]:
from tqdm import tqdm
from langdetect import detect
def get_word_data(npl, df, n_posts=None):
    """Collect every distinct token of the posts with its vector and count.

    Parameters
    ----------
    npl : callable language pipeline (e.g. the object returned by
        `spacy.load`). Calling it on a string must yield tokens exposing
        `.text` and `.vector`. The misspelled parameter name is kept so that
        existing calls keep working.
    df : DataFrame whose `data_name` column (a notebook-level variable)
        holds the post texts.
    n_posts : int, optional
        Number of leading rows of `df` to process. Defaults to all rows.

    Returns
    -------
    word_list : list of str
        Distinct lower-cased token texts, in order of first appearance.
    word_array : numpy.ndarray
        One row per entry of `word_list`: the vector of the token's first
        occurrence.
    word_occr : list of int
        Number of occurrences of each entry of `word_list`.
    """
    if n_posts is None:
        n_posts = len(df)
    texts = df[data_name].iloc[:n_posts]

    word_list = []
    word_vec = []
    # Doubles as the "already seen" set, so membership tests are O(1)
    # instead of scanning word_list for every token.
    word_occr_dict = {}
    # NOTE(review): disable_pipes() is called without pipe names, exactly as
    # before — confirm this disables what was intended.
    with npl.disable_pipes():
        for text in tqdm(texts):
            for token in npl(text.lower()):
                if token.text in word_occr_dict:
                    word_occr_dict[token.text] += 1
                else:
                    word_list.append(token.text)
                    word_vec.append(token.vector)
                    word_occr_dict[token.text] = 1
    # Stack the per-word vectors into a (n_words, dim) array.
    word_array = np.array(word_vec)
    word_occr = [word_occr_dict[word] for word in word_list]
    return word_list, word_array, word_occr

# Linear SVC


In [9]:
from sklearn.svm import LinearSVC
def top_words_SVC(X_train, X_test, Y_train, Y_test, word_list, word_array, word_occr):
    """Fit a linear SVC on the training split and score every word with it.

    The classifier is trained on (X_train, Y_train) and its accuracy is
    measured on (X_test, Y_test). Each row of `word_array` is then passed
    through the classifier's decision function to obtain a per-word score.

    Returns
    -------
    accu : accuracy of the classifier on the test split.
    word_df : DataFrame with columns 'word', 'scores' and 'occurrence',
        sorted by 'scores' in descending order. No occurrence filtering is
        applied here.
    """
    # dual=False is chosen to speed up training.
    classifier = LinearSVC(random_state=1, dual=False, max_iter=10000)
    classifier.fit(X_train, Y_train)
    test_accuracy = classifier.score(X_test, Y_test)

    columns = {
        'word': word_list,
        'scores': classifier.decision_function(word_array),
        'occurrence': word_occr,
    }
    ranked = pd.DataFrame(columns).sort_values('scores', ascending=False)
    return test_accuracy, ranked

In [10]:
def get_top_words(keyword, word_list, word_array, word_occr):
    """Train the one-vs-rest classifier for `keyword` and rank the words.

    Builds the train/test split for `keyword` with `get_train_data`, hands it
    to `top_words_SVC` together with the word data, prints the label and the
    test accuracy, and returns the ranked word DataFrame.
    """
    split = get_train_data(keyword)
    accu, words = top_words_SVC(*split, word_list, word_array, word_occr)
    report = [
        'Label: {}'.format(keyword),
        'Accuracy: {}'.format(accu),
    ]
    print('\n'.join(report))
    return words

# Results

In [14]:
# word_list, word_array, word_occr = get_word_data(nlp,df)

100%|██████████| 21049/21049 [00:23<00:00, 883.75it/s]


In [15]:
# nrc_fasttext_word = [word_list, word_array, word_occr]
# %store nrc_fasttext_word

Stored 'nrc_fasttext_word' (list)


In [11]:
# use stored result
%store -r nrc_fasttext_word
word_list = nrc_fasttext_word[0]
word_array = nrc_fasttext_word[1]
word_occr = nrc_fasttext_word[2]

In [12]:
df[label_name].value_counts()

joy         8239
surprise    3849
sadness     3829
fear        2816
anger       1555
disgust      761
Name: emotion, dtype: int64

In [13]:
# Train one one-vs-rest classifier per emotion and keep its ranked words.
emotion_labels = ('joy', 'surprise', 'sadness', 'fear', 'anger', 'disgust')
top_words_by_label = {
    label: get_top_words(label, word_list, word_array, word_occr)
    for label in emotion_labels
}

joy_words = top_words_by_label['joy']
surprise_words = top_words_by_label['surprise']
sadness_words = top_words_by_label['sadness']
fear_words = top_words_by_label['fear']
anger_words = top_words_by_label['anger']
disgust_words = top_words_by_label['disgust']

Label: joy
Accuracy: 0.7741092636579573
Label: surprise
Accuracy: 0.8505938242280285
Label: sadness
Accuracy: 0.8358669833729216
Label: fear
Accuracy: 0.9066508313539192
Label: anger
Accuracy: 0.9285035629453682
Label: disgust
Accuracy: 0.9624703087885986


In [50]:
# Keep only words that occur more than `thresh` times in the corpus.
thresh = 10
joy_f, surprise_f, sadness_f, fear_f, anger_f, disgust_f = (
    ranked[ranked['occurrence'] > thresh]
    for ranked in (
        joy_words,
        surprise_words,
        sadness_words,
        fear_words,
        anger_words,
        disgust_words,
    )
)

In [48]:
# from langdetect import detect
# lang_list = []
# for w in joy_f['word']:
#     try:
#         lang_list.append(detect(w))
#     except langdetect.lang_detect_exception.LangDetectException:
#         lang_list.append('unknown')
# joy_f['language'] = lang_list

In [51]:
joy_f.head(10)

Unnamed: 0,word,scores,occurrence
233,joy,8.785417,322
7386,blessings,8.235936,21
5342,sunshine,7.803851,17
6242,joyful,7.759841,25
7241,joyous,7.563714,13
3459,bless,7.42534,16
3148,merry,7.252967,18
4576,soak,7.006367,12
5216,blessed,6.982289,46
2185,bath,6.77744,35


In [21]:
surprise_f.head(10)

Unnamed: 0,word,scores,occurrence
5840,surprises,9.146864,12
1809,surprise,9.092441,186
2731,sinterklaas,8.609964,35
2119,surprised,6.845616,23
1118,cadeau,6.376916,16
1339,unexpected,6.11222,21
3110,gekregen,6.099349,15
2712,expecting,5.650363,23
1516,bday,5.369607,31
7,birthday,5.015767,230


In [22]:
sadness_f.head(10)

Unnamed: 0,word,scores,occurrence
6670,lonely,8.431065,24
975,sad,8.117304,178
2839,sadly,7.446077,16
2095,missing,7.301855,75
976,miss,7.091248,275
4734,goodbye,6.834825,19
1233,missed,6.666982,86
1864,depressed,6.320037,20
2788,anymore,5.796224,68
2087,broken,5.521951,47


In [23]:
fear_f.head(10)

Unnamed: 0,word,scores,occurrence
7589,fears,13.68755,14
385,fear,13.546797,367
1502,afraid,13.19712,422
695,scared,13.02648,124
3588,scare,11.935199,11
2485,scary,10.644044,21
7671,frightening,9.289814,18
381,danger,8.64334,13
4983,freaked,7.576065,11
4656,confidence,7.225746,31


In [24]:
anger_f.head(10)

Unnamed: 0,word,scores,occurrence
1911,anger,9.779019,103
623,angry,7.621302,76
2438,rage,7.578564,28
5495,frustration,6.796474,23
2351,punch,5.007185,32
4869,annoyed,4.584491,12
4899,yell,4.543393,13
3818,shouting,4.393189,13
383,mad,4.382311,95
3752,pissed,4.180532,43


In [25]:
disgust_f.head(10)

Unnamed: 0,word,scores,occurrence
6514,disgusting,7.163973,19
8253,gross,4.92766,13
1199,smell,4.648743,36
417,dirty,3.834117,28
2922,smells,3.801148,25
4976,bieber,3.203984,15
9852,rape,3.004995,20
5127,coke,2.998459,14
9451,hoes,2.927872,13
5425,sex,2.74234,44
