To adapt this notebook to a new dataset, modifying only the following should be sufficient:
* Import Data
    * directory
    * file name
    * pd.read_csv() if different file type
    * data_name
    * label_name
* Prepare Data
    * In the get_train_data function, modify the way binary labels are defined.
    * In get_word_data, adjust the number of posts (iterations) to process as needed.
* Results
    * Modify the way 'keyword' is used based on your binary labels.

# Import Data

In [1]:
import pandas as pd
from tqdm import tqdm

In [2]:
# Where the tab-separated review dump lives; change both values for a new dataset.
directory = '/data2/link10/data/yelp/'
file_name = 'df1M.tsv'
raw_df = pd.read_csv(directory + file_name, sep='\t')
# Work only with complete rows; raw_df itself is left untouched.
df = raw_df.dropna()
df.head()

Unnamed: 0,review_id,stars,text
0,xQY8N_XvtGbearJ5X4QryQ,2,"As someone who has worked with many museums, I..."
1,UmFMZ8PyXZTY2QcwzsfQYA,1,I am actually horrified this place is still in...
2,LG2ZaYiOgpr2DK_90pYjNw,5,I love Deagan's. I do. I really do. The atmosp...
3,i6g_oA9Yf9Y31qt0wibXpw,1,"Dismal, lukewarm, defrosted-tasting ""TexMex"" g..."
4,6TdNDKywdbjoTkizeMce8A,4,"Oh happy day, finally have a Canes near my cas..."


In [3]:
# Column holding the text to embed, and column holding the label to predict.
data_name, label_name = 'text', 'stars'

In [4]:
# Report the number of rows and the distinct label values, then display
# how many rows carry each label.
print('There are {} data.'.format(df.shape[0]))
print('Labels are: {}'.format(df[label_name].unique()))
df[label_name].value_counts()

There are 1000000 data.
Labels are: [2 1 5 4 3]


5    449091
4    210363
1    156690
3    104973
2     78883
Name: stars, dtype: int64

# Embedding

In [5]:
# !python -m spacy download en_core_web_lg
import numpy as np
import spacy

In [6]:
# Load the spaCy language pipeline from a local model directory.
# NOTE(review): the path suggests fastText crawl vectors with subword
# information converted to a spaCy model — confirm how it was built.
nlp = spacy.load('/data2/link10/models/fasttext/en_fasttext_crawl_subword')

In [None]:
# Embed every post: lower-case the text, run it through the pipeline and keep
# the document vector. The result has one row per post.
with nlp.disable_pipes():
    msg_vectors = np.array(
        [nlp(review.lower()).vector for review in tqdm(df[data_name])]
    )
msg_vectors.shape

In [None]:
# Persist the document vectors with IPython's %store so that later sessions
# can reload them instead of recomputing the embedding.
yelp_fasttext_vectors_subword = msg_vectors
%store yelp_fasttext_vectors_subword

In [16]:
# Computing the embedding takes a very long time, so reload the stored result
# (%store -r) instead of re-running the embedding cell above.
%store -r yelp_fasttext_vectors_subword
msg_vectors = yelp_fasttext_vectors_subword
msg_vectors.shape

(1000000, 300)

# Prepare data

In [7]:
from sklearn.model_selection import train_test_split
def get_train_data(keyword):
    """Derive binary labels from ``df[label_name]`` and split the data 80/20.

    A row is labelled 1 when its label value is strictly greater than
    ``keyword`` and 0 otherwise. The module-level ``msg_vectors`` are used as
    features and split together with the labels (``test_size=0.2``,
    ``random_state=1``).

    Returns
    -------
    tuple
        ``(X_train, X_test, Y_train, Y_test)``.
    """
    above_threshold = df[label_name] > keyword
    labels = np.where(above_threshold, 1, 0)
    parts = train_test_split(
        msg_vectors,
        labels,
        test_size=0.2,
        random_state=1,
    )
    return tuple(parts)

In [8]:
def get_word_data(npl, df, n_posts=1000000):
    """Collect the vocabulary of the posts with one vector and a count per word.

    Each post in column ``data_name`` of ``df`` is lower-cased and passed
    through the pipeline ``npl``. Every distinct token text is recorded once,
    in order of first appearance, together with the vector of its first
    occurrence and the total number of times it was seen.

    Parameters
    ----------
    npl : callable
        Language pipeline (``nlp`` in this notebook; the parameter name is kept
        for backward compatibility). It must offer ``disable_pipes()`` as a
        context manager and, when called on a string, return an iterable of
        tokens exposing ``.text`` and ``.vector``.
    df : pandas.DataFrame
        Frame holding the posts in column ``data_name``.
    n_posts : int or None, optional
        Maximum number of leading rows to process (default 1000000). If ``df``
        has fewer rows, all of them are processed; ``None`` also means all.

    Returns
    -------
    word_list : list of str
        Distinct token texts in order of first appearance.
    word_array : numpy.ndarray
        One row per entry of ``word_list``; an empty array if no token was seen.
    word_occr : list of int
        Occurrence count of each entry of ``word_list``.

    Raises
    ------
    ValueError
        If ``n_posts`` is negative.
    """
    if n_posts is not None and n_posts < 0:
        raise ValueError('n_posts must be non-negative or None')

    # Slicing clamps to the frame length, so short frames no longer fail.
    texts = df[data_name].iloc[:n_posts]

    word_list = []
    word_vec = []
    word_occr_dict = {}
    # Use the pipeline that was passed in rather than a module-level one.
    with npl.disable_pipes():
        for text in tqdm(texts):
            for token in npl(text.lower()):
                word = token.text
                # Dict membership is O(1); testing against word_list was O(n).
                if word in word_occr_dict:
                    word_occr_dict[word] += 1
                else:
                    word_occr_dict[word] = 1
                    word_list.append(word)
                    word_vec.append(token.vector)

    word_array = np.array(word_vec)
    word_occr = [word_occr_dict[word] for word in word_list]
    return word_list, word_array, word_occr

# Linear SVC


In [9]:
from sklearn.svm import LinearSVC
def top_words_SVC(X_train, X_test, Y_train, Y_test, word_list, word_array, word_occr):
    """Train a linear SVC and rank individual words by its decision function.

    The classifier is fitted on ``(X_train, Y_train)`` and its accuracy is
    measured on ``(X_test, Y_test)``. Each row of ``word_array`` is then scored
    with the fitted decision function.

    Returns
    -------
    accu
        Test-set accuracy as returned by ``score``.
    word_df : pandas.DataFrame
        Columns ``word``, ``scores`` and ``occurrence``, sorted by ``scores``
        in descending order. No occurrence filtering is applied here.
    """
    # dual=False is used to speed up training; the dual formulation is not
    # needed here.
    classifier = LinearSVC(random_state=1, dual=False, max_iter=10000)
    classifier.fit(X_train, Y_train)
    accu = classifier.score(X_test, Y_test)

    word_scores = classifier.decision_function(word_array)
    table = pd.DataFrame(
        {
            'word': word_list,
            'scores': word_scores,
            'occurrence': word_occr,
        }
    )
    ranked = table.sort_values('scores', ascending=False)
    return accu, ranked

In [10]:
def get_top_words(keyword, word_list, word_array, word_occr):
    """Train on labels split at ``keyword`` and return the ranked word table.

    Builds the train/test split with ``get_train_data(keyword)``, passes it to
    ``top_words_SVC`` along with the word data, prints the threshold and the
    test accuracy, and returns the word DataFrame produced by
    ``top_words_SVC``.
    """
    split = get_train_data(keyword)
    accuracy, ranked_words = top_words_SVC(*split, word_list, word_array, word_occr)
    print('Critical star: {}'.format(keyword))
    print('Accuracy: {}'.format(accuracy))
    return ranked_words

# Results

In [11]:
# Build the vocabulary, word vectors and occurrence counts from the posts.
# This is slow (the recorded run took about 1h40m), so the result is stored
# in the next cell for reuse.
word_list, word_array, word_occr = get_word_data(nlp,df)

100%|██████████| 1000000/1000000 [1:39:29<00:00, 167.51it/s]


In [12]:
# Bundle the three word-level results into one list and persist it with
# %store so the slow extraction above does not have to be repeated.
yelp_fasttext_word_subword = [word_list, word_array, word_occr]
%store yelp_fasttext_word_subword

Stored 'yelp_fasttext_word_subword' (list)


In [13]:
# use stored result
%store -r yelp_fasttext_word_subword
word_list = yelp_fasttext_word_subword[0]
word_array = yelp_fasttext_word_subword[1]
word_occr = yelp_fasttext_word_subword[2]

In [14]:
# fasttext_subword + linear SVC + filter

In [17]:
# Threshold of 3: posts rated above 3 stars form the positive class (label 1),
# all others the negative class (label 0).
top_words = get_top_words(3,word_list, word_array, word_occr)

Critical star: 3
Accuracy: 0.881965


In [18]:
# Keep only words seen more than `thresh` times; the descending score order
# of top_words is preserved.
thresh = 1000
pos_words_f = top_words.loc[top_words['occurrence'] > thresh]

In [19]:
# Frequent words with the highest decision-function scores (positive side).
pos_words_f.head(15)

Unnamed: 0,word,scores,occurrence
3631,fav,32.910557,2525
7519,joy,27.102466,1754
7553,x,26.820804,3099
8941,fave,22.610263,2247
8587,~,22.225642,1244
4707,gem,21.945507,11036
6848,à,19.636158,5490
68,cozy,19.467758,7961
3340,yum,19.011517,11546
5998,tom,18.900867,3815


In [20]:
# Re-sort the same frequent-word table from lowest score upwards to show the
# words on the negative side.
neg_words_f = pos_words_f.sort_values(by='scores')
neg_words_f.head(15)

Unnamed: 0,word,scores,occurrence
1795,rude,-33.204613,30652
203,tacky,-30.649772,1082
787,bland,-27.704997,16126
1450,stale,-27.080081,3705
5021,ugly,-26.341043,1406
7626,lied,-25.167316,2052
864,poor,-24.54346,17567
4214,0,-23.725654,3518
1041,ok,-23.618393,48327
854,waste,-23.406361,11468


In [None]:
# With no arguments, %store lists the variables currently persisted.
%store