To adapt this notebook to a new dataset, modifying only the following should be sufficient:
* Import Data
    * directory
    * file name
    * the pd.read_csv() call, if the file type is different
    * data_name
    * label_name
* Prepare Data
    * In the get_train_data function, modify the way binary labels are defined.
    * In the get_word_data function, set the number of posts (iterations) to process, depending on how much of the data you want to cover.
* Results
    * Modify the way 'keyword' is used based on your binary labels.

# Import Data

In [1]:
import pandas as pd
from tqdm import tqdm

In [2]:
# Location of the tab-separated review file.
directory = '/data1/link10/yelp/'
file_name = 'df1M.tsv'
raw_df = pd.read_csv(directory + file_name, sep='\t')
# Discard every row that has a missing value in any column; the untouched
# frame stays available as `raw_df`.
df = raw_df.dropna(axis=0, how='any')
df.head()

Unnamed: 0,review_id,stars,text
0,xQY8N_XvtGbearJ5X4QryQ,2,"As someone who has worked with many museums, I..."
1,UmFMZ8PyXZTY2QcwzsfQYA,1,I am actually horrified this place is still in...
2,LG2ZaYiOgpr2DK_90pYjNw,5,I love Deagan's. I do. I really do. The atmosp...
3,i6g_oA9Yf9Y31qt0wibXpw,1,"Dismal, lukewarm, defrosted-tasting ""TexMex"" g..."
4,6TdNDKywdbjoTkizeMce8A,4,"Oh happy day, finally have a Canes near my cas..."


In [3]:
# Column of `df` holding the text that gets embedded and tokenised.
data_name = 'text'
# Column of `df` holding the rating that the binary labels are derived from.
label_name = 'stars'

In [4]:
# Report the number of rows and the distinct label values, then display how
# many rows carry each label.
print('\n'.join([
    'There are {} data.'.format(len(df)),
    'Labels are: {}'.format(df[label_name].unique()),
]))
df[label_name].value_counts()

There are 1000000 data.
Labels are: [2 1 5 4 3]


5    449091
4    210363
1    156690
3    104973
2     78883
Name: stars, dtype: int64

# Embedding

In [1]:
# !python -m spacy download en_core_web_lg
import numpy as np
import spacy

In [5]:
# Load the spaCy pipeline saved at this local path and bind it to `nlp`;
# every later cell uses it to tokenise text and look up vectors.
# NOTE(review): judging by the path this wraps fastText crawl vectors — confirm.
nlp = spacy.load('/data2/link10/models/fasttext/en_fasttext_crawl')

In [16]:
# Embed every lower-cased text as its spaCy document vector and stack the
# results into one 2-D array (one row per text).
# NOTE(review): disable_pipes() is called without pipe names — confirm it
# disables what is intended for the installed spaCy version.
with nlp.disable_pipes():
    msg_vectors = np.array([
        nlp(text.lower()).vector
        for text in tqdm(df[data_name])
    ])
msg_vectors.shape

100%|██████████| 1000000/1000000 [10:05<00:00, 1651.56it/s]


(1000000, 300)

In [17]:
# Persist the embedding matrix with IPython's %store magic under a
# dataset-specific name, so a later session can reload it instead of
# recomputing the embeddings.
yelp_fasttext_vectors = msg_vectors
%store yelp_fasttext_vectors

Stored 'yelp_fasttext_vectors' (ndarray)


In [6]:
# Computing the embeddings is slow (about ten minutes in the recorded run), so
# restore the matrix previously saved with %store and reuse it as `msg_vectors`.
%store -r yelp_fasttext_vectors
msg_vectors = yelp_fasttext_vectors
msg_vectors.shape

(1000000, 300)

# Prepare data

In [7]:
from sklearn.model_selection import train_test_split
def get_train_data(keyword, test_size=0.2, random_state=1):
    """Build binary labels from the label column and split the embeddings.

    Reads the notebook-level globals `df`, `label_name` and `msg_vectors`.
    Rows of `msg_vectors` are paired positionally with rows of `df`, so both
    must have the same length and order.

    Parameters
    ----------
    keyword : number
        Threshold on the label column. A row is labelled 1 when its label is
        strictly greater than `keyword`, otherwise 0.
    test_size : float or int, optional
        Forwarded to `train_test_split` (default 0.2, as before).
    random_state : int, optional
        Forwarded to `train_test_split` so the split is reproducible
        (default 1, as before).

    Returns
    -------
    tuple
        (X_train, X_test, Y_train, Y_test)
    """
    # One vectorised comparison over the whole column instead of a
    # Python-level loop over every row.
    labels = (np.asarray(df[label_name]) > keyword).astype(int)
    X_train, X_test, Y_train, Y_test = train_test_split(
        msg_vectors, labels, test_size=test_size, random_state=random_state)
    return X_train, X_test, Y_train, Y_test

In [8]:
def get_word_data(npl, df, n_posts=None):
    """Collect the vocabulary of the texts with word vectors and counts.

    Each text in column `data_name` (a notebook-level global) is lower-cased
    and tokenised with the given pipeline. Every distinct token text is
    recorded once, in order of first appearance, together with the vector of
    its first occurrence and the total number of times it was seen.

    Parameters
    ----------
    npl : spaCy pipeline
        Pipeline used for tokenisation and vector lookup. The parameter keeps
        its historical spelling for backward compatibility; it is now actually
        used instead of the global `nlp`.
    df : pandas.DataFrame
        Frame whose `data_name` column holds the texts.
    n_posts : int, optional
        Number of leading rows to process. Defaults to every row of `df`
        (previously a hard-coded 1000000, which failed on shorter frames).

    Returns
    -------
    word_list : list of str
        Distinct token texts in order of first appearance.
    word_array : numpy.ndarray
        One row per entry of `word_list` holding that token's vector.
    word_occr : list of int
        Occurrence count of each entry of `word_list`.
    """
    # Iterate the column directly; slicing with None keeps every row.
    texts = df[data_name].iloc[:n_posts]
    word_list = []
    word_vec = []
    word_occr_dict = {}
    with npl.disable_pipes():
        for text in tqdm(texts):
            for token in npl(text.lower()):
                word = token.text
                # Dict membership is O(1); scanning `word_list` for every
                # token made the original loop quadratic in vocabulary size.
                if word in word_occr_dict:
                    word_occr_dict[word] += 1
                else:
                    word_occr_dict[word] = 1
                    word_list.append(word)
                    word_vec.append(token.vector)
    # Stack the per-word vectors into a (n_words, dim) array.
    word_array = np.array(word_vec)
    word_occr = [word_occr_dict[word] for word in word_list]
    return word_list, word_array, word_occr

# Linear SVC


In [9]:
from sklearn.svm import LinearSVC
def top_words_SVC(X_train, X_test, Y_train, Y_test, word_list, word_array, word_occr):
    """Fit a linear SVC on the message vectors and score every word with it.

    The classifier is trained on (X_train, Y_train) and its accuracy is
    measured on (X_test, Y_test). Its decision function is then evaluated on
    `word_array`, giving each word a signed score.

    Returns
    -------
    accu : float
        Accuracy of the classifier on the test split.
    word_df : pandas.DataFrame
        Columns 'word', 'scores' and 'occurrence', sorted by 'scores' from
        highest to lowest. No occurrence filtering is applied here.
    """
    # dual=False solves the primal problem, which scikit-learn recommends when
    # there are more samples than features; it trains faster here.
    classifier = LinearSVC(random_state=1, dual=False, max_iter=10000)
    classifier.fit(X_train, Y_train)
    accu = classifier.score(X_test, Y_test)
    word_scores = classifier.decision_function(word_array)
    table = pd.DataFrame({
        'word': word_list,
        'scores': word_scores,
        'occurrence': word_occr,
    })
    return accu, table.sort_values('scores', ascending=False)

In [10]:
def get_top_words(keyword, word_list, word_array, word_occr):
    """Train on labels thresholded at `keyword` and return the ranked words.

    Splits the data with `get_train_data(keyword)`, passes the split and the
    word data to `top_words_SVC`, prints the threshold and the test accuracy,
    and returns the word table produced by `top_words_SVC`.
    """
    train_x, test_x, train_y, test_y = get_train_data(keyword)
    accuracy, ranked_words = top_words_SVC(
        train_x, test_x, train_y, test_y, word_list, word_array, word_occr)
    report = [
        'Critical star: {}'.format(keyword),
        'Accuracy: {}'.format(accuracy),
    ]
    print(*report, sep='\n')
    return ranked_words

# Results

In [13]:
# Build the vocabulary, per-word vectors and per-word occurrence counts from
# the texts. The recorded run of this cell took about two hours.
word_list, word_array, word_occr = get_word_data(nlp,df)

100%|██████████| 1000000/1000000 [2:02:45<00:00, 135.77it/s] 


In [14]:
# Bundle the three word-level results into one list and persist it with
# %store so the slow vocabulary pass does not have to be repeated.
yelp_fasttext_word = [word_list, word_array, word_occr]
%store yelp_fasttext_word

Stored 'yelp_fasttext_word' (list)


In [15]:
# use stored result
%store -r yelp_fasttext_word
word_list = yelp_fasttext_word[0]
word_array = yelp_fasttext_word[1]
word_occr = yelp_fasttext_word[2]

In [16]:
# Results below: fastText embeddings + linear SVC, with words filtered by occurrence count

In [17]:
# Threshold the labels at 3 (label value > 3 becomes class 1, everything else
# class 0), train the classifier and rank every word by its decision score.
top_words = get_top_words(3,word_list, word_array, word_occr)

Critical star: 3
Accuracy: 0.88954


In [18]:
# Keep only words seen more than `thresh` times; the table stays sorted by
# score from highest to lowest, so the most positive words come first.
thresh = 1000
pos_words_f = top_words[top_words['occurrence']>thresh]

In [19]:
# Frequent words with the highest decision scores.
pos_words_f.head(15)

Unnamed: 0,word,scores,occurrence
1829,knowledgeable,23.677865,16204
3983,superb,23.409851,4944
6425,knowledgable,22.175282,2868
4995,terrific,22.064291,3998
1862,pleasure,20.89398,4677
3429,fabulous,20.276752,9754
84,wonderful,20.043213,37654
313,amazing,19.977137,130807
929,fantastic,19.934197,36577
1173,gorgeous,19.838303,5277


In [20]:
# Re-sort the same filtered table in ascending order so the words with the
# lowest (most negative) decision scores come first.
neg_words_f  = pos_words_f.sort_values('scores',ascending=True)
neg_words_f.head(15)

Unnamed: 0,word,scores,occurrence
5349,tasteless,-37.372035,3938
2270,flavorless,-35.978347,2531
6321,inedible,-34.455828,1991
6444,undercooked,-34.410932,2945
5805,unprofessional,-34.119944,6733
203,tacky,-31.303869,1082
9178,disrespectful,-31.126935,1535
214,disgusting,-30.882839,6644
1367,unacceptable,-30.660224,2800
1126,lackluster,-30.618109,1156


In [21]:
# With no arguments, %store lists every variable currently persisted.
%store

Stored variables and their in-db values:
amazon_fasttext_vectors             -> array([[-0.08999809, -0.09236344, -0.05310385, ...
amazon_fasttext_word                -> [['i', 'have', 'bought', 'several', 'of', 'the', '
nrc_fasttext_vectors                -> array([[ 0.01679   , -0.15144   , -0.02061   , ...
nrc_fasttext_word                   -> [['thinks', 'that', '@melbahughes', 'had', 'a', 'g
nrc_glove_vectors                   -> array([[ 0.04746217,  0.181698  ,  0.01058619, ...
nrc_glove_word                      -> [['thinks', 'that', '@melbahughes', 'had', 'a', 'g
yelp_fasttext_vectors               -> array([[-0.03174149, -0.02313265, -0.01704215, ...
yelp_fasttext_word                  -> [['as', 'someone', 'who', 'has', 'worked', 'with',
yelp_glove_vectors                  -> array([[-8.77902319e-04,  1.61974162e-01, -1.50082
yelp_glove_word                     -> [['as', 'someone', 'who', 'has', 'worked', 'with',
