To apply this notebook to a new dataset, modifying only the following should be sufficient:
* Import Data
    * directory
    * file name
    * the pd.read_csv() call, if the file type is different
    * data_name
    * label_name
* Prepare Data
    * In the get_train_data function, modify how the binary labels are defined.
    * In the get_word_data function, adjust the number of posts (iterations) to process, depending on your needs.
* Results
    * Modify the way 'keyword' is used based on your binary labels.

# Import Data

In [1]:
import pandas as pd
from tqdm import tqdm

In [2]:
# Location of the raw data file; edit these two values when switching datasets.
directory = '/home/roshansk/YelpAnalysis/AmazonFineFood/'
file_name = 'Reviews.csv'
raw_df = pd.read_csv(directory + file_name)
# remove rows with missing values
# (raw_df is kept untouched; df is the cleaned frame used by the rest of the notebook)
df = raw_df.dropna()
# preview the first rows
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [3]:
# Column holding the text that is tokenised in get_word_data.
data_name = 'Text'
# Column holding the label; it is binarised against a threshold in get_train_data.
label_name = 'Score'

In [4]:
# Report the row count and the distinct label values, one per line,
# then display how many rows carry each label.
print('There are {} data.'.format(len(df)))
print('Labels are: {}'.format(df[label_name].unique()))
df[label_name].value_counts()

There are 568411 data.
Labels are: [5 1 4 2 3]


5    363111
4     80655
1     52264
3     42638
2     29743
Name: Score, dtype: int64

# Embedding

In [5]:
# !python -m spacy download en_core_web_lg
import numpy as np
import spacy

In [6]:
# load the language model
# nlp = spacy.load('en_core_web_lg')

In [7]:
# with nlp.disable_pipes():
#     msg_vectors = np.array([nlp(msg.lower()).vector for msg in tqdm(df[data_name])])
# msg_vectors.shape

100%|██████████| 568411/568411 [3:04:45<00:00, 51.28it/s]  


(568411, 300)

In [8]:
# amazon_glove_vectors = msg_vectors
# %store amazon_glove_vectors

Stored 'amazon_glove_vectors' (ndarray)


In [9]:
# embedding takes huge amount of time, use stored result
# (%store -r restores the array that was persisted earlier with %store)
%store -r amazon_glove_vectors
# msg_vectors is the name the rest of the notebook uses for the document embeddings
msg_vectors = amazon_glove_vectors
msg_vectors.shape

(568411, 300)

# Prepare data

In [10]:
from sklearn.model_selection import train_test_split
def get_train_data(keyword):
    """Binarise the labels against ``keyword`` and split the data 80/20.

    A row is labelled 1 when its ``df[label_name]`` value is strictly greater
    than ``keyword`` and 0 otherwise.  The notebook-level ``msg_vectors`` are
    used as features.

    Parameters
    ----------
    keyword : number
        Threshold separating the negative (<= keyword) and positive
        (> keyword) class.

    Returns
    -------
    X_train, X_test, Y_train, Y_test
        Result of ``train_test_split`` with ``test_size=0.2`` and
        ``random_state=1``.

    Notes
    -----
    Reads the notebook-level names ``df``, ``label_name`` and ``msg_vectors``.
    NOTE(review): assumes row i of ``msg_vectors`` belongs to row i of ``df``
    -- confirm that both were built from the same cleaned frame.
    """
    # Vectorised comparison instead of a Python-level loop over every row.
    labels = np.where(df[label_name] > keyword, 1, 0)
    X_train, X_test, Y_train, Y_test = train_test_split(
        msg_vectors, labels, test_size=0.2, random_state=1)
    return X_train, X_test, Y_train, Y_test

In [11]:
def get_word_data(npl,df):
    """Collect the corpus vocabulary with one vector and a count per word.

    Each text in ``df[data_name]`` is lower-cased and passed through the
    pipeline ``npl``.  Every distinct token text is recorded once, in order of
    first appearance, together with the vector of its first occurrence and the
    total number of times it occurs.

    Parameters
    ----------
    npl : callable
        The language pipeline (the notebook passes its ``nlp`` object).  It
        must be callable on a string, yield tokens exposing ``.text`` and
        ``.vector``, and provide a ``disable_pipes()`` context manager.  The
        parameter name is kept as-is for backward compatibility.
    df : pandas.DataFrame
        Frame whose ``data_name`` column (notebook-level name) holds the texts.

    Returns
    -------
    word_list : list of str
        Distinct token texts in order of first appearance.
    word_array : numpy.ndarray
        One row per entry of ``word_list``.
    word_occr : list of int
        Occurrence count for each entry of ``word_list``.
    """
    word_list = []
    word_vec = []
    word_occr_dict = {}
    # Use the pipeline that was passed in rather than a global `nlp`.
    # NOTE(review): disable_pipes() is called without pipe names, as in the
    # original; confirm which components this actually disables.
    with npl.disable_pipes():
        # Iterate over the text column directly instead of building a row
        # Series with df.iloc[i] on every step.
        for text in tqdm(df[data_name]):
            for token in npl(text.lower()):
                word = token.text
                # Dict membership is O(1); scanning word_list would make the
                # whole pass quadratic in the vocabulary size.
                if word in word_occr_dict:
                    word_occr_dict[word] += 1
                else:
                    word_list.append(word)
                    word_vec.append(token.vector)
                    word_occr_dict[word] = 1
    # Stack the per-word vectors into a (n_words, dim) array.
    word_array = np.array(word_vec)
    word_occr = [word_occr_dict[word] for word in word_list]
    return word_list, word_array, word_occr

# Linear SVC


In [12]:
from sklearn.svm import LinearSVC
def top_words_SVC(X_train, X_test, Y_train, Y_test, word_list, word_array, word_occr): 
    """Fit a linear SVC on the document vectors and score every word vector.

    The classifier is trained on ``X_train``/``Y_train`` and evaluated on
    ``X_test``/``Y_test``.  Its decision function is then applied to
    ``word_array`` so that each word gets a signed score.

    Returns
    -------
    accu : float
        Value of ``score`` on the test split.
    word_df : pandas.DataFrame
        Columns ``word``, ``scores`` and ``occurrence``, sorted by ``scores``
        in descending order.  No occurrence filtering is applied here.
    """
    # dual=False: kept from the original, whose author notes it speeds up
    # training and that the dual formulation is not needed here.
    classifier = LinearSVC(random_state=1, dual=False, max_iter=10000)
    classifier.fit(X_train, Y_train)
    accu = classifier.score(X_test, Y_test)
    word_scores = classifier.decision_function(word_array)
    ranked_words = pd.DataFrame(
        {'word': word_list, 'scores': word_scores, 'occurrence': word_occr}
    ).sort_values('scores', ascending=False)
    return accu, ranked_words

In [13]:
def get_top_words(keyword, word_list, word_array, word_occr):
    """Train on labels binarised at ``keyword`` and return the scored words.

    Builds the train/test split with ``get_train_data(keyword)``, fits and
    scores via ``top_words_SVC``, prints the threshold and the test accuracy,
    and returns the word DataFrame produced by ``top_words_SVC``.
    """
    split = get_train_data(keyword)
    accu, words = top_words_SVC(*split, word_list, word_array, word_occr)
    print('Critical star: {}'.format(keyword))
    print('Accuracy: {}'.format(accu))
    return words

# Results

In [14]:
# `nlp` is only created by the spacy.load cell in the Embedding section, which
# is commented out.  Load the model here when it is missing so that this cell
# does not raise NameError on a fresh kernel; an existing `nlp` is reused.
if 'nlp' not in globals():
    nlp = spacy.load('en_core_web_lg')
word_list, word_array, word_occr = get_word_data(nlp,df)

100%|██████████| 568411/568411 [4:09:48<00:00, 37.92it/s]   


In [15]:
# Bundle the three word-level results into one list so they can be persisted
# with a single %store call and restored later without recomputation.
amazon_glove_word = [word_list, word_array, word_occr]
%store amazon_glove_word

Stored 'amazon_glove_word' (list)


In [16]:
# use stored result
%store -r amazon_glove_word
word_list = amazon_glove_word[0]
word_array = amazon_glove_word[1]
word_occr = amazon_glove_word[2]

In [17]:
# glove + linear SVC + filter

In [18]:
# Threshold 3: get_train_data labels scores strictly above 3 as positive (1)
# and all other scores as negative (0).
top_words = get_top_words(3,word_list, word_array, word_occr)

Critical star: 3
Accuracy: 0.8574808898428085


In [19]:
# Keep only words that occur more than `thresh` times in the corpus.
thresh = 1000
pos_words_f = top_words.loc[top_words['occurrence'].gt(thresh)]

In [23]:
# Highest-scoring frequent words first (top_words_SVC sorts by score, descending).
pos_words_f.head(30)

Unnamed: 0,word,scores,occurrence
2673,versatile,17.79002,1092
2235,handy,16.010722,2603
586,excellent,15.639082,22496
3573,enjoys,15.497575,2891
139,great,15.394777,163700
470,fast,15.364663,12752
1046,wonderful,15.353344,23795
477,perfect,14.856625,32223
2031,fabulous,14.476006,2679
2228,powerful,14.271646,1134


In [24]:
# Re-sort the frequent words by ascending score so the most negative come first.
neg_words_f = pos_words_f.sort_values(by='scores')
neg_words_f.head(n=30)

Unnamed: 0,word,scores,occurrence
6368,disgusting,-23.14637,2290
2840,misleading,-22.883223,1334
2114,tasteless,-21.358779,1600
333,stale,-20.197219,5805
3109,disappointing,-19.004977,2608
1917,bland,-18.051356,5338
703,soggy,-17.756729,1216
3779,overpriced,-16.840491,1491
2947,weak,-16.54145,7716
2645,horrible,-16.380705,5189


In [22]:
# List every variable currently persisted by %store.
%store

Stored variables and their in-db values:
amazon_fasttext_vectors             -> array([[-0.08999809, -0.09236344, -0.05310385, ...
amazon_fasttext_word                -> [['i', 'have', 'bought', 'several', 'of', 'the', '
amazon_glove_vectors                -> array([[-0.14978772,  0.15294558, -0.09378192, ...
amazon_glove_word                   -> [['i', 'have', 'bought', 'several', 'of', 'the', '
nrc_fasttext_vectors                -> array([[ 0.01679   , -0.15144   , -0.02061   , ...
nrc_fasttext_word                   -> [['thinks', 'that', '@melbahughes', 'had', 'a', 'g
nrc_glove_vectors                   -> array([[ 0.04746217,  0.181698  ,  0.01058619, ...
nrc_glove_word                      -> [['thinks', 'that', '@melbahughes', 'had', 'a', 'g
yelp_fasttext_vectors               -> array([[-0.03174149, -0.02313265, -0.01704215, ...
yelp_fasttext_word                  -> [['as', 'someone', 'who', 'has', 'worked', 'with',
yelp_glove_vectors                  -> array([[-8.77902319e