# Sentiment Analysis on Google Play store apps

<h2> Import Data 

In [1]:
import pandas as pd
import numpy as np

import re
import os
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [2]:
reviews = pd.read_csv('googleplaystore_user_reviews.csv')
reviews = reviews.dropna()
reviews.head(10)

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3
5,10 Best Foods for You,Best way,Positive,1.0,0.3
6,10 Best Foods for You,Amazing,Positive,0.6,0.9
8,10 Best Foods for You,"Looking forward app,",Neutral,0.0,0.0
9,10 Best Foods for You,It helpful site ! It help foods get !,Neutral,0.0,0.0
10,10 Best Foods for You,good you.,Positive,0.7,0.6
11,10 Best Foods for You,Useful information The amount spelling errors ...,Positive,0.2,0.1


<h2> Data cleaning & wrangling

In [3]:
# encode sentiment into numeric values
conditions = [
    (reviews['Sentiment'] == 'Positive'),
    (reviews['Sentiment'] == 'Neutral'),
    (reviews['Sentiment'] == 'Negative')]

choices = [1, 0, -1]
reviews['Sentiment_encode'] = np.select(conditions, choices, default= None)

In [4]:
# take a look at the sentiment distribution
reviews.Sentiment_encode.value_counts() 

 1    23998
-1     8271
 0     5158
Name: Sentiment_encode, dtype: int64

In [5]:
# clean text data
def clean_text(sentence):
    sent = sentence.lower()  # lowercase
    sent = re.sub(r'[^\w\s]',' ',sent) # remove punctuation
    sent = sent.replace(os.linesep,"")  # remove line break
    sent = re.sub(r'\d+','',sent)  # remove digits
#     sent = ' '.join([tok for tok in sent.split() if tok not in STOP_WORDS]) # remove stopwords vs. with stopwords
    return sent

In [6]:
reviews['reviews'] = reviews['Translated_Review'].apply(clean_text)
reviews.head()

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity,Sentiment_encode,reviews
0,10 Best Foods for You,I like eat delicious food. That's I'm cooking ...,Positive,1.0,0.533333,1,i like eat delicious food that s i m cooking ...
1,10 Best Foods for You,This help eating healthy exercise regular basis,Positive,0.25,0.288462,1,this help eating healthy exercise regular basis
3,10 Best Foods for You,Works great especially going grocery store,Positive,0.4,0.875,1,works great especially going grocery store
4,10 Best Foods for You,Best idea us,Positive,1.0,0.3,1,best idea us
5,10 Best Foods for You,Best way,Positive,1.0,0.3,1,best way


In [9]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(reviews['reviews'],reviews['Sentiment_encode'],test_size = 0.3, random_state=0)
X_train.head()

8671     been using paid version years now  originally ...
29070    i love app  using ages  however latest ver   s...
58115    i hate  weeks waiting items i find not getting...
12111                                             the best
2609     new tos data collection   i m out     uninstal...
Name: reviews, dtype: object

In [15]:
train, test = train_test_split(reviews,test_size = 0.3, random_state=0)
train.head()

Unnamed: 0,App,Translated_Review,Sentiment,Sentiment_Polarity,Sentiment_Subjectivity,Sentiment_encode,reviews
8671,Apex Launcher,Been using paid version years now. Originally ...,Positive,0.34,0.68,1,been using paid version years now originally ...
29070,ConvertPad - Unit Converter,"I love app, using ages, however latest ver 3.1...",Positive,0.425,0.55,1,i love app using ages however latest ver s...
58115,H&M,I hate 2 weeks waiting items I find NOT gettin...,Negative,-0.9,0.95,-1,i hate weeks waiting items i find not getting...
12111,Bagan - Myanmar Keyboard,The best,Positive,1.0,0.3,1,the best
2609,AC - Tips & News for Android™,New TOS data collection.. I'm out!!! (Uninstal...,Positive,0.266335,0.454545,1,new tos data collection i m out uninstal...


<h2>logistic regression

In [19]:
# logistic regression
from sklearn.feature_extraction.text import TfidfTransformer
# from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

text_clf_LR = Pipeline([('vect', CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')),
                         ('tfidf', TfidfTransformer()),
                         ('clf', LogisticRegression())])

text_clf_LR.fit(train.reviews.values, train.Sentiment_encode.astype('int')) 
predicted_LR = text_clf_LR.predict(test.reviews.values)

print('predicted values:',predicted_LR)
accuracy_score(predicted_LR, test.Sentiment_encode.astype('int'))   # logistic regression 有0.87 accuracy
# accuracy_score(predicted_LR, X_test.values)

predicted values: [-1  0 -1 ...  1 -1  1]


0.8831596758393445

<h3>N-gram with logistic regression：（-> unigram with stopwords效果最好）

In [20]:
# testing
CountVectorizer().get_params().keys() # check the available params of CountVectorizer()

dict_keys(['analyzer', 'binary', 'decode_error', 'dtype', 'encoding', 'input', 'lowercase', 'max_df', 'max_features', 'min_df', 'ngram_range', 'preprocessor', 'stop_words', 'strip_accents', 'token_pattern', 'tokenizer', 'vocabulary'])

In [21]:
# compare accuracy of unigram, bigram, trigram

cvec = CountVectorizer()
lr = LogisticRegression()
n_features = np.arange(10000,100001,10000)  # 这里我只取了10000作为max_features

# cvec.set_params(stop_words = STOP_WORDS, max_features=10000, ngram_range=(1,1))    # unigram without stopwords: 0.8721
# cvec.set_params(max_features=10000, ngram_range=(1,2))  # bigram without stopwords: 0.87024
# cvec.set_params(max_features=10000, ngram_range=(1,3))  # trigram without stopwords: 0.8705

text_clf_LR = Pipeline([('vect', cvec),
                         ('tfidf', TfidfTransformer()),
                         ('clf', LogisticRegression())])

text_clf_LR.fit(train.reviews.values, train.Sentiment_encode.astype('int')) 
predicted_LR = text_clf_LR.predict(test.reviews.values)
print('predicted values:',predicted_LR)
accuracy_score(predicted_LR, test.Sentiment_encode.astype('int'))   

predicted values: [-1  0 -1 ...  1 -1  1]


0.8859203847181405

CountVectorizer()的params: 排除了stopwords之后，unigram的表现最好，好于bigram和trigram
- 有stopwords跟排除stopwords结果差不多；
- max_features增加，accuracy反而下降

In [74]:
# TODO： plot出unigram, bigram, trigram比较图(iterate over num. of features)
# 参考：https://towardsdatascience.com/another-twitter-sentiment-analysis-with-python-part-4-count-vectorizer-b3f4944e51b5

<h3>TfidfVectorizer() vs. CountVectorizer() (-> 没有显著差别）

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
TfidfVectorizer().get_params().keys()

dict_keys(['analyzer', 'binary', 'decode_error', 'dtype', 'encoding', 'input', 'lowercase', 'max_df', 'max_features', 'min_df', 'ngram_range', 'norm', 'preprocessor', 'smooth_idf', 'stop_words', 'strip_accents', 'sublinear_tf', 'token_pattern', 'tokenizer', 'use_idf', 'vocabulary'])

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

tvec = TfidfVectorizer()
lr = LogisticRegression()

tvec.set_params(stop_words = STOP_WORDS, max_features=10000, ngram_range=(1,1))  # unigram  -> o.8721
# tvec.set_params(stop_words = STOP_WORDS, max_features=10000, ngram_range=(1,2))  # bigram  -> 0.87024

text_clf_LR_tfidf = Pipeline([('vect', tvec),
                         ('clf', LogisticRegression())])

text_clf_LR_tfidf.fit(train.reviews.values, train.Sentiment_encode.astype('int')) 
predicted_LR = text_clf_LR_tfidf.predict(test.reviews.values)
print('predicted values:',predicted_LR)
accuracy_score(predicted_LR, test.Sentiment_encode.astype('int'))

predicted values: [ 0 -1 -1 ...  1 -1  1]


0.8721168403241607

之前用CountVectorizer()是pipeline包含了tf-idf transformer的，所以performance跟使用tdidfVectorizer()效果一样

<h3> Other Classifier

In [24]:
# linear SVC
from sklearn.svm import LinearSVC

tvec = TfidfVectorizer()
svc = LinearSVC()

# tvec.set_params(stop_words = STOP_WORDS, max_features=10000, ngram_range=(1,1))  # unigram -> 0.893
tvec.set_params(max_features=10000, ngram_range=(1,3))  # trigram -> 0.8941 accuracy

text_clf_svc_tfidf = Pipeline([('vect', tvec),
                         ('clf', LinearSVC())])

text_clf_svc_tfidf.fit(train.reviews.values, train.Sentiment_encode.astype('int')) 
predicted_svc = text_clf_svc_tfidf.predict(test.reviews.values)
print('predicted values:',predicted_svc)
accuracy_score(predicted_svc, test.Sentiment_encode.astype('int'))   

predicted values: [ 0 -1 -1 ...  1 -1  1]


0.9244812538961618

In [25]:
# RidgeClassifier()
from sklearn.linear_model import RidgeClassifier

tvec = TfidfVectorizer()
rc = RidgeClassifier()

# tvec.set_params(stop_words = STOP_WORDS, max_features=10000, ngram_range=(1,1))   # unigram: 0.844
tvec.set_params(max_features=10000, ngram_range=(1,2)) # bigram: 0.856

text_clf_rc_tfidf = Pipeline([('vect', tvec),
                         ('clf', RidgeClassifier())])

text_clf_rc_tfidf.fit(train.reviews.values, train.Sentiment_encode.astype('int')) 
predicted_rc = text_clf_rc_tfidf.predict(test.reviews.values)
print('predicted values:',predicted_rc)
accuracy_score(predicted_rc, test.Sentiment_encode.astype('int'))  

predicted values: [ 0  1 -1 ...  1  1  1]


0.8773710927063852

In [26]:
# PassiveAggressiveClassifier()

from sklearn.linear_model import PassiveAggressiveClassifier

tvec = TfidfVectorizer()
pac = PassiveAggressiveClassifier()

# tvec.set_params(stop_words = STOP_WORDS, max_features=10000, ngram_range=(1,1))  # unigram: 0.893
tvec.set_params(max_features=10000, ngram_range=(1,2))    # bigram: 0.895 (max)

text_clf_pac_tfidf = Pipeline([('vect', tvec),
                         ('clf', PassiveAggressiveClassifier())])

text_clf_pac_tfidf.fit(train.reviews.values, train.Sentiment_encode.astype('int')) 
predicted_pac = text_clf_pac_tfidf.predict(test.reviews.values)
print('predicted values:',predicted_pac)
accuracy_score(predicted_pac, test.Sentiment_encode.astype('int'))  

predicted values: [ 0 -1 -1 ...  1 -1  1]


0.9279544037759373

<h4>结果:</h4> 
linear_SVC(trigram:0.894) | PassiveAggressiveClassifier(bigram:0.895) | Logistic_regression (unigram: 0.87021) | Ridge Classifier (bigram: 0.856) 

<h3> ensemble classifier

结合上面几个performance比较好的classifier,去建一个ensemble classifier。再看performance是否变好

In [32]:
# 此function用于比较pipeline classifier, 基于accuracy和training time (TODO：内部结构待弄懂)

from time import time

def accuracy_summary(pipeline, x_train, y_train, x_test, y_test):
    if len(x_test[y_test == 0]) / (len(x_test)*1.) > 0.5:
        null_accuracy = len(x_test[y_test == 0]) / (len(x_test)*1.)
    else:
        null_accuracy = 1. - (len(x_test[y_test == 0]) / (len(x_test)*1.))
    t0 = time()
    sentiment_fit = pipeline.fit(x_train, y_train)
    y_pred = sentiment_fit.predict(x_test)
    train_test_time = time() - t0
    accuracy = accuracy_score(y_test, y_pred)
    print ("null accuracy: {0:.2f}%".format(null_accuracy*100))
    print ("accuracy score: {0:.2f}%".format(accuracy*100))
    if accuracy > null_accuracy:
        print ("model is {0:.2f}% more accurate than null accuracy".format((accuracy-null_accuracy)*100))
    elif accuracy == null_accuracy:
        print ("model has the same accuracy with the null accuracy")
    else:
        print ("model is {0:.2f}% less accurate than null accuracy".format((null_accuracy-accuracy)*100))
    print ("train and test time: {0:.2f}s".format(train_test_time))
    print ("-"*80)
    return accuracy, train_test_time



In [33]:
# compare the accuracy and training time for each classifier

from sklearn.feature_selection import SelectFromModel
from sklearn.naive_bayes import MultinomialNB

names = ["Logistic Regression", "Linear SVC", "LinearSVC with L1-based feature selection","Multinomial NB", 
         "Ridge Classifier", "Passive-Aggresive"]

classifiers = [
    LogisticRegression(),
    LinearSVC(),
    Pipeline([
  ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False))),
  ('classification', LinearSVC(penalty="l2"))]),
    MultinomialNB(),
    RidgeClassifier(),
    PassiveAggressiveClassifier()
    ]

zipped_clf = zip(names,classifiers)
tvec = TfidfVectorizer()

def classifier_comparator(vectorizer=tvec, n_features=10000, stop_words=None, ngram_range=(1, 1), classifier=zipped_clf):
    result = []
    vectorizer.set_params(stop_words=STOP_WORDS, max_features=n_features, ngram_range=ngram_range)
    for n,c in classifier:
        checker_pipeline = Pipeline([
            ('vectorizer', vectorizer),
            ('classifier', c)
        ])
        print ("Validation result for {}".format(n))
        print (c)
#         clf_accuracy,tt_time = accuracy_summary(checker_pipeline, x_train, y_train, x_validation, y_validation)
        clf_accuracy,tt_time = accuracy_summary(checker_pipeline,train.reviews.values, train.Sentiment_encode.astype('int'), test.reviews.values, test.Sentiment_encode.astype('int'))
        result.append((n,clf_accuracy,tt_time))
    return result

bigram_result = classifier_comparator(n_features=10000,ngram_range=(1,2))
bigram_result

Validation result for Logistic Regression
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
null accuracy: 86.21%
accuracy score: 87.02%
model is 0.81% more accurate than null accuracy
train and test time: 3.63s
--------------------------------------------------------------------------------
Validation result for Linear SVC
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
null accuracy: 86.21%
accuracy score: 89.34%
model is 3.13% more accurate than null accuracy
train and test time: 2.70s
--------------------------------------------------------------------------------
Validation result for LinearSVC with L1-base

[('Logistic Regression', 0.8702466826965892, 3.626021146774292),
 ('Linear SVC', 0.8934010152284264, 2.7019970417022705),
 ('LinearSVC with L1-based feature selection',
  0.8942025113545284,
  4.080567121505737),
 ('Multinomial NB', 0.735684388636566, 2.160248041152954),
 ('Ridge Classifier', 0.8567103036779766, 3.3868720531463623),
 ('Passive-Aggresive', 0.8952711728559979, 3.515577793121338)]

In [34]:
# compare results of each classifier and emsemble classifier

from sklearn.ensemble import VotingClassifier

clf1 = LogisticRegression()
clf2 = LinearSVC()
clf3 = MultinomialNB()
clf4 = RidgeClassifier()
clf5 = PassiveAggressiveClassifier()

eclf = VotingClassifier(estimators=[('lr', clf1), ('svc', clf2), ('mnb', clf3), ('rcs', clf4), ('pac', clf5)], voting='hard')

for clf, label in zip([clf1, clf2, clf3, clf4, clf5, eclf], ['Logistic Regression', 'Linear SVC', 'Multinomial NB', 'Ridge Classifier', 'Passive Aggresive Classifier', 'Ensemble']):
    checker_pipeline = Pipeline([
            ('vectorizer', TfidfVectorizer(max_features=10000,ngram_range=(1, 2))),
            ('classifier', clf)
        ])
    print ("Validation result for {}".format(label))
    print (clf)
#     clf_accuracy,tt_time = accuracy_summary(checker_pipeline, x_train, y_train, x_validation, y_validation)
    clf_accuracy,tt_time = accuracy_summary(checker_pipeline,train.reviews.values, train.Sentiment_encode.astype('int'), test.reviews.values, test.Sentiment_encode.astype('int'))
    

Validation result for Logistic Regression
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)
null accuracy: 86.21%
accuracy score: 88.90%
model is 2.69% more accurate than null accuracy
train and test time: 3.21s
--------------------------------------------------------------------------------
Validation result for Linear SVC
LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
null accuracy: 86.21%
accuracy score: 92.51%
model is 6.30% more accurate than null accuracy
train and test time: 3.11s
--------------------------------------------------------------------------------
Validation result for Multinomial NB
Multino

结论：LinearSVC and Passive Aggresive classifier yield the best performance ~92% accuracy, even better than the voting classifier(emsemble) of 90% accuracy.

*P.s: 在此project, 将tutorial中的x_train 改为train.reviews.values，y_train改为 train.Sentiment_encode.astype('int') 即可；<br>
test data同理。

In [None]:
# TODO: compute positive, negative proportion for each word (tutorial Part 5)
# 目的是用于lexical approach for text classification

<h3> Doc2Vec

In [41]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence

def labelize_tweets_ug(reviews,label):
    result = []
    prefix = label
    for i, r in zip(reviews.index, reviews):
        result.append(LabeledSentence(r.split(), [prefix + '_%s' % i]))
    return result

all_x_w2v = labelize_tweets_ug(reviews.reviews, 'all')
all_x_w2v[:5]

[LabeledSentence(words=['i', 'like', 'eat', 'delicious', 'food', 'that', 's', 'i', 'm', 'cooking', 'food', 'myself', 'case', 'best', 'foods', 'helps', 'lot', 'also', 'best', 'before', 'shelf', 'life'], tags=['all_0']),
 LabeledSentence(words=['this', 'help', 'eating', 'healthy', 'exercise', 'regular', 'basis'], tags=['all_1']),
 LabeledSentence(words=['works', 'great', 'especially', 'going', 'grocery', 'store'], tags=['all_3']),
 LabeledSentence(words=['best', 'idea', 'us'], tags=['all_4']),
 LabeledSentence(words=['best', 'way'], tags=['all_5'])]

<h4> TODO: 1. DBOW (Distributed Bag Of Words)

In [47]:
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm  # Instantly make your loops show a smart progress meter
from sklearn import utils
import multiprocessing

cores = multiprocessing.cpu_count()

model_ug_dbow = Doc2Vec(dm=0, size=100, negative=5, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
model_ug_dbow.build_vocab([x for x in tqdm(all_x_w2v)])

for epoch in range(30):
    model_ug_dbow.train(utils.shuffle([x for x in tqdm(all_x_w2v)]), total_examples=len(all_x_w2v), epochs=1)
    model_ug_dbow.alpha -= 0.002
    model_ug_dbow.min_alpha = model_ug_dbow.alpha
    

def get_vectors(model, corpus, size):
    vecs = np.zeros((len(corpus), size))
    n = 0
    for i in corpus.index:
        prefix = 'all_' + str(i)
        vecs[n] = model.docvecs[prefix]
        n += 1
    return vecs

train_vecs_dbow = get_vectors(model_ug_dbow, train.reviews, 100)
validation_vecs_dbow = get_vectors(model_ug_dbow, test.reviews, 100)

clf = LogisticRegression()
# clf.fit(train_vecs_dbow, y_train)
clf.fit(train_vecs_dbow, train.Sentiment_encode.astype('int'))
# clf.score(validation_vecs_dbow, y_validation)
clf.score(validation_vecs_dbow, test.Sentiment_encode.astype('int'))

100%|██████████| 37427/37427 [00:00<00:00, 1768432.50it/s]
100%|██████████| 37427/37427 [00:00<00:00, 2122071.18it/s]
100%|██████████| 37427/37427 [00:00<00:00, 2034186.62it/s]
100%|██████████| 37427/37427 [00:00<00:00, 2325770.65it/s]
100%|██████████| 37427/37427 [00:00<00:00, 2117634.10it/s]
100%|██████████| 37427/37427 [00:00<00:00, 2203509.44it/s]
100%|██████████| 37427/37427 [00:00<00:00, 2082960.24it/s]
100%|██████████| 37427/37427 [00:00<00:00, 2152477.94it/s]
100%|██████████| 37427/37427 [00:00<00:00, 2167426.73it/s]
100%|██████████| 37427/37427 [00:00<00:00, 2299840.54it/s]
100%|██████████| 37427/37427 [00:00<00:00, 2243439.84it/s]
100%|██████████| 37427/37427 [00:00<00:00, 2129671.50it/s]
100%|██████████| 37427/37427 [00:00<00:00, 2026466.35it/s]
100%|██████████| 37427/37427 [00:00<00:00, 2339914.97it/s]
100%|██████████| 37427/37427 [00:00<00:00, 2381194.02it/s]
100%|██████████| 37427/37427 [00:00<00:00, 2328876.00it/s]
100%|██████████| 37427/37427 [00:00<00:00, 2080172.47it/

0.7205450173657494

<h4> TODO: 2.DMC (Distributed Memory Concatenation)

<h2>Deeping Learning

<h3>Representation: Sequence Creation

In [18]:
# tokenization
from keras.preprocessing.text import Tokenizer
## Tokenize the sentences

max_features = 276    # max length of reviews
tokenizer = Tokenizer(num_words=max_features)
# tokenizer.fit_on_texts(list(train_X)+list(test_X))
tokenizer.fit_on_texts(list(X_train)+list(X_test))

X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [24]:
# Pad Sequence
from keras.preprocessing.sequence import pad_sequences

maxlen = 276
X_train = pad_sequences(X_train, maxlen=maxlen)
X_test = pad_sequences(X_test, maxlen=maxlen)    

<h3> Embedding Enrichment

will be using GLoVE Word2Vec embeddings to explain the enrichment.

In [27]:
# load glove index

def load_glove_index():
#     EMBEDDING_FILE = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
    EMBEDDING_FILE = './input/glove.42B.300d.txt'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')[:300]    # 选最常出现的300 words
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE))
    return embeddings_index

glove_embedding_index = load_glove_index()

In [38]:
# list(glove_embedding_index.items())[:3]

In [None]:
# create glove (add polarity and lowercase as well)

def create_glove(word_index,embeddings_index):
    emb_mean,emb_std = -0.005838499,0.48782197
    all_embs = np.stack(embeddings_index.values())
    embed_size = all_embs.shape[1]
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size+4))
    
    count_found = nb_words
    for word, i in tqdm(word_index.items()):
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)
        word_sent = TextBlob(word).sentiment
        # Extra information we are passing to our embeddings
        extra_embed = [word_sent.polarity,word_sent.subjectivity]
        if embedding_vector is not None: 
            embedding_matrix[i] =  np.append(embedding_vector,extra_embed)
        else:
            if word.islower():
                embedding_vector = embeddings_index.get(word.capitalize())
                if embedding_vector is not None: 
                    embedding_matrix[i] = np.append(embedding_vector,extra_embed)
                else:
                    embedding_matrix[i,300:] = extra_embed
                    count_found-=1
            else:
                embedding_matrix[i,300:] = extra_embed
                count_found-=1
    print("Got embedding for ",count_found," words.")
    return embedding_matrix