# Cosine Similarity Matrix of Words for pos & neg regarding creativity

In [1]:
from nltk.corpus import stopwords
import gensim
from gensim.parsing.preprocessing import remove_stopwords
import pandas as pd
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
import numpy as np
import scipy

In [139]:
creativity_dict = ['creative', 'new', 'novel', 'interesting', 'genius', 
                   'imaginative', 'ingenious','innovative', 'inventive']

In [149]:
# # Stemming
# ps = PorterStemmer() 
  
# for w in creativity_dict: 
#     print(w, " : ", ps.stem(w)) 

In [150]:
# Lemmatization with gensim requires python 3,6!

# Lemmatization
lemmatizer = WordNetLemmatizer()

# print(lemmatizer.lemmatize('creatively'))

In [4]:
df_full = pd.read_csv('/Users/nessyliu/Desktop/RA/AllReviews_26thNov2019.csv')

# df = df_full.head(10000)
df = df_full

review_list = []
for review in df['review_text']:
    # tokenize review, lower case, remove accented, remove stopwords
    tokens = [lemmatizer.lemmatize(w) 
              for w in gensim.utils.simple_preprocess(remove_stopwords(review), deacc=True, min_len=3)]
    review_list.append(tokens)
print(len(review_list))

1010251


In [5]:
model = gensim.models.Word2Vec(min_count=5, size=300, window = 5)

model.build_vocab(review_list)
model.intersect_word2vec_format("/Users/nessyliu/Desktop/RA/GoogleNews-vectors-negative300.bin", 
                                  binary=True, lockf=1.0)
model.train(review_list, total_examples=len(review_list), epochs=model.epochs)

print(len(list(model.wv.vocab)))

21316


In [18]:
df_pos = pd.DataFrame()
for base_word in creativity_dict:
    top_similar_list = list(set(model.wv.most_similar(positive=[base_word], topn=100)))
    for pair in top_similar_list:
        df_pos.at[pair[0], base_word] = pair[1]

df_pos.loc[:,'Row_Mean'] = df_pos.mean(numeric_only=True, axis=1)
df_pos.loc[:,'Num_NaN'] = df_pos.isnull().sum(axis=1)
df_pos

Unnamed: 0,creative,new,novel,interesting,genius,imaginative,ingenious,innovative,inventive,Row_Mean,Num_NaN
tinkered,0.401562,,,,,,,,,0.401562,8
neat,0.402275,,0.307337,0.471488,0.440187,,0.504657,,,0.425189,4
tweek,0.370045,,,,,,,,,0.370045,8
opportunity,0.365450,,,,,,,,,0.365450,8
challenging,0.354656,,,,,,,,0.406224,0.380440,7
...,...,...,...,...,...,...,...,...,...,...,...
unpredictable,,,,,,,,,0.414377,0.414377,8
grest,,,,,,,,,0.417058,0.417058,8
inexspensive,,,,,,,,,0.395711,0.395711,8
classy,,,,,,,,,0.422217,0.422217,8


In [19]:
# Drop the rows with high Num_NaN
df_pos_dropped = df_pos.copy()

# Get names of indexes to drop
indexNames = df_pos_dropped[df_pos_dropped['Num_NaN'] >= 6].index
 
# Delete these row indexes from dataFrame
df_pos_dropped.drop(indexNames , inplace=True)
df_pos_dropped = df_pos_dropped.sort_values(by =['Row_Mean'], ascending=False)
df_pos_dropped = df_pos_dropped.head(100)
extended_creativity_dict = list(set(list(df_pos_dropped.index)+creativity_dict))
df_pos_dropped

Unnamed: 0,creative,new,novel,interesting,genius,imaginative,ingenious,innovative,inventive,Row_Mean,Num_NaN
inventive,0.5232,,0.348066,,0.380742,0.74121,0.606477,0.649409,,0.541517,3
imaginative,0.411242,,0.395319,,0.394292,,0.582219,0.591492,0.74121,0.519296,3
clever,0.491569,,0.411869,0.360856,0.571555,0.513869,0.644944,0.501276,0.599966,0.511988,1
ingenious,0.360706,,0.41271,,0.472336,0.582219,,0.528395,0.606477,0.493807,3
innovative,0.425377,,0.345816,,0.385818,0.591492,0.528395,,0.649409,0.487718,3
creativity,0.545321,,,,,0.451726,,0.41713,0.428902,0.46077,5
intricate,,,,,0.370379,0.513993,0.475568,,0.478061,0.4595,5
ingenuity,,,,,0.455595,0.4499,0.48825,0.435454,0.440036,0.453847,4
quirky,,,0.321381,,,0.54159,0.419968,,0.515763,0.449676,5
inspiring,,,,,0.406429,0.474658,,0.438969,0.47281,0.448217,5


In [21]:
uncreativity_dict = ['old', 'bore', 'worn', 'uninteresting', 'uninteresting','uninspired','boring','bland']

df_neg = pd.DataFrame()
for base_word in uncreativity_dict:
    top_similar_list = list(set(model.wv.most_similar(positive=[base_word], topn=100)))
    for pair in top_similar_list:
        df_neg.at[pair[0], base_word] = pair[1]

df_neg.loc[:,'Row_Mean'] = df_neg.mean(numeric_only=True, axis=1)
df_neg.loc[:,'Num_NaN'] = df_neg.isnull().sum(axis=1)
df_neg

# Drop the rows with high Num_NaN
df_neg_dropped = df_neg.copy()

# Get names of indexes to drop
indexNames_neg = df_neg_dropped[df_neg_dropped['Num_NaN'] >= 5].index
 
# Delete these row indexes from dataFrame
df_neg_dropped.drop(indexNames_neg, inplace=True)
df_neg_dropped = df_neg_dropped.sort_values(by =['Row_Mean'], ascending=False)
df_neg_dropped = df_neg_dropped.head(100)
extended_uncreativity_dict = list(set(list(df_neg_dropped.index)+uncreativity_dict))
df_neg_dropped

Unnamed: 0,old,bore,worn,uninteresting,uninspired,boring,bland,Row_Mean,Num_NaN
uninspiring,,,,0.635065,0.805005,0.429899,,0.623323,4
unexciting,,,,0.686163,0.654778,0.457475,,0.599472,4
blah,,,,,0.41802,0.681188,0.660554,0.586587,4
dull,,,,0.560801,0.527137,0.660049,0.503844,0.562958,3
unremarkable,,,,0.635921,0.58714,0.420924,,0.547995,4
unimpressive,,,,0.60248,0.678542,0.348398,,0.54314,4
unmemorable,,,,0.620137,0.646331,0.346731,,0.537733,4
disappointingly,,,,0.641699,0.628573,0.338065,,0.536112,4
forgettable,,,,0.582091,0.639951,0.356318,,0.52612,4
flavorless,,,,0.491648,0.487514,0.539359,0.527805,0.511582,3


In [28]:
print("Dict for creativity:")
print("\n(containing", len(extended_creativity_dict), "words)")
print("\n",extended_creativity_dict)

print("\nDict for uncreativity:")
print("(containing", len(extended_uncreativity_dict), "words)")
print("\n",extended_uncreativity_dict)

Dict for creativity:

(containing 39 words)

 ['amusing', 'simplistic', 'innovative', 'hilarious', 'invented', 'untraditional', 'intuitive', 'bizarre', 'sophisticated', 'unorthodox', 'neat', 'brilliant', 'quirky', 'fascinating', 'invent', 'novel', 'originality', 'inventive', 'intricate', 'clever', 'experimentation', 'resourceful', 'imaginative', 'invention', 'creative', 'teriffic', 'genius', 'new', 'uncomplicated', 'intrigued', 'creativity', 'ingenious', 'interesting', 'intriguing', 'ingenuity', 'crafty', 'inspiring', 'unconventional', 'creatively']

Dict for uncreativity:
(containing 41 words)

 ['unattractive', 'muddled', 'disappointing', 'monotonous', 'boring', 'devoid', 'enjoyable', 'blah', 'lifeless', 'exciting', 'humdrum', 'uninspiring', 'unappealing', 'uninspired', 'disappointingly', 'bore', 'old', 'unmemorable', 'tastless', 'tasteless', 'worn', 'mediocre', 'unimpressed', 'meh', 'bland', 'drab', 'pointless', 'dull', 'lackluster', 'flavorless', 'pathetic', 'stodgy', 'unexciting',

# PMI

PMI(A,B) = (P(A,B))/ (P(A)*P(B))

P(A,B) = (number of doc where A B both occur)/ total number of doc

P(A) = (number of doc where A occurs)/ total number of doc

P(B) = (number of doc where B occurs)/ total number of doc

In [107]:
### Subset a sample 

df_sample = df_full.head(10000)

review_list_sample = []
for review in df_sample['review_text']:
    # tokenize review, lower case, remove accented, remove stopwords
    tokens = [lemmatizer.lemmatize(w) 
              for w in gensim.utils.simple_preprocess(remove_stopwords(review), deacc=True, min_len=3)]
    review_list_sample.append(tokens)
print(len(review_list_sample))

model_sample = gensim.models.Word2Vec(min_count=5, size=300, window = 5)

model_sample.build_vocab(review_list_sample)
model_sample.intersect_word2vec_format("/Users/nessyliu/Desktop/RA/GoogleNews-vectors-negative300.bin", 
                                  binary=True, lockf=1.0)
model_sample.train(review_list_sample, total_examples=len(review_list_sample), epochs=model_sample.epochs)

print(len(list(model_sample.wv.vocab)))

10000
2631


In [109]:
# Create a dictionary for each vocabulary (key) in the word2vec model with its probability (value)

N = len(review_list_sample)
vocab_prob = {}
for vocab in model_sample.wv.vocab:
    vocab_prob[vocab] = sum([vocab in review for review in review_list_sample])/ N

In [110]:
# Create a PMI df, each column is a dict word (A), each row is a vocab in review data (B), values are P(A,B)

sample_dict = [word for word in extended_creativity_dict if word in model_sample.wv.vocab]
print(sample_dict)

df_PMI = pd.DataFrame()
i = 1
for A in sample_dict:
    print(i,'/',len(sample_dict))
    i+=1
    for B in model_sample.wv.vocab:
        P_A = vocab_prob[A]
        P_B = vocab_prob[B]
        P_AB = sum([(A in review and B in review) for review in review_list_sample]) / N
        df_PMI.at[B, A] = P_AB / (P_A * P_B)


['creative', 'genius', 'new', 'interesting']
1 / 4
2 / 4
3 / 4
4 / 4


In [142]:
df_PMI.head(20)

Unnamed: 0,creative,genius,new,interesting
loved,1.88253,0.941265,1.044641,0.684556
thanks,3.229974,3.229974,1.717674,1.761804
sharing,0.0,5.319149,1.475833,1.934236
recipe,1.508205,1.508205,1.422769,1.096876
too,0.0,4.045307,0.935331,2.942042
die,0.0,78.125,3.612717,0.0
for,0.0,5.230126,1.692989,0.0
very,1.69837,0.0,0.628299,0.0
basic,32.051282,0.0,2.96428,0.0
need,0.0,3.396739,1.727821,2.470356


# Closest Sentences

In [144]:
# Sent tokenize the reviews
sentence_list = []
raw_list = []
for review in df_sample['review_text']:
    sentences_in_review = sent_tokenize(review)
    for sent in sentences_in_review:
        tokens = [lemmatizer.lemmatize(w) 
                  for w in gensim.utils.simple_preprocess(remove_stopwords(review), deacc=True, min_len=3)]
        sentence_list.append(tokens)
        raw_list.append(sent)
print(len(sentence_list))

38960


In [147]:
# mean vector of the extended creative dict
creative_vec = np.mean([model_sample.wv[word] for word in sample_dict], axis=0)

# df storing each sentence with its vector value (mean of the vectors of all its words)
sent_vector_list = []
cosine_distance_list = []
for sent in sentence_list:
#     try:
#         sent_vector = np.mean([model_sample.wv[token] for token in sent])
#         sent_vector_list.append(sent_vector)
#         cosine_distance_list.append(scipy.spatial.distance.cosine(sent_vector, creative_vec))
#     except: # if the word not in vocab
#         sent_vector = 'None'
#         sent_vector_list.append(sent_vector)
#         cosine_distance_list.append('None')
    token_vec_list = []
    for token in sent:
        try:
            token_vec = model_sample.wv[token]
        except:
            # continue
            token_vec = np.zeros(300)
        token_vec_list.append(token_vec)
    
    sent_vector = np.mean(token_vec_list, axis=0)
    sent_vector_list.append(sent_vector)
    cosine_distance_list.append(abs(scipy.spatial.distance.cosine(sent_vector, creative_vec)))
    
df_sent = pd.DataFrame({
    'raw': raw_list,
    #'sentence' : sentence_list,
    'vector' : sent_vector_list,
    'cos_dist': cosine_distance_list
})
df_sent = df_sent.drop_duplicates(subset='cos_dist', keep="first")

In [148]:
df_sent = df_sent.sort_values(by =['cos_dist'], ascending=True)

df_sent.head(20)

Unnamed: 0,raw,vector,cos_dist
13630,I really love this recipe -- it's got a sweet ...,"[0.123237014, -0.018454539, 0.036114357, 0.304...",0.26198
28619,In my son's new top 10 of favorites.,"[0.018842475, 0.1026762, -0.00029796362, 0.475...",0.281492
15272,i made this as presented and it was very good ...,"[0.17010970103243986, 0.14826197984317938, 0.0...",0.294592
18675,A new favorite!!!!!,"[0.06800623, 0.1639131, 0.31489766, 0.3450297,...",0.298927
33332,So different and SO GOOD!,"[-0.010801996, 0.013818285, 0.009665249, 0.361...",0.300156
18208,This is my familiy's new favorite!,"[0.09201975688338279, 0.11644953414797783, 0.1...",0.307868
32837,I truly love Chef Johns recipes and this one i...,"[-0.027955972822383046, -0.046079766008188014,...",0.308222
23363,"Wow, that was awesome.","[-0.070553094, -0.038026787, 0.029123444, 0.40...",0.30953
2296,"this is so great, simple and tastes delicious!!","[0.014271491, 0.019467827, 0.01845777, 0.35075...",0.316049
2928,Strange name that brings strange comments but ...,"[0.05712676, 0.042948022, -0.0030723654, 0.150...",0.316661
