In [1]:
import numpy as np
import pandas as pd
import scipy.sparse

import time
import random
from collections import Counter

from sklearn.preprocessing import StandardScaler
from sklearn.random_projection import sparse_random_matrix
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

import gensim



***

# DATA:

In [2]:
# Cleaned review text (column `cleanedText` of df_amazon.csv).
cleanedText = pd.read_csv('df_amazon.csv').cleanedText

# The displayed rows show each entry is a Python list of words, i.e. an
# object array. NumPy >= 1.16.3 refuses to unpickle object arrays unless
# allow_pickle=True is passed explicitly.
# NOTE(review): unpickling executes arbitrary code -- only load a trusted file.
adj_listO_words = np.load('df_adjective_listO_words.npy', allow_pickle=True)

# Parse df_adjective_text.csv once and take both columns from the same frame.
df_adjective_text = pd.read_csv('df_adjective_text.csv')
adj_text = df_adjective_text.text
score = df_adjective_text.score

# Combine text, word lists and labels into one frame.
# NOTE(review): assumes the three sources are row-aligned -- TODO confirm.
df_amazon_adj = pd.DataFrame().assign(text = cleanedText, 
                                      adj_listO_words = adj_listO_words,
                                      score = score)
df_amazon_adj.head()

Unnamed: 0,text,adj_listO_words,score
0,have bought several the vitality canned dog fo...,"[good, stew, look, product, like, quality]",positive
1,product arrived labeled jumbo salted peanuts t...,"[unsalted, sized, sure]",negative
2,this confection that has been around few centu...,"[recommend, chewy, heaven, highly, yummy, mout...",positive
3,you are looking for the secret ingredient robi...,"[good, medicinal, soda, flavor]",negative
4,great taffy great price there was wide assortm...,"[price, assortment, yummy, delivery, great, ta...",positive


***

### Fn1: RETURNS LIST OF TFIDF WEIGHTED SENTENCE VECTORS:

In [17]:
def retrn3_listO_tf_w2v_textVec(corpus, w2vmodel, w2v_dims, tfidf_feat, matO_tfidf):
    """Build one TF-IDF-weighted average word vector per document.

    For every document in ``corpus`` the vectors of its distinct, lower-cased
    words are summed, each scaled by that word's value in the matching row of
    ``matO_tfidf``, and the sum is divided by the total of the weights used.

    Parameters
    ----------
    corpus : iterable of iterables of str
        Tokenised documents; the i-th document is paired with row i of
        ``matO_tfidf``.
    w2vmodel : mapping-like
        Must support ``w2vmodel[word]`` and raise ``KeyError`` for a word it
        does not know.
    w2v_dims : int
        Length of the vectors returned by ``w2vmodel``.
    tfidf_feat : list
        Feature names; ``tfidf_feat[j]`` names column j of ``matO_tfidf``.
    matO_tfidf : 2-D matrix
        Anything indexable as ``matO_tfidf[row, col]``.

    Returns
    -------
    tuple or None
        ``(listO_tf_w2v_textVec, listO_anamolous_indices1,
        listO_anamolous_indices2)`` where the first item holds one vector per
        document, the second holds ``(row, word)`` pairs for words missing
        from ``tfidf_feat`` or ``w2vmodel``, and the third holds the rows whose
        total weight was zero (their vector is all zeros).
        ``None`` is returned (after printing a message) when ``tfidf_feat`` is
        not a list.

    Raises
    ------
    Only missing-word lookups (``KeyError``) are tolerated. Indexing or shape
    errors from ``matO_tfidf`` / ``w2vmodel`` propagate instead of being
    silently recorded as anomalies.
    """
    if not isinstance(tfidf_feat, list):
        print('tfidf_feat must be a list')
        return

    # The progress bar is optional: fall back to plain iteration when the
    # third-party `progressbar` package is not installed.
    try:
        from progressbar import ProgressBar
        documents = ProgressBar()(corpus)
    except ImportError:
        documents = corpus

    # One-off feature -> column lookup; replaces a linear list.index() scan per
    # word. setdefault keeps the FIRST index for duplicates, as list.index does.
    colO_feat = {}
    for col, feat in enumerate(tfidf_feat):
        colO_feat.setdefault(feat, col)

    listO_tf_w2v_textVec = []
    listO_anamolous_indices1 = []
    listO_anamolous_indices2 = []

    for row, listO_words in enumerate(documents):
        tf_w2v_textVec = np.zeros(w2v_dims)
        sumO_tf_idf_vals = 0

        # Lower-case BEFORE de-duplicating so "Good"/"good" count once;
        # dict.fromkeys keeps first-seen order, making the output deterministic.
        for word in dict.fromkeys(w.lower() for w in listO_words):
            try:
                col = colO_feat[word]
                wordVec = w2vmodel[word]
            except KeyError:
                # Word unknown to the TF-IDF vocabulary or to the embedding.
                listO_anamolous_indices1.append((row, word))
                continue

            tf_idf_val = matO_tfidf[row, col]
            tf_w2v_textVec += (wordVec * tf_idf_val)
            sumO_tf_idf_vals += tf_idf_val

        if sumO_tf_idf_vals:
            tf_w2v_textVec = tf_w2v_textVec/sumO_tf_idf_vals
        else:
            # No usable word in this document: emit a zero vector and flag the row.
            tf_w2v_textVec = np.zeros(w2v_dims)
            listO_anamolous_indices2.append(row)

        listO_tf_w2v_textVec.append(tf_w2v_textVec)

    print('3 vals returned: listO_tf_w2v_textVec, listO_anamolous_indices1, listO_anamolous_indices2')
    return listO_tf_w2v_textVec, listO_anamolous_indices1, listO_anamolous_indices2

***

# START:

### BOW & TFIDF VECTORIZERS:

In [20]:
# Column of review text that both vectorizers are fitted on.
text = df_amazon_adj['text']

# Same pruning for both models: drop English stop words and terms that occur
# in fewer than 5 documents.
BOW = CountVectorizer(min_df=5, stop_words='english')
TFIDF = TfidfVectorizer(min_df=5, stop_words='english')

# Learn each vocabulary and build the document-term matrices.
matO_bow = BOW.fit_transform(text)
matO_tfidf = TFIDF.fit_transform(text)

In [21]:
# Display the (rows, columns) shape of the count matrix and of the TF-IDF matrix side by side.
matO_bow.shape, matO_tfidf.shape

((364119, 31304), (364119, 31304))

In [22]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(), which returns an ndarray. Use the new
# accessor when it exists and fall back otherwise, so the cell runs on both old
# and new releases. Either way tfidf_feat ends up as a plain list.
if hasattr(TFIDF, 'get_feature_names_out'):
    tfidf_feat = list(TFIDF.get_feature_names_out())
else:
    tfidf_feat = TFIDF.get_feature_names()
len(tfidf_feat)

31304

In [23]:
# Persist the feature names (np.save appends the .npy suffix to the file name)
# and both sparse document-term matrices so later cells can reload them.
np.save(file='tfidf_feat', arr=tfidf_feat, allow_pickle=True, fix_imports=True)
scipy.sparse.save_npz(file='matO_tfidf.npz', matrix=matO_tfidf)
scipy.sparse.save_npz(file='matO_bow.npz', matrix=matO_bow)

***

### CREATING SENTENCE VECTORS:

In [4]:
# Reload the artefacts written by the save cell: feature names and the two
# sparse document-term matrices.
tfidf_feat = np.load('tfidf_feat.npy')
matO_tfidf = scipy.sparse.load_npz('matO_tfidf.npz')
matO_bow = scipy.sparse.load_npz('matO_bow.npz')

# Word2vec vectors stored in the binary word2vec format.
w2v_google = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [5]:
# Read the embedding dimensionality from the model itself instead of probing
# the vector of an arbitrary word, which would raise KeyError if that word
# were absent from the vocabulary.
w2v_dims = w2v_google.vector_size
w2v_dims

300

In [18]:
# Inputs for the sentence-vector builder.
corpus = df_amazon_adj['adj_listO_words'].values
w2vmodel = w2v_google
# The builder only accepts a plain list of feature names.
tfidf_feat = list(tfidf_feat)

# Returns (vectors, (row, word) anomalies, zero-weight rows).
tup3 = retrn3_listO_tf_w2v_textVec(
    corpus=corpus,
    w2vmodel=w2vmodel,
    w2v_dims=w2v_dims,
    tfidf_feat=tfidf_feat,
    matO_tfidf=matO_tfidf,
)

100% (364119 of 364119) |################| Elapsed Time: 0:16:54 Time:  0:16:54


3 vals returned: listO_tf_w2v_textVec, listO_anamolous_indices1, listO_anamolous_indices2


In [19]:
# First element of the returned tuple is the list of sentence vectors.
listO_tf_w2v_textVec = tup3[0]
# Show the vector of the first document as a sanity check.
listO_tf_w2v_textVec[0]

array([-4.92878162e-02,  1.44347557e-01,  6.89371794e-02,  7.15543422e-02,
       -1.22727715e-02,  7.16988656e-02,  1.18965220e-01, -1.15484713e-01,
       -2.74176593e-02,  1.77681201e-01, -1.48702960e-01, -1.97969874e-01,
        7.52617787e-02, -1.63147496e-02, -2.13378979e-01,  8.58427428e-02,
        8.68626852e-02,  1.10201271e-01, -2.52078722e-02, -1.81099368e-01,
       -1.69754713e-02,  9.43785707e-02, -2.97007108e-02,  4.62577035e-02,
        4.70311383e-02, -4.42980394e-03, -1.45439970e-02,  5.62720509e-02,
       -1.24489483e-01,  5.33022844e-02, -8.67931220e-02,  5.13981029e-02,
        6.01518780e-02, -5.10699976e-02,  5.04220448e-02,  3.26314173e-03,
        1.26959211e-01, -1.62630385e-01,  1.03982101e-01,  9.96132148e-02,
       -1.73088230e-02, -1.66139614e-01,  1.29420982e-01, -1.19608596e-02,
       -1.27959048e-01, -2.20956358e-01, -1.18479665e-01,  1.07637609e-01,
        6.67668975e-02,  2.21604317e-03, -3.16172800e-02,  8.18147003e-02,
       -1.27128060e-01, -

In [21]:
# Number of sentence vectors produced.
len(listO_tf_w2v_textVec)

364119

In [None]:
# Take the sentence vectors out of the result tuple and write them to disk
# (np.save appends the .npy suffix to the file name).
listO_tf_w2v_textVec = tup3[0]
np.save(
    file='listO_tf_w2v_textVec',
    arr=listO_tf_w2v_textVec,
    fix_imports=True,
    allow_pickle=True,
)

***