#### The purpose of this notebook is to prepare the data and split it into train and test data sets

In [1]:
import math
import string
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from glove import Corpus, Glove
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Zijun\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Zijun\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
import pickle

In [3]:
def save_obj(obj, name):
    """Pickle ``obj`` into ``./<name>.pkl`` using the highest pickle protocol."""
    target_path = './' + name + '.pkl'
    with open(target_path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Read ``./<name>.pkl`` and return the unpickled object."""
    # NOTE: unpickling can execute arbitrary code - only load files this project wrote itself
    source_path = './' + name + '.pkl'
    with open(source_path, 'rb') as handle:
        loaded = pickle.load(handle)
    return loaded

In [4]:
#import the merged data
filename = "GameData.csv"

df = pd.read_csv(filename,encoding = "mac_roman",sep='\t')

In [5]:
df.head(10)

Unnamed: 0.1,Unnamed: 0,new-price,sales-volume,release-year,genres,name,slug,rating,developers,platforms,tags,match
0,,<lambda>,sum,<lambda>,,,,,,,,
1,product-name-slug,,,,,,,,,,,
2,-30-great-games--family-party,7.88,16,2008.0,,Family Party: 30 Great Games,family-party-30-great-games,0.0,,Wii,[party],True
3,007--agent-under-fire,12.43,97,2002.0,,James Bond 007: Agent Under Fire,james-bond-007-agent-under-fire,3.9,"Electronic Arts,Visceral Games,Electronic Arts...","Xbox,GameCube,PlayStation 2","Multiplayer,Singleplayer",True
4,007--everything-or-nothing,9.987066800602712,143,2004.0,[shooter],James Bond 007: Everything or Nothing,james-bond-007-everything-or-nothing,4.42,"Griptonite Games,Electronic Arts Redwood Shores","PlayStation 2,Game Boy Advance,Xbox,GameCube","Multiplayer,Singleplayer,cooperativeplay",True
5,007--from-russia-with-love,8.94,99,2005.0,[action],James Bond 007: From Russia with Love,james-bond-007-from-russia-with-love,3.91,"Rebellion,Visceral Games,Electronic Arts Redwo...","GameCube,PlayStation 2,PSP,Xbox","Multiplayer,Singleplayer",True
6,007--nightfire,38.1529028714891,255,2003.0,"Action,Shooter",James Bond 007: NightFire,james-bond-007-nightfire,3.93,[gearbox-software],PC,,True
7,007--quantum-of-solace,8.594764740917212,483,2008.0,[action],James Bond 007: Quantum of Solace,james-bond-007-quantum-of-solace,3.6,"Beenox,Vicarious Visions","PlayStation 3,PC,Xbox 360,Wii,Nintendo DS,Play...",,True
8,007--world-is-not-enough,23.33,360,2000.0,[action],007: The World is not Enough,the-world-is-not-enough,4.17,[eurocom],"PC,PlayStation,Nintendo 64","Multiplayer,Singleplayer,exclusive",True
9,007-agent-under-fire,21.208075138121547,1191,2002.0,,James Bond 007: Agent Under Fire,james-bond-007-agent-under-fire,3.9,"Electronic Arts,Visceral Games,Electronic Arts...","Xbox,GameCube,PlayStation 2","Multiplayer,Singleplayer",True


In [6]:
#get rid of the first 2 rows
df = df.iloc[2:]

In [7]:
df = df.rename(columns={"Unnamed: 0": "product-name-slug"})

In [8]:
df = df.reset_index()

In [9]:
df.head()

Unnamed: 0,index,product-name-slug,new-price,sales-volume,release-year,genres,name,slug,rating,developers,platforms,tags,match
0,2,-30-great-games--family-party,7.88,16,2008.0,,Family Party: 30 Great Games,family-party-30-great-games,0.0,,Wii,[party],True
1,3,007--agent-under-fire,12.43,97,2002.0,,James Bond 007: Agent Under Fire,james-bond-007-agent-under-fire,3.9,"Electronic Arts,Visceral Games,Electronic Arts...","Xbox,GameCube,PlayStation 2","Multiplayer,Singleplayer",True
2,4,007--everything-or-nothing,9.987066800602712,143,2004.0,[shooter],James Bond 007: Everything or Nothing,james-bond-007-everything-or-nothing,4.42,"Griptonite Games,Electronic Arts Redwood Shores","PlayStation 2,Game Boy Advance,Xbox,GameCube","Multiplayer,Singleplayer,cooperativeplay",True
3,5,007--from-russia-with-love,8.94,99,2005.0,[action],James Bond 007: From Russia with Love,james-bond-007-from-russia-with-love,3.91,"Rebellion,Visceral Games,Electronic Arts Redwo...","GameCube,PlayStation 2,PSP,Xbox","Multiplayer,Singleplayer",True
4,6,007--nightfire,38.1529028714891,255,2003.0,"Action,Shooter",James Bond 007: NightFire,james-bond-007-nightfire,3.93,[gearbox-software],PC,,True


In [10]:
df["match"].value_counts()

True    11367
Name: match, dtype: int64

In [11]:
#take the lines with match = True
df_game = df[df["match"]==True]

In [12]:
genres_series = []

In [13]:
#create a function to clean genres, platforms and developers
def clean_format(label,df_set):
    """Flatten one multi-valued text column into a flat list of lower-cased labels.

    Each cell of ``df_set[label]`` is handled as follows:

    * a missing value (anything that is not a string, e.g. NaN) or an empty
      string contributes the single placeholder ``"NA"``;
    * a string starting with ``"["`` is treated as one bracketed item: the first
      and last characters are dropped and the remainder is lower-cased;
    * any other string is split on ``","`` and every piece is lower-cased
      (pieces are not stripped of surrounding whitespace).

    Parameters
    ----------
    label : name of the column to read.
    df_set : DataFrame holding that column.

    Returns
    -------
    list of str, in row order, with duplicates preserved.
    """
    result = []
    # iterate the column itself: building a full row Series per record (iterrows) is not needed
    for data_label in df_set[label]:
        if not isinstance(data_label, str) or not data_label:
            # an empty string used to raise IndexError on data_label[0]
            result.append("NA")
        elif data_label[0] == "[":
            result.append(data_label[1:-1].lower())
        else:
            result.extend(x.lower() for x in data_label.split(","))
    return result

In [14]:
genres_output = clean_format("genres",df)

In [15]:
genres_output_unique = set(genres_output)

In [16]:
genres_output_unique 

{'NA',
 'action',
 'adventure',
 'arcade',
 'board games',
 'board-games',
 'card',
 'casual',
 'educational',
 'family',
 'fighting',
 'indie',
 'massively multiplayer',
 'massively-multiplayer',
 'platformer',
 'puzzle',
 'racing',
 'role-playing-games-rpg',
 'rpg',
 'shooter',
 'simulation',
 'sports',
 'strategy'}

In [17]:
developers_output = clean_format("developers",df)

In [18]:
developers_output_unique = set(developers_output)

In [19]:
platforms_output = clean_format("platforms",df)

In [20]:
platforms_output_unique = set(platforms_output)

In [21]:
len(set(platforms_output))

50

In [None]:
tags_output = clean_format("tags",df)

In [None]:
tags_output_unique = set(tags_output)

In [None]:
len(set(tags_output)) 

Feature engineering steps:
1. Change release-year to age
2. On genres, developers and platforms, extract unique values for each case
3. Use vector representation of genres, developers and platforms. This way, we would not only include the number of available tags/features, but we would also be able to include the importance of each feature
4. Change NaN to "NA" as a stand-alone value
5. Normalization if necessary
6. Find a way to add features "Collections" and "Add-On" 
7. Word learning using GloVe

In [None]:
#change the release-year to age
#NOTE(review): 2019 is a hard-coded reference year (presumably the year the data was collected) - confirm before reusing on newer data
df["age"] = 2019 - df["release-year"].astype(float)

In [None]:
#filter out the negative age
df = df[df["age"]>=0]

In [None]:
df

In [None]:
genres_output_unique 

In [None]:
#make a copy of df
df_inuse = df.copy()

In [None]:
#clean genres and platforms
#genres

#first merge duplicated spellings
#'board games' / 'board-games' and 'massively multiplayer' / 'massively-multiplayer' are the same genres
#keep the hyphenated forms 'board-games' and 'massively-multiplayer' and drop the spaced ones
#discard (rather than remove) so that re-running this cell does not raise KeyError
genres_output_unique.discard('board games')
genres_output_unique.discard('massively multiplayer')

In [None]:
#make each genre a binary feature
genres_features_label = list(genres_output_unique)

In [None]:
genres_features_label = ["genre-" + x for x in genres_features_label]

In [None]:
genres_features_label 

In [None]:
#and make them binary features
for title in genres_features_label:
    df_inuse[title] = 0

In [None]:
df_inuse

In [None]:
#populate genre features correctly
#change the function written before
#no need to compare because of how the feature_labels are generated: they came from the genres themselves, except "board games" and "massively multiplayer"
def clean_format_features(label,df_set,label_type):
    """Set the binary indicator columns ``<label_type>-<value>`` to 1 for every row.

    The cell in column ``label`` is parsed with the same rules as ``clean_format``:
    a missing (non-string) or empty value maps to ``"NA"``; a string starting with
    ``"["`` is one bracketed item; any other string is split on ``","``. Every item
    is lower-cased before the column name is built, and the two spaced genre
    spellings are mapped onto their hyphenated column.

    Parameters
    ----------
    label : name of the source column (e.g. ``"genres"``).
    df_set : DataFrame to update. It is modified in place; a column that does
        not exist yet is created by the ``.loc`` assignment.
    label_type : prefix of the indicator columns (e.g. ``"genre"``).

    Returns
    -------
    The same ``df_set`` object.
    """
    # spelling variants that must land in a single feature column (only occur in genres)
    aliases = {"board games": "board-games", "massively multiplayer": "massively-multiplayer"}
    # materialise the (index, value) pairs first because the frame is written to inside the loop
    for index, data_label in list(df_set[label].items()):
        if not isinstance(data_label, str) or not data_label:
            df_set.loc[index, label_type + "-" + "NA"] = 1
            continue
        if data_label[0] == "[":
            # single bracketed item: strip the brackets
            names = [data_label[1:-1]]
        else:
            names = data_label.split(",")
        for name in names:
            # lower-case in BOTH branches; the bracketed branch used to keep the original
            # case, so "[Shooter]" created a stray "genre-Shooter" column
            name = name.lower()
            df_set.loc[index, label_type + "-" + aliases.get(name, name)] = 1
    return df_set

In [None]:
df_genres_cleaned = clean_format_features("genres",df_inuse,"genre")

In [None]:
df_genres_cleaned.iloc[3] 

In [None]:
#clean the platforms in the same fashion
platforms_output_unique

In [None]:
platforms_features_label = list(platforms_output_unique)

In [None]:
platforms_features_label = ["platform-" + x for x in platforms_features_label]

In [None]:
for title in platforms_features_label:
    df_genres_cleaned[title] = 0

In [None]:
df_genres_cleaned.iloc[3]

In [None]:
#populate the platform feature
df_platform_cleaned = clean_format_features("platforms",df_genres_cleaned,"platform")

In [None]:
df_platform_cleaned.iloc[3]

In [None]:
#now populate developers: pick the top 50 in terms of counts and put others in "others"
developers_output 

In [None]:
developers_output_count = Counter(developers_output)

In [None]:
type(developers_output_count)

In [None]:
developers_output_top50 = developers_output_count.most_common(50)

In [None]:
developers_output_top50

In [None]:
developers_output_top50_list = [x[0] for x in developers_output_top50]

In [None]:
developers_output_top50_list.append("Others")

In [None]:
developers_output_top50_list

In [None]:
#make it a dictionary so that it's faster to search
developers_output_top50_dict = {}
for x in developers_output_top50_list:
    developers_output_top50_dict[x] = "developer-"+x

In [None]:
developers_output_top50_dict 

In [None]:
#modify the function to fit developers and tags
def clean_format_features_2(label,df_set,label_dict):
    """Set binary indicator columns for a column whose values are bucketed via ``label_dict``.

    ``label_dict`` maps a lower-cased item (plus the special keys ``"NA"`` and
    ``"Others"``) to the name of its indicator column. Each cell of ``df_set[label]``
    is parsed like in ``clean_format`` (missing/empty -> ``"NA"``, bracketed single
    item, or comma-separated items, all lower-cased). An item that is not a key of
    ``label_dict`` is counted under ``label_dict["Others"]``.

    Parameters
    ----------
    label : name of the source column (e.g. ``"developers"``).
    df_set : DataFrame to update; modified in place.
    label_dict : dict mapping item -> indicator column name.

    Returns
    -------
    The same ``df_set`` object.

    Raises
    ------
    KeyError if an item has to fall back to ``"Others"`` and that key is absent.
    """
    # materialise the (index, value) pairs first because the frame is written to inside the loop
    for index, data_label in list(df_set[label].items()):
        if not isinstance(data_label, str) or not data_label:
            # missing value; "NA" falls back to "Others" when it is not one of the kept keys
            names = ["NA"]
        elif data_label[0] == "[":
            # single bracketed item; lower-cased so it matches the lower-cased dictionary keys
            # (it used to be looked up in its original case and ended up in "Others")
            names = [data_label[1:-1].lower()]
        else:
            names = [x.lower() for x in data_label.split(",")]

        for name in names:
            column = label_dict[name] if name in label_dict else label_dict["Others"]
            df_set.loc[index, column] = 1

    return df_set

In [None]:
df_new = df_platform_cleaned

In [None]:
#pre-populate the dataset with zeros
for title in developers_output_top50_dict.items():
    df_new[title[1]] = 0

In [None]:
df_new.head(5)

In [None]:
df_developers_cleaned = clean_format_features_2("developers",df_new,developers_output_top50_dict)

In [None]:
df_developers_cleaned.loc[1,"developers"]

In [None]:
df_developers_cleaned.loc[1,"developer-electronic arts"]

In [None]:
#keep working on the same frame under the name df_new (plain assignment: this is an alias, not a copy)
df_new = df_developers_cleaned

In [None]:
#bucket tags the same way as developers: keep the 25 most frequent, everything else is "Others"
tags_output_count = Counter(tags_output)
tags_output_top25 = tags_output_count.most_common(25)
tags_output_top25_list = [tag for tag, _ in tags_output_top25] + ["Others"]

#map every kept tag to the name of its indicator column
tags_output_top25_dict = {tag: "tag-" + tag for tag in tags_output_top25_list}

In [None]:
tags_output_top25_dict

In [None]:
#pre-populate the dataset with zeros
for title in tags_output_top25_dict.items():
    df_new[title[1]] = 0
    
df_new.head(5)

In [None]:
#populate binary features on tags
df_tags_cleaned = clean_format_features_2("tags",df_new,tags_output_top25_dict)

In [None]:
df_tags_cleaned.loc[1,"tags"]

In [None]:
df_tags_cleaned.iloc[1]

##### use GloVe to train game names

In [None]:
#from glove import Corpus, Glove

In [None]:
#construct the vocalbulary/documents/copus using game names
#from nltk.corpus import stopwords 
#from nltk.tokenize import word_tokenize

In [None]:
import nltk
nltk.download('stopwords')

In [None]:
stop_words = set(stopwords.words('english')) 
stop_words

In [None]:
#replace "-" with space in slug and every name would be a document, all the game titles would constinute a corpus
df_new = df_tags_cleaned.copy()

In [None]:
len(df_new) 

In [None]:
#df_new["glove_name"] = df_new["slug"].str.replace("-"," ")

#10-29-2019

df_new["glove_name"] = df_new["product-name-slug"].str.replace("-"," ")

In [None]:
df_new

In [None]:
#ger rid of other symbols such as "[]"

#df_new["glove_name"] = df_new["glove_name"].str.replace("[","")
#df_new["glove_name"] = df_new["glove_name"].str.replace("]","")
#change all to lower cases
#df_new["glove_name"]  =  df_new["glove_name"].str.lower()
#df_new["glove_name"]  = df_new["glove_name"].str.replace("\\","")
#df_new["glove_name"]  = df_new["glove_name"].str.replace("/","")
#df_new["glove_name"]  = df_new["glove_name"].str.replace("&","")

In [None]:
#get rid of all the punctuation
#string.punctuation needs `import string` (added to the import cell at the top of the notebook);
#without it this cell raises NameError on a fresh kernel
punctuation_table = str.maketrans('', '', string.punctuation)
df_new["glove_name"] = df_new["glove_name"].str.translate(punctuation_table)

In [None]:
raw_lines = list(df_new["glove_name"])

In [None]:
raw_lines

In [None]:
lines = [str(x) if isinstance(x,float) else word_tokenize(x) for x in raw_lines]

In [None]:
lines

In [None]:
filtered_lines = []

In [None]:
#get rid of english stopwords
#rebuilt in a single assignment so re-running the cell does not append every title a second time
#a missing (NaN) title arrives as the string 'nan' rather than a token list and is kept as the single token 'nan'
filtered_lines = [
    ['nan'] if l == 'nan' else [w for w in l if w not in stop_words]
    for l in lines
]

In [None]:
filtered_lines

In [None]:
#save filtred_lines
#save_obj(filtered_lines,"filtered_lines")

#new filtered lines using longer slug names
save_obj(filtered_lines,"filtered_lines_ver_2")

##### check most common words (double check if there's any redundant punctuations)

In [None]:
filtered_lines

In [None]:
flatten_lines =  [item for sublist in filtered_lines for item in sublist]
word_counter = Counter(flatten_lines)
word_counter.most_common(200)

##### Train corpus

In [None]:
#now train the corpus
corpus = Corpus() 

In [None]:
corpus.fit(filtered_lines, window=10)

In [None]:
#try word embedding of 50 instead of 20
glove = Glove(no_components=50, learning_rate=0.01)

In [None]:
glove.fit(corpus.matrix, epochs=100, no_threads=4, verbose=True)

In [None]:
glove.add_dictionary(corpus.dictionary)

In [None]:
glove.save('glove.model')

In [None]:
glove.dictionary['james']

In [None]:
glove.dictionary

In [None]:
dir(glove)

In [None]:
print(glove.word_vectors[glove.dictionary['james']])

In [None]:
print(glove.most_similar('007'))

In [None]:
print(glove.most_similar('kart'))

In [None]:
print(glove.most_similar('assassins'))

In [None]:
print(glove.most_similar('dragon'))

In [None]:
print(glove.most_similar('hits'))

In [None]:
print(glove.most_similar('collectors'))

In [None]:
#visualize using t-SNE

#from sklearn.manifold import TSNE
#import matplotlib.pyplot as plt
%matplotlib inline

#function from https://www.kaggle.com/jeffd23/visualizing-word-vectors-with-t-sne

def tsne_transform(model):
    "Project every word vector of a fitted model to 2-D with t-SNE and return (x, y, labels)."
    # one label per dictionary entry, with its vector looked up through the dictionary index
    vocabulary = list(model.dictionary)
    vectors = [model.word_vectors[model.dictionary[term]] for term in vocabulary]

    # NOTE(review): newer scikit-learn releases rename n_iter to max_iter - confirm against the installed version
    reducer = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=10)
    embedded = reducer.fit_transform(vectors)

    # split the 2-D points into separate coordinate lists
    xs = [point[0] for point in embedded]
    ys = [point[1] for point in embedded]

    return xs, ys, vocabulary


In [None]:
x,y,labels_lst = tsne_transform(glove)

In [None]:
full_size = len(x)
full_size

In [None]:
def tsne_plot(x,y,labels,x_min,x_max,y_min,y_max,picname,num_points):
    """Scatter the first ``num_points`` points, annotate each with its label and show the figure.

    The axes are limited to [x_min, x_max] x [y_min, y_max]. ``picname`` is only
    used by the commented-out savefig call.
    """
    plt.figure(figsize=(16, 16))
    for idx, x_val in enumerate(x[:num_points]):
        y_val = y[idx]
        # one scatter call per point, followed by its text label
        plt.scatter(x_val, y_val)
        plt.annotate(
            labels[idx],
            xy=(x_val, y_val),
            xytext=(5, 2),
            textcoords='offset points',
            ha='right',
            va='bottom',
        )

    plt.axis([x_min,x_max,y_min,y_max])
    plt.show()
    #plt.savefig(picname +'.png')

In [None]:
tsne_plot(x,y,labels_lst,-100,100,-100,100,"All_data",300)

In [None]:
#zoom in some interesting areas
tsne_plot(x,y,labels_lst,-100,100,-100,100,"All_data",full_size)

###### tuning

In [None]:
#draw zooms
#for x_value in range(-100,90,10):
#    for y_value in range(-100,90,10):
#        tsne_plot(x,y,labels_lst,x_value,x_value+10,y_value,y_value+10,"All_data",full_size)

In [None]:
glove_model_dict = {}

In [None]:
#the function trains one GloVe model, stores it in the given dictionary and returns that dictionary
def glove_fitting_models(glove_model_dict,learning_rate=0.05,no_components=20,epochs=100,no_threads=4,order=1):
    """Train one GloVe model on the notebook-level ``corpus`` and register it.

    The model is saved to ``glove_<order>.model`` and stored in
    ``glove_model_dict`` under the key ``glove_<order>``; the (mutated)
    dictionary is returned.
    """
    model_key = 'glove_' + str(order)
    fitted = Glove(no_components=no_components, learning_rate=learning_rate)
    fitted.fit(corpus.matrix, epochs=epochs, no_threads=no_threads, verbose=True)
    fitted.add_dictionary(corpus.dictionary)
    fitted.save(model_key + '.model')
    glove_model_dict[model_key] = fitted
    return glove_model_dict

In [None]:
order = [1,2,3,4]
no_components_vector = [10,20,50,100]
epochs_num = 1000

In [None]:
#try different number of components
#pair each model number with its embedding size: indexing no_components_vector[i] with i in 1..4
#skipped the first size and ran past the end of the list (IndexError at i=4)
for i, n_components in zip(order, no_components_vector):
    glove_fitting_models(glove_model_dict,learning_rate=0.01,no_components=n_components,epochs=epochs_num,no_threads=4,order=i)

In [None]:
glove_model_dict

In [None]:
x_glove_2,y_glove_2,labels_lst_glove_2 = tsne_transform(glove_model_dict['glove_2'])

In [None]:
size_glove_2 = len(labels_lst_glove_2)
size_glove_2 

In [None]:
tsne_plot(x_glove_2,y_glove_2,labels_lst_glove_2,-100,100,-100,100,"All_data",size_glove_2)

In [None]:
tsne_plot(x_glove_2,y_glove_2,labels_lst_glove_2,-100,100,-100,100,"All_data",300)

In [None]:
tsne_plot(x_glove_2,y_glove_2,labels_lst_glove_2,20,30,-85,-75,"All_data",full_size)

In [None]:
tsne_plot(x_glove_2,y_glove_2,labels_lst_glove_2,-35,-25,-65,-55,"All_data",full_size)

In [None]:
glove_model_dict_2 = {}

In [None]:
order2 = [5,6,7,8]

In [None]:
#try different number of components
#pair each model number with its embedding size: no_components_vector[i-4] with i in 5..8
#skipped the first size and ran past the end of the list (IndexError at i=8)
for i, n_components in zip(order2, no_components_vector):
    glove_fitting_models(glove_model_dict_2,learning_rate=0.05,no_components=n_components,epochs=5000,no_threads=4,order=i)

In [None]:
glove_model_dict_2

In [None]:
x_glove_6,y_glove_6,labels_lst_glove_6 = tsne_transform(glove_model_dict_2['glove_6'])

In [None]:
size_glove_6= len(labels_lst_glove_6)
size_glove_6 

In [None]:
tsne_plot(x_glove_6,y_glove_6,labels_lst_glove_6,-100,100,-100,100,"All_data",size_glove_6)

In [None]:
tsne_plot(x_glove_6,y_glove_6,labels_lst_glove_6,-100,100,-100,100,"All_data",300)

In [None]:
tsne_plot(x_glove_6,y_glove_6,labels_lst_glove_6,20,30,-85,-75,"All_data",full_size)

In [None]:
tsne_plot(x_glove_6,y_glove_6,labels_lst_glove_6,-35,-25,-65,-55,"All_data",full_size)

In [None]:
#check some key words
#5000 epochs; the original note labels this model as dimension 20 (verify against the tuning loop)
#read the vector from this model's own word_vectors: the base `glove` matrix used to be indexed with this model's dictionary
glove_6_model = glove_model_dict_2['glove_6']
print(glove_6_model.word_vectors[glove_6_model.dictionary['james']])
print(glove_6_model.most_similar('james'))
print(glove_6_model.most_similar('assassins'))

In [None]:
#1000 epochs; the original note labels this model as dimension 20 (verify against the tuning loop)
#read the vector from this model's own word_vectors: the base `glove` matrix used to be indexed with this model's dictionary
glove_2_model = glove_model_dict['glove_2']
print(glove_2_model.word_vectors[glove_2_model.dictionary['james']])
print(glove_2_model.most_similar('james'))
print(glove_2_model.most_similar('assassins'))

In [None]:
#1000 epochs; the original note labels this model as dimension 10 (verify against the tuning loop)
#read the vector from this model's own word_vectors: the base `glove` matrix used to be indexed with this model's dictionary
glove_1_model = glove_model_dict['glove_1']
print(glove_1_model.word_vectors[glove_1_model.dictionary['james']])
print(glove_1_model.most_similar('james'))
print(glove_1_model.most_similar('assassins'))

In [None]:
#1000 epochs; the original note labels this model as dimension 50 (verify against the tuning loop)
#read the vector from this model's own word_vectors: the base `glove` matrix used to be indexed with this model's dictionary
glove_3_model = glove_model_dict['glove_3']
print(glove_3_model.word_vectors[glove_3_model.dictionary['james']])
print(glove_3_model.most_similar('james'))
print(glove_3_model.most_similar('assassins'))

In [None]:
#base model: fitted with epochs=100 and created with no_components=50 (see the "Train corpus" cells)
print(glove.word_vectors[glove.dictionary['james']])
print(glove.most_similar('james'))
print(glove.most_similar('assassins'))

##### end of  tuning

Next, generate the actual vectors to be used as features.

In [None]:
#sentence vectors from the base `glove` model (embedding size = embed_n)
#each title is represented by the mean of the vectors of its words;
#real_vectors keeps the individual word vectors of each title
sentence_vectors = []
real_vectors = []
embed_n = 50
for each_line in filtered_lines:
    if (each_line == 'nan') or (len(each_line) == 0):
        #no usable words: zero sentence vector and no per-word vectors
        #(temp_realv used to be left unset here: NameError on a first empty title,
        # otherwise the previous title's word vectors were silently reused)
        temp = [0]*embed_n
        temp_realv = []
    else:
        temp_realv = [glove.word_vectors[glove.dictionary[w]] for w in each_line]
        temp = np.mean(temp_realv,axis=0)
    sentence_vectors.append(temp)
    real_vectors.append(temp_realv)


In [None]:
len(sentence_vectors)

In [None]:
len(real_vectors[0])

In [None]:
#save realvectors

#save_obj(real_vectors,"word_vectors")

save_obj(real_vectors,"word_vectors_size50")
save_obj(sentence_vectors,"mean_word_vectors_size50")

In [None]:
len(df_new)

In [None]:
len(filtered_lines)

In [None]:
len(real_vectors)

In [None]:
len(sentence_vectors)

In [None]:
#add this into df_new
df_new["glove_vectors"] = sentence_vectors

In [None]:
df_new

In [None]:
#df_new.to_pickle("./CleanedData_firstversion.pk1")
#save as the second version
df_new.to_pickle("./CleanedData_secondversion.pk1")

##### Create vectors using TF-IDF

In [None]:
#from sklearn.feature_extraction.text import TfidfVectorizer

#rebuild every filtered title as one space-separated sentence (stopwords are already removed)
key_sentences = [" ".join(tokens) for tokens in filtered_lines]

In [None]:
key_sentences

In [None]:
vectorizer = TfidfVectorizer(token_pattern=r'\S+')
weights = vectorizer.fit_transform(key_sentences)

In [None]:
#get_feature_names() was removed from newer scikit-learn releases in favour of get_feature_names_out();
#use whichever the installed version provides (tolist() turns the returned array into a plain list of str)
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
#inspect the non-zero tf-idf entries of one example document
doc = 2
feature_index = weights[doc,:].nonzero()[1]
tfidf_scores = zip(feature_index, [weights[doc, x] for x in feature_index])

In [None]:
feature_names

In [None]:
for w, s in [(feature_names[i], s) for (i, s) in tfidf_scores]:
    print(w, s)

In [None]:
tfidf_scores

In [None]:
#now get the scores for each line: score_dict[line number] = {word: tf-idf score}
score_dict = {}
line_dict = {}

total_lines = len(filtered_lines)

for i in range(total_lines):
    #column positions of the non-zero tf-idf entries of line i
    feature_index = weights[i,:].nonzero()[1]
    line_dict = {feature_names[ii]: weights[i, ii] for ii in feature_index}

    #store the scores of the line in the total dictionary
    score_dict[i] = line_dict

In [None]:
score_dict

In [None]:
#export and save the dictionary of score

#save_obj(score_dict,"tfidf_dict")

#save it as a second edition for word embedding size of 50
save_obj(score_dict,"tfidf_dict_size50")


In [None]:
len(score_dict)

In [None]:
len(filtered_lines)

In [None]:
total_lines

In [None]:
#apply tfidf to the vectors

In [None]:
#score_dict{index:{word:score....word:score....}}
#real_vectors: one list of word vectors per title, in the same order as filtered_lines

#tf-idf weighted mean of the word vectors of every title
tfidf_vector = []

for i in range(total_lines):
    v_sum = 0
    s_sum = 0
    for w, word in enumerate(filtered_lines[i]):
        s_temp = score_dict[i][str(word)]
        v_sum = v_sum + real_vectors[i][w] * s_temp
        s_sum = s_sum + s_temp

    if len(filtered_lines[i]) > 0:
        #divide once per title instead of after every word
        temp = v_sum/s_sum
    else:
        #NOTE(review): an empty title gets the scalar 0, not a zero vector - confirm downstream code expects this
        temp = 0

    tfidf_vector.append(temp)


In [None]:
score_dict[0][filtered_lines[0][0]]

In [None]:
filtered_lines[33][0]

In [None]:
score_dict[33]

In [None]:
tfidf_vector[0]

In [None]:
sentence_vectors[0]

In [None]:
#save the tfidf adjusted vector
#save_obj(tfidf_vector,"tfidf_vector_corpus")

#save it as second version
save_obj(tfidf_vector,"tfidf_vector_corpus_size50")

In [None]:
len(df_new)

In [None]:
#add tfidf_vector to the dataframe
#add this into df_new
df_new["tfidf_vectors"] =tfidf_vector

In [None]:
#save this new dataset
#save_obj(df_new,"df_tfidf")

#save it as second version for a vector size of 50
save_obj(df_new,"df_tfidf_size50")

In [None]:
df_new