In [5]:
import pandas as pd
import ast
import re
from gensim.models import Word2Vec, KeyedVectors
import time
import pickle
import numpy as np
from sklearn.decomposition import PCA

In [6]:
# ratings=pd.read_csv('data/movielens_latest/ml-latest-lean/ratings.zip', compression='gzip')
# tags=pd.read_csv('data/movielens_latest/ml-latest-lean/tags.csv')
# movies=pd.read_csv('data/movielens_latest/ml-latest-lean/movies.csv')

# movie_keywords=tags.groupby('movieId').apply(lambda x: x['tag'].tolist())
movie0=pd.read_csv('../data/processed_data/movies_with_kws.csv')

* import tags information

In [7]:
tags_df = pd.read_csv('../data/movielens_latest/ml-latest-lean/tags.csv')
tags_df = tags_df.groupby('movieId')['tag'].apply(lambda x: ' '.join(x)).reset_index()
tags_df['tag'] = tags_df['tag'].apply(lambda x: x.lower().split(' '))

* create keyword columns

In [8]:
def parse_title(s):
    return [a.lower() for a in re.split(' |, | ,|,|\)|\(',s.strip())]

In [9]:
movie = pd.merge(movie0, tags_df, on='movieId', how='left')
movie['description_kws'] = movie['description_kws'].apply(lambda x: ast.literal_eval(x)) # string to list conversion
movie['cast_kws'] = movie['cast_kws'].apply(lambda x: ast.literal_eval(x)) # string to list conversion
movie['genre_kws'] = movie['genre_kws'].apply(lambda x: ast.literal_eval(x)) # string to list conversion
movie['title_kws'] = movie['title'].apply(parse_title)

* clean up null tags

In [10]:
for row in movie.loc[movie['tag'].isnull(), 'tag'].index:
    movie.at[row, 'tag'] = []

* combine all keywords into one string

In [11]:
kws_all = []
for l1, l2, l3, l4 in movie[['description_kws','cast_kws','tag','title_kws']].values:
    kws_all.append(l1+l2+l3+l4)

movie['kws_all'] = kws_all
movie['kws_all'] = movie['kws_all'].apply(lambda x: ' '.join(list(set(x)))) # back to string
movie['kws_all'] = movie['kws_all'].apply(lambda x: x.lower()) # lower case for names
corpus =[re.findall(r'(?u)\b\w+\b',x) for x in movie['kws_all']] # some character cleanup

* word2vec

In [12]:
workdir='C:\\Users\\rohan.thavarajah\\Downloads\\'
t0=time.time()
model=  Word2Vec(size=300, min_count=1)

In [13]:
model.build_vocab(corpus)
print('total key words',len(model.wv.vocab))

total key words 21403


In [14]:
model.intersect_word2vec_format(
        workdir+'GoogleNews-vectors-negative300.bin', binary=True, lockf=1.0)
print(time.time()-t0)
vocab=model.wv.vocab
print('total key words with vectors',len(model.wv.vocab))
# model.wv.save('data/processed_data/kw_vectors.kv')
# to load the vectors
# wv = KeyedVectors.load('data/processed_data/kw_vectors.kv', mmap='r')

41.7380907535553
total key words with vectors 21403


In [15]:
wv = model.wv

In [16]:
wv.similar_by_word('comedy')

[('comedies', 0.7952671647071838),
 ('comedic', 0.7497517466545105),
 ('sitcom', 0.7363550066947937),
 ('dramedy', 0.7125568389892578),
 ('comic', 0.6860707998275757),
 ('satire', 0.662791907787323),
 ('comedian', 0.6516796946525574),
 ('slapstick', 0.6389349102973938),
 ('mockumentary', 0.6205418109893799),
 ('comedians', 0.6155717372894287)]

* pca

In [17]:
pca = PCA(n_components=64)
pca_out = pca.fit_transform(wv.vectors)

* prepare dataframe of words and embedding

In [18]:
# asdf = vocab['again']
# asdf.index
# wv.vectors[470][:5]
# wv.get_vector('again')[:5]

In [19]:
output = {}
for key in vocab:
    output[key] = pca_out[vocab[key].index]

In [20]:
with open('keyword_embedding.pkl', 'wb') as dumpMe:
    pickle.dump(output, dumpMe)

In [23]:
movie[['title','movieId','qmdbId','tmdbId','year','genre_kws','kws_all']].to_csv('../data/processed_data/keywords_all.csv')