In [1]:
import ast
import os
from collections import defaultdict

import gensim as gs
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import simplejson as json
from gensim.models.doc2vec import TaggedDocument
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud
# from nltk import RegexpTokenizer

%matplotlib inline



In [None]:
def idf(x, n_docs=10):
    """Return the (unsmoothed) inverse document frequency ``log(n_docs / x)``.

    Parameters
    ----------
    x : number or numpy array
        Number of documents containing the term; must be non-zero.
    n_docs : number, optional
        Total number of documents in the corpus. Defaults to 10, the value
        that used to be hard-coded here.

    Returns
    -------
    numpy float or numpy array with the same shape as ``x``.
    """
    return np.log(n_docs/x)

# Illustrate how the IDF weight falls as a term appears in more documents.
x = np.linspace(1,10,num=10)
plt.plot(x,idf(x))
# Label the figure so it can be read on its own when skimming the notebook.
plt.xlabel("number of documents containing the term")
plt.ylabel("idf")
plt.title("Inverse document frequency")
plt.show()

In [2]:
class PreprocessIMDBData:
    """Tokenizer that lower-cases text and drops English stop words.

    Attributes are set in ``__init__``:
      * ``pattern``      - regex handed to ``nltk.regexp_tokenize``.
      * ``stopword_set`` - set built from ``stopwords.words('english')``.
    """

    def __init__(self):
        # Raw string: "\w" in a normal literal is an invalid escape sequence.
        self.pattern = r"[\w]+"
        self.stopword_set = set(stopwords.words('english'))

    def tokenizeSentence(self,text):
        """Return the distinct, lower-cased, non-stop-word tokens of ``text``.

        Each token appears at most once (as before), but the result is now in
        first-occurrence order instead of arbitrary set order, so repeated
        runs produce the same list.
        """
        tokens = (w.lower() for w in nltk.regexp_tokenize(text,self.pattern))
        # dict.fromkeys de-duplicates while preserving insertion order.
        return list(dict.fromkeys(t for t in tokens if t not in self.stopword_set))

In [3]:
class VectorizeIMDBData:
    """TF-IDF helper used to filter document vocabulary and draw word clouds.

    Attributes:
      * ``tokenizerObj`` - object exposing ``tokenizeSentence(text) -> list``.
      * ``tfidf``        - ``TfidfVectorizer`` using that tokenizer.
    """

    def __init__(self,tokenizerObj = None):
        """Create the vectorizer.

        ``tokenizerObj`` defaults to a fresh ``PreprocessIMDBData``; it is
        created here rather than in the signature so the default is not
        built once at class-definition time and shared by every instance.
        """
        if tokenizerObj is None:
            tokenizerObj = PreprocessIMDBData()
        self.tokenizerObj = tokenizerObj
        # Unigrams only. The previous (0, 1) lower bound has no meaning for n-grams.
        self.tfidf = TfidfVectorizer(tokenizer=tokenizerObj.tokenizeSentence, ngram_range=(1, 1))

    def getTFIDFMatrix(self,docs):
        """Fit the vectorizer on ``docs``; return ``(tfidf_matrix, vocabulary_)``."""
        return self.tfidf.fit_transform(docs),self.tfidf.vocabulary_

    def getCleanedDocuments(self,docs, threshold = 0.6):
        """Re-fit on ``docs`` and keep only words with ``1/idf <= threshold``.

        Every document is tokenized with ``tokenizerObj``; tokens that are
        missing from the fitted vocabulary or whose inverse IDF exceeds
        ``threshold`` are dropped. Returns one space-joined string per input
        document (possibly empty), in the same order as ``docs``.
        """
        _, vocab = self.getTFIDFMatrix(docs)
        # Boolean mask indexed by vocabulary id.
        keep_word = (1/self.tfidf.idf_) <= threshold

        new_docs = []
        for document in docs:
            kept = []
            for word in self.tokenizerObj.tokenizeSentence(document):
                word_id = vocab.get(word)
                if word_id is not None and keep_word[word_id]:
                    kept.append(word)
            new_docs.append(' '.join(kept))
        return new_docs

    def plot_words(self,threshold = (0.6,)):
        """Draw one word cloud per threshold from the already fitted model.

        ``threshold`` may be a scalar or any sequence/array of thresholds.
        Panels are laid out at most five per row. Requires that the
        vectorizer has been fitted (``getTFIDFMatrix`` /
        ``getCleanedDocuments``) because it reads ``vocabulary_`` and ``idf_``.
        """
        thresholds = np.atleast_1d(threshold)
        if len(thresholds) == 0:
            return

        max_figs_in_row = 5
        rows = int(np.ceil(len(thresholds)/max_figs_in_row))
        cols = min(len(thresholds),max_figs_in_row)

        fig, axes = plt.subplots(rows, cols, figsize = (15, 9))

        vocab = self.tfidf.vocabulary_
        idf_inverse = 1/self.tfidf.idf_

        for i, ax in enumerate(fig.axes):
            ax.set_axis_off()
            # The grid can hold more panels than there are thresholds
            # (e.g. 7 thresholds -> 2x5 grid); leave the spare ones blank.
            if i >= len(thresholds):
                continue

            limit = thresholds[i]
            ax.set_title("1/idf <= {:.2f}".format(float(limit)))
            frequencies = {}
            for key,value in vocab.items():
                if value < len(idf_inverse) and idf_inverse[value] <= limit:
                    frequencies[key] = idf_inverse[value]

            if len(frequencies) <= 0:
                continue

            wc  = WordCloud(max_words=100)
            wc.generate_from_frequencies(frequencies)
            ax.imshow(wc, interpolation='bilinear')
        return

In [4]:
class Utils:
    """Static helpers for turning dataset rows into genre tags / tagged documents."""

    @staticmethod
    def getGenres(genresStr):
        """Parse the ``genres`` cell of a row into a list of lower-cased names.

        ``genresStr`` is expected to be the text of a list of dicts that each
        have a ``'name'`` key, e.g. ``"[{'id': 16, 'name': 'Animation'}]"``.
        Null values (``None`` / NaN) yield ``[]``.

        The text is parsed as a Python literal first, which copes with
        apostrophes inside names. If that fails, the previous behaviour
        (swap single for double quotes and parse as JSON) is used as a
        fallback, so anything that parsed before still parses.
        """
        if pd.isnull(genresStr ):
            return []

        try:
            genreObjects = ast.literal_eval(genresStr)
        except (ValueError, SyntaxError):
            genreObjects = json.loads(genresStr.replace("'",'"'))

        return [genreObject['name'].lower() for genreObject in genreObjects]

    @staticmethod
    def convert_df_array(df,uniqueGenres = None):
        """Convert each row of ``df`` into a gensim ``TaggedDocument``.

        ``words`` are the tokens of the row's ``'overview_cleaned'`` column and
        ``tags`` are the row's genre names. When ``uniqueGenres`` is a
        non-empty container, only genres contained in it are kept as tags;
        when it is ``None`` or empty, every genre is kept.

        Returns a list with one ``TaggedDocument`` per row, in row order.
        """
        tokenizer = PreprocessIMDBData()
        restrict = uniqueGenres is not None and len(uniqueGenres) != 0
        movies = []
        for index,row in df.iterrows():
            tags = Utils.getGenres( row['genres'])
            if restrict:
                movietags = [tag for tag in tags if tag in uniqueGenres]
            else:
                movietags = list(tags)
            # Row id / title are not used as tags (previously commented out).
            movies.append(TaggedDocument(words = tokenizer.tokenizeSentence(row['overview_cleaned']),
                                              tags = movietags))
        return movies

In [5]:
# Shared tokenizer and TF-IDF helper instances used by the cells below.
mytokenizer = PreprocessIMDBData()
myTfIdfGenerator = VectorizeIMDBData(mytokenizer)

# ---- Manual smoke test for the tokenizer / TF-IDF filter (disabled) ----
# docs = ["The sun is shining allowance", "The sun weather is sweet", "the is shining and the weather is sweet"]
# cleaned_docs = myTfIdfGenerator.getCleanedDocuments(docs,threshold=0.8)
# print(cleaned_docs)

In [6]:
# Load only the columns the analysis needs and drop movies without an overview.
cols = ['id','title','genres','overview']
movies_df_clean = (
    # usecols avoids holding the full metadata table in memory; the trailing
    # [cols] restores the requested column order (read_csv keeps file order).
    pd.read_csv('./the-movies-dataset/movies_metadata.csv', usecols=cols, low_memory=False)[cols]
    .dropna(subset=['overview'])
    # Explicit copy: later cells add an 'overview_cleaned' column to this
    # frame, which must not be a view/slice of another frame.
    .copy()
)
movies_df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44512 entries, 0 to 45465
Data columns (total 4 columns):
id          44512 non-null object
title       44506 non-null object
genres      44512 non-null object
overview    44512 non-null object
dtypes: object(4)
memory usage: 1.7+ MB


In [None]:
def datasetOverview(df):
    """Print the average number of genres and of overview tokens per movie.

    ``df`` must provide ``'genres'`` and ``'overview'`` columns. Genres are
    parsed with ``Utils.getGenres`` and overviews are tokenized with the
    notebook-level ``mytokenizer``. For an empty frame both averages are
    reported as 0.0 instead of raising ``ZeroDivisionError``.
    """
    n_movies = len(df)
    if n_movies == 0:
        print("Avg Genre :: ",0.0)
        print("Avg Words :: ",0.0)
        return

    genre_count = sum(len(Utils.getGenres(genres)) for genres in df['genres'])
    word_count = sum(len(mytokenizer.tokenizeSentence(overview)) for overview in df['overview'])

    print("Avg Genre :: ",(genre_count/n_movies))
    print("Avg Words :: ",(word_count/n_movies))
    return

# Report average genres / tokens per movie for the raw overviews.
datasetOverview(movies_df_clean)

In [None]:
# Fit TF-IDF on every overview, then draw one word cloud for each of ten
# evenly spaced 1/idf thresholds between 0.1 and 0.99.
myTfIdfGenerator.getTFIDFMatrix(docs=list(movies_df_clean['overview']))
myTfIdfGenerator.plot_words(threshold=np.linspace(0.1,0.99,num=10))

In [None]:
# Filter each overview down to words with 1/idf <= 0.6 and store the result
# as a new 'overview_cleaned' column.
overview_cleaned = myTfIdfGenerator.getCleanedDocuments(list(movies_df_clean['overview']), threshold = 0.6)
movies_df_clean['overview_cleaned'] = overview_cleaned

In [7]:
def plot_frequencies(movies_df_clean,cutoff = 0.00000001,columnName = 'overview',showGraph = False):
    """Count the distinct words used by each genre.

    Parameters
    ----------
    movies_df_clean : DataFrame with a ``'genres'`` column and ``columnName``.
    cutoff : a genre is reported only if its distinct-word count is strictly
        greater than ``cutoff * len(movies_df_clean)``.
    columnName : text column to tokenize.
    showGraph : when true, draw the counts as a bar chart.

    Returns
    -------
    (df, counts) where ``counts`` maps each reported genre to its number of
    distinct words plus, when at least one genre is reported, a
    ``'VocabSize'`` entry holding the number of distinct words over *all*
    genres; ``df`` is ``counts`` as a one-column DataFrame indexed by key.
    """
    mytokenizer = PreprocessIMDBData()

    # genre -> set of every word seen in an overview tagged with that genre
    genre_unkw = {}
    for index,row in movies_df_clean.iterrows():
        summary = set(mytokenizer.tokenizeSentence(row[columnName]))
        if len(summary) <= 0:
            continue

        for genreObject in Utils.getGenres(row['genres']):
            genre_unkw.setdefault(genreObject, set()).update(summary)

    min_words = cutoff * len(movies_df_clean)
    uniqueWordsSet = set()
    uniqueWordsDict = {}
    for genre,uniqueWords in genre_unkw.items():
        # Every genre contributes to the overall vocabulary, reported or not.
        uniqueWordsSet |= uniqueWords
        if len(uniqueWords) <= min_words:
            continue
        uniqueWordsDict[genre] = len(uniqueWords)

    # Record the total only after all genres were merged; doing it inside the
    # loop left a stale value whenever later genres fell below the cutoff.
    if uniqueWordsDict:
        uniqueWordsDict['VocabSize'] = len(uniqueWordsSet)

    df = pd.DataFrame.from_dict(uniqueWordsDict, orient='index')
    if showGraph and not df.empty:
        df.plot(kind='bar')

    return df,uniqueWordsDict

In [None]:
# Bar chart of distinct-word counts per genre on the raw overviews.
df,_ = plot_frequencies(movies_df_clean,cutoff = 0.1,showGraph=True)

In [None]:
# Sweep the 1/idf threshold and record per-genre distinct-word counts for each value.
thresholds=np.linspace(0.1,0.99,num=10)
overview_docs = list(movies_df_clean['overview'])  # built once, reused for every threshold

plots_list = list()
for threshold in thresholds:
    print("Analyzing for threshold ",threshold)
    overview_cleaned = myTfIdfGenerator.getCleanedDocuments(overview_docs, threshold = threshold)
    # Work on a derived frame so the sweep does not leave the shared
    # movies_df_clean with the last threshold's 'overview_cleaned' column.
    movies_df_threshold = movies_df_clean.assign(overview_cleaned=overview_cleaned)
    df,_ = plot_frequencies(movies_df_threshold,cutoff=0.1,columnName='overview_cleaned')
    plots_list.append(df)
    print("Done with analysis")
print("------------->Done<-------------")

In [None]:
# One bar chart per threshold, three per row; the 'VocabSize' bar is green, genres cyan.
max_figs_in_row = 3
n_rows = int(np.ceil(len(plots_list)/max_figs_in_row))
n_cols = min(len(plots_list),max_figs_in_row)

# figsize is (width, height): width grows with columns, height with rows.
fig, axes = plt.subplots(n_rows, n_cols, figsize = (5*n_cols,9*n_rows))
for i, ax in enumerate(fig.axes):
    if i >= len(plots_list):
        # Spare panel in the last row: hide it instead of showing empty axes.
        ax.set_axis_off()
        continue
    counts = plots_list[i][0]
    bar_colors = ['g' if label == 'VocabSize' else 'c' for label in counts.index]
    # Plot the Series (not the one-column DataFrame) so the colour list is
    # applied per bar rather than per column.
    counts.plot(kind='bar',ax = ax,color = bar_colors)
    ax.set_xlabel("genre")
    ax.set_ylabel("num of unique words, {} threshold".format(thresholds[i]))

In [8]:
# Final cleaning pass used for modelling: keep words with 1/idf <= 0.5 and
# (re)write the 'overview_cleaned' column.
overview_cleaned = myTfIdfGenerator.getCleanedDocuments(list(movies_df_clean['overview']), threshold = 0.5)
movies_df_clean['overview_cleaned'] = overview_cleaned

In [9]:
# Genres that pass the cutoff on the cleaned overviews become the tag whitelist.
_,uniqueGenres = plot_frequencies(movies_df_clean,cutoff=0.1,columnName='overview_cleaned')
# plot_frequencies also stores a synthetic 'VocabSize' entry in this dict; it is
# not a genre, so drop it before the dict is used as a genre set / genre count.
uniqueGenres.pop('VocabSize', None)

# All genres have similar sentiments

In [10]:
from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec

In [11]:
class LabeledIMDBData(object):
    """Iterable container of tagged movie documents.

    Each stored item is expected to expose ``.words`` (token list) and
    ``.tags`` (label list), as ``TaggedDocument`` does.

    Attributes:
      * ``movies``       - the list of documents.
      * ``uniqueGenres`` - container of tags considered valid (may be empty).
    """

    def __init__(self,data,considerTags = None):
        self.movies = data
        # None -> fresh dict, so instances never share one default object.
        self.uniqueGenres = considerTags if considerTags is not None else {}

    @classmethod
    def fromlist(cls,movies_data,considerTags = None):
        """Wrap an existing list of tagged documents."""
        return cls(movies_data,considerTags)

    @classmethod
    def fromDataFrame(cls,movies_df,considerTags = None):
        """Build the documents from a DataFrame via ``Utils.convert_df_array``."""
        if considerTags is None:
            considerTags = {}
        movies_data = Utils.convert_df_array(movies_df,considerTags)
        return cls(movies_data,considerTags)

    def __iter__(self):
        """Yield the stored documents in their current order."""
        yield from self.movies

    def to_array(self):
        """Return the underlying list of documents (not a copy)."""
        return self.movies

    def generate_permutations(self):
        """Shuffle the stored documents in place and return ``self``."""
        from random import shuffle
        shuffle(self.movies)
        return self

    def datasetSize(self):
        """Return the number of stored documents."""
        return len(self.movies)

    def split_data(self,test_split = 0.3):
        """Split into ``(train, test)`` containers.

        ``test_split`` is passed to ``train_test_split`` as ``test_size``;
        the split uses ``random_state=42``. Both results carry this
        instance's ``uniqueGenres``.
        """
        from sklearn.model_selection import train_test_split
        movies_train_array, movies_test_array = train_test_split(self.movies, test_size=test_split, random_state=42)
        movies_train = LabeledIMDBData.fromlist(movies_train_array,self.uniqueGenres)
        movies_test = LabeledIMDBData.fromlist(movies_test_array,self.uniqueGenres)
        return movies_train,movies_test

    def getVocabFreq(self):
        """Return a dict counting every occurrence of each word and each tag.

        Words and tags share one namespace: a string used both as a word and
        as a tag is counted under a single key.
        """
        vocab = {}
        for movie_info in self:
            for word in movie_info.words:
                vocab[word] = vocab.get(word, 0) + 1
            for tag in movie_info.tags:
                vocab[tag] = vocab.get(tag, 0) + 1
        return vocab

    def slice_tags(self,tagToSearch):
        """Return the documents whose tags contain ``tagToSearch`` (each once)."""
        return [movie for movie in self if tagToSearch in movie.tags]

    def slice_text(self,wordToSearch):
        """Return the documents whose words contain ``wordToSearch`` (each once)."""
        # Previously read the undefined name `movie_info`, raising NameError.
        return [movie for movie in self if wordToSearch in movie.words]
        

In [63]:
def accuracy_rate_for_model(test_model, train_set, test_set, uniqueGenres):
    """Compute accuracy, precision and recall on the train and test documents.

    For every document (an object with ``.words`` and ``.tags``) a vector is
    inferred with ``test_model.infer_vector`` and the two most similar
    doctags from ``test_model.docvecs.most_similar`` are taken as the
    predicted labels.

    ``uniqueGenres`` is the container of all candidate labels; its length is
    used to derive the number of true negatives per document.

    Returns ``(train_accuracy, train_precision, train_recall,
    test_accuracy, test_precision, test_recall)``. A metric whose
    denominator is zero (e.g. an empty data set) is reported as 0.0.
    """

    def predictLabels(model,data):
        """Return (predicted label lists, actual label lists) for ``data``."""
        labels_pred = []
        labels_actual = []
        for movie in data:
            prediction_vector = model.infer_vector(movie.words)
            sims = model.docvecs.most_similar([prediction_vector],topn=2)
            labels_pred.append([tag for tag, _score in sims])
            labels_actual.append(movie.tags)
        return labels_pred,labels_actual

    def ratio(numerator,denominator):
        """Divide, returning 0.0 when the denominator is zero."""
        return numerator/denominator if denominator else 0.0

    def accuracy(actual_labels,pred_labels):
        """Micro-averaged accuracy, precision and recall over all documents."""
        tp = tn = fp = fn = 0
        total_genres = len(uniqueGenres)
        for actual_label, pred_label in zip(actual_labels,pred_labels):
            for prediction in pred_label:
                if prediction in actual_label:
                    tp += 1
                else:
                    fp += 1

            for actual in actual_label:
                if actual not in pred_label:
                    fn += 1

            # Labels that were neither predicted nor actual are true negatives.
            # Accumulate per document (this used to be overwritten each
            # iteration) and never let a document contribute a negative count.
            mentioned = set(actual_label) | set(pred_label)
            tn += max(total_genres - len(mentioned), 0)

        return (ratio(tp+tn, tp+fp+tn+fn),
                ratio(tp, tp+fp),
                ratio(tp, tp+fn))

    train_label_pred,train_label_actual = predictLabels(test_model,train_set)
    test_label_pred,test_label_actual = predictLabels(test_model,test_set)

    train_accuracy,train_precision,train_recall = accuracy(train_label_actual,train_label_pred)
    test_accuracy,test_precision,test_recall = accuracy(test_label_actual,test_label_pred)

    return train_accuracy,train_precision,train_recall,test_accuracy,test_precision,test_recall

In [64]:
# Build tagged documents (tags restricted to uniqueGenres) and hold out 30% for testing.
movies_data = LabeledIMDBData.fromDataFrame(movies_df_clean,uniqueGenres)
movies_train,movies_test = movies_data.split_data(test_split = 0.3)
print("Train data size ",movies_train.datasetSize(),"\nTest data size ",movies_test.datasetSize())

Train data size  31158 
Test data size  13354


In [None]:
# Create the Doc2Vec model and build its vocabulary from ALL documents
# (train and test), so every word and tag is known before training.
# NOTE(review): `size=` is the older gensim spelling of the vector dimension
# (later releases call it `vector_size`) — confirm against the installed version.
model = Doc2Vec(dm=0, size=100, negative=5, hs=0, min_count=1, workers=1)
model.build_vocab(movies_data.to_array())

In [None]:
# Train for `passes` single-epoch passes with a manually decayed learning rate,
# recording train/test metrics after every pass.
alpha, min_alpha, passes = (0.025, 0.001, 200)
alpha_delta = (alpha - min_alpha) / passes

# Index 0 holds the train series, index 1 the test series.
accuracy =[[],[]]
precision = [[],[]]
recall = [[],[]]

for epoch in range(passes):
    print("Now at epoch number ",(epoch+1)," out of total number of epoch ",passes)

    # Pin both rates to the current alpha so the rate is constant within this pass.
    model.alpha, model.min_alpha = alpha, alpha
    model.train(movies_train.generate_permutations(), total_examples=movies_train.datasetSize(), epochs=1)
    train_accuracy,train_precision,train_recall,test_accuracy,test_precision,test_recall = accuracy_rate_for_model(model,movies_train,movies_test,uniqueGenres)
    accuracy[0].append(train_accuracy)
    accuracy[1].append(test_accuracy)
    precision[0].append(train_precision)
    precision[1].append(test_precision)
    recall[0].append(train_recall)
    recall[1].append(test_recall)
    # Decay the loop variable. Decrementing model.alpha had no effect because
    # it is reset from `alpha` at the top of the next pass.
    alpha -= alpha_delta

print("---------------------->Done with training<----------------------")

Now at epoch number  1  out of total number of epoch  200
Now at epoch number  2  out of total number of epoch  200
Now at epoch number  3  out of total number of epoch  200
Now at epoch number  4  out of total number of epoch  200
Now at epoch number  5  out of total number of epoch  200
Now at epoch number  6  out of total number of epoch  200


In [None]:
def plot_array(data_array,colors = ('r','b'),labels = ('train','test'),titles=('accuracy','precision','recall')):
    """Plot one panel per metric, with one line per series in that metric.

    Parameters
    ----------
    data_array : sequence of metrics; each metric is a sequence of series
        (e.g. ``[train_values, test_values]``).
    colors, labels : per-series line colour and legend label, indexed by the
        series position inside a metric.
    titles : per-panel title, indexed by the metric position.

    Panels are laid out in a single row. Nothing is drawn for an empty
    ``data_array``.
    """
    if len(data_array) == 0:
        return

    # squeeze=False keeps `axis` two-dimensional even for a single panel.
    fig, axis = plt.subplots(1,len(data_array), figsize = (15, 3), squeeze=False)
    for x_idx, series_group in enumerate(data_array):
        ax = axis[0][x_idx]
        for y_idx, series in enumerate(series_group):
            ax.plot(series,color = colors[y_idx],label = labels[y_idx])

        ax.set_title(titles[x_idx])
        ax.set_ylim([0,1.05])
        if len(series_group) > 0:
            # Wide enough for the longest series in this panel.
            longest = max(len(series) for series in series_group)
            ax.set_xlim([0,longest+0.05])
            ax.legend(bbox_to_anchor=(0.3, 0.95), loc=1, borderaxespad=0.)
    return

# Train-vs-test learning curves for the three metrics collected during training.
plot_array([accuracy,precision,recall],titles=['accuracy','precision','recall'])

In [None]:
# Persist the trained model, both in gensim's native format and in word2vec format.
# Make sure the target directory exists so saving works on a fresh checkout.
os.makedirs('./models', exist_ok=True)
model.save('./models/imdb.d2v')
model.save_word2vec_format('./models/word2vecformat.nn')

In [None]:
# Tokens of the first held-out document, used as a spot check below.
test_overviews = movies_test.to_array()[0].words
test_overviews

In [None]:
# Infer a vector for the sample document and list its three closest doctags.
new_vector = model.infer_vector(test_overviews)
sims = model.docvecs.most_similar([new_vector],topn=3)
sims

In [None]:
# Show the most, median and least similar doctags for one target doctag.
# NOTE: Utils.convert_df_array currently tags documents with genre names only
# (the movie-id tag is commented out there), so '862' is a valid doctag only
# if id tagging is re-enabled.
doc_id = str(862)
sims = model.docvecs.most_similar(doc_id, topn=model.docvecs.count)  # get *all* similar documents
print(u'TARGET (%s): «%s»\n' % (doc_id, ' '.join(movies_data.slice_tags(doc_id)[0].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    # Look up the words of the *similar* doctag; this used to print the
    # target document's words for every entry.
    similar_tag = sims[index][0]
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(movies_data.slice_tags(similar_tag)[0].words)))

In [None]:
# Doctags most similar to the 'animation' tag (up to 20).
doc_id = ('animation')
sims = model.docvecs.most_similar(doc_id, topn=20)
sims

In [None]:
# Doctags most similar to the 'comedy' tag (up to 20).
doc_id = ('comedy')
sims = model.docvecs.most_similar(doc_id, topn=20)
sims

In [None]:
# Words closest to the combination of 'wedding' and 'opens'. Word-level queries
# go through the model's word vectors (`wv`); calling most_similar on the model
# itself is the deprecated form.
# NOTE(review): the model is built with dm=0 and no dbow_words option, so the
# word vectors may not have been trained — confirm before interpreting this.
model.wv.most_similar(['wedding','opens'])

In [None]:
# Similarity between the 'romance' and 'animation' tag vectors.
model.docvecs.similarity('romance','animation')

In [None]:
# Similarity between the 'drama' and 'family' tag vectors.
model.docvecs.similarity('drama','family')

In [None]:
# Every distinct genre name that occurs anywhere in the catalogue, as a list.
genres = list({
    genre_name
    for genre_field in movies_df_clean['genres']
    for genre_name in Utils.getGenres(genre_field)
})

In [None]:
from itertools import permutations

# Pairwise similarity between the tag vectors of all whitelisted genres,
# arranged as a square genre-by-genre DataFrame (0 on the diagonal / gaps).
genreSimDict = {}
for genreSublist in permutations(genres,2):
    genreOne, genreTwo = genreSublist
    if genreOne not in uniqueGenres or genreTwo not in uniqueGenres:
        continue
    try:
        similarity_score = model.docvecs.similarity(genreOne,genreTwo)
    except KeyError:
        # Tag unknown to the model: skip the pair. The old bare `except: pass`
        # fell through and stored an undefined or stale score instead.
        continue
    genreSimDict[genreSublist] = similarity_score

ser = pd.Series(list(genreSimDict.values()),
                  index=pd.MultiIndex.from_tuples(list(genreSimDict.keys())))
df = ser.unstack().fillna(0)
df.shape

In [None]:
# Heatmap of the genre-by-genre similarity matrix; the trailing ';' suppresses
# the textual repr of the returned Axes.
import seaborn as sns
sns.heatmap(df);

In [None]:
# from gensim import corpora
# from gensim.models.ldamodel import LdaModel

# sentenceTokenizer = PreprocessIMDBData()
# movies = movies_df_clean['overview_cleaned'].values
# movie_tokenized = [sentenceTokenizer.tokenizeSentence(movie) for movie in movies]
# dictionary = corpora.Dictionary(movie_tokenized)
# doc_term_matrix = [dictionary.doc2bow(movie) for movie in movie_tokenized]

In [None]:
# ldamodel = LdaModel(doc_term_matrix,num_topics=10,id2word=dictionary,passes= 20,alpha=0.3,eta=0.7)

In [None]:
# print(ldamodel.print_topics(num_topics=10,num_words=3))