### Dataset
The dataset here is the updated metadata data, containing 415 movie.

#### Goal 
Creating a content based and hybrid recommender system based on the "overview" feature of the data.

In [3]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
import nltk
import seaborn as sns

from surprise import Dataset
from surprise import Reader
from surprise import KNNWithMeans
from surprise.model_selection import train_test_split
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.corpus import stopwords
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity

import gensim 
from gensim.models import Word2Vec 
#from gensim.test.utils import common_texts, get_tmpfile


from six.moves import cPickle as pickle


###  1 Importing data and preparing data for learning a word embedding model

In [2]:
df_m=pd.read_csv('movies_metadata_updated.csv')

In [4]:
df_m.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2274 entries, 0 to 2273
Data columns (total 25 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             2274 non-null   int64  
 1   adult                  2274 non-null   bool   
 2   belongs_to_collection  771 non-null    object 
 3   budget                 2274 non-null   int64  
 4   genres                 2274 non-null   object 
 5   homepage               1187 non-null   object 
 6   id                     2274 non-null   float64
 7   imdb_id                2274 non-null   object 
 8   original_language      2274 non-null   object 
 9   original_title         2274 non-null   object 
 10  overview               2274 non-null   object 
 11  popularity             2274 non-null   float64
 12  poster_path            2274 non-null   object 
 13  production_companies   2274 non-null   object 
 14  production_countries   2274 non-null   object 
 15  rele

#### 1.1 Restricting the dataframe to only id, title and overview columns

In [5]:
df_m_r=df_m[["id","title" ,"overview"]].dropna()



In [6]:
df_m_r=df_m_r.drop_duplicates()


In [7]:
df_m_r.shape

(2274, 3)

#### 1.2 Removing ASCII characters, converting lower case, removing stop words, html and punctuation from overview

In [8]:
df_m_r['overview']=df_m_r['overview'].astype('str')

In [9]:
#Utitlity functions for removing ASCII characters, converting lower case, removing stop words, html and punctuation from description

def _removeNonAscii(s):
    return "".join(i for i in s if  ord(i)<128)


def make_lower_case(text):
    return text.lower()

def remove_stop_words(text):
    text = text.split()
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops]
    text = " ".join(text)
    return text

def remove_html(text):
    html_pattern = re.compile('<.*?>')
    return html_pattern.sub(r'', text)

def remove_punctuation(text):
    tokenizer = RegexpTokenizer(r'\w+')
    text = tokenizer.tokenize(text)
    text = " ".join(text)
    return text

df_m_r['overview']=df_m_r['overview'].fillna("")
df_m_r['overview_n'] = df_m_r['overview'].apply(_removeNonAscii)
df_m_r['overview_n'] = df_m_r.overview_n.apply(make_lower_case)
df_m_r['overview_n'] = df_m_r.overview_n.apply(remove_stop_words)
df_m_r['overview_n'] = df_m_r.overview_n.apply(remove_punctuation)
df_m_r['overview_n'] = df_m_r.overview_n.apply(remove_html)



In [10]:
df_m_r.head()

Unnamed: 0,id,title,overview,overview_n
0,862.0,Toy Story,"Led by Woody, Andy's toys live happily in his ...",led woody andy s toys live happily room andy s...
1,8844.0,Jumanji,When siblings Judy and Peter discover an encha...,siblings judy peter discover enchanted board g...
2,949.0,Heat,"Obsessive master thief, Neil McCauley leads a ...",obsessive master thief neil mccauley leads top...
3,710.0,GoldenEye,James Bond must unmask the mysterious head of ...,james bond must unmask mysterious head janus s...
4,524.0,Casino,The life of the gambling paradise – Las Vegas ...,life gambling paradise las vegas dark mafia un...


In [11]:
df1=df_m_r.reset_index().drop('overview', axis=1)
df1.shape

(2274, 4)

#### 1.3 Saving prepared data

We save prepared data to match with ratings data to use in collaborative model.

In [12]:
df1.to_csv('content_based_data.csv')

### 2 Learn the word embedding using word2vec and improved word2vec

#### 2.1 Finding idf weights of updated overview column

In [13]:
#creating the tfidfVectorizer model
tfidf = TfidfVectorizer(analyzer='word',stop_words='english',ngram_range=(1, 2),min_df =0)
tfidf_matrix = tfidf.fit_transform(df1['overview_n'])
tfidf_feature = tfidf.get_feature_names()

In [14]:
#info about tf_idf matrix shape 
print(len(tfidf_feature))
print(tfidf_matrix.shape)
#print(tfidf_feature[:10])
#tfidf_matrix[0]

68739
(2274, 68739)


In [15]:
#here we list the idf for each element of vocabulary
print(tfidf.idf_)
print(tfidf.idf_.shape)

[7.34344097 7.63112304 8.03658815 ... 8.03658815 8.03658815 8.03658815]
(68739,)


In [16]:
tfidf_list = dict(zip(tfidf.get_feature_names(), list(tfidf.idf_)))


In [None]:
#total_words = tfidf_matrix.sum(axis=0)
#print(total_words.shape)
#print(total_words)

#### 2.2 Learning word2vec model

In [17]:
#creating a list of rows of overview_n column
corpus = []
for words in df1['overview_n']:
    corpus.append(words.split())


In [18]:
#creating word2vec model 
model = Word2Vec(corpus,size = 100, window=5, min_count = 2, workers = -1)
model.save("word2vec.model")


In [19]:
#checking vocabulary
vocabulary = model.wv.vocab
#model.wv['put'] return a 100 vec

In [20]:
# to get all the vocabularies in the model:
vocabulary.keys()
#to find the embedding vector for a word 
model.wv['dinosaurs'].shape


(100,)

#### 2.3 Finding the word2vec embedding vector for each text in the corpus (word_embeddings)

In [21]:
def vectors():
    
    # Creating a list for storing the vectors (description into vectors)
    global word_embeddings
    word_embeddings = []
    for line in df1['overview_n']:
        avgword2vec = None
        count = 0
        for word in line.split():      
            if word in model.wv.vocab:
                count += 1
                if avgword2vec is None:
                    avgword2vec = model[word]
                else:
                    avgword2vec = avgword2vec + model[word]                
        if avgword2vec is not None:
            avgword2vec = avgword2vec / count
            word_embeddings.append(avgword2vec)
        else:
            word_embeddings.append([0]*100)
        
        

In [22]:
vectors()

  avgword2vec = model[word]
  avgword2vec = avgword2vec + model[word]


#### 2.4 Finding the tf-idf weighted word2vec embedding vector for each text in the corpus (tfdif_vectors)

In [24]:
# Building TF-IDF Word2Vec Model
def tfdifvectors():
    # Storing the TFIDF Word2Vec embeddings
    global tfidf_vectors
    tfidf_vectors = []; 
    line = 0;
    # for each book description
    for desc in corpus: 
        # Word vectors are of zero length (Used 100 dimensions)
        sent_vec = np.zeros(100) 
        # num of words with a valid vector in the movie overview
        weight_sum =0 
        # for each word in the movieoverview
        for word in desc: 
            if word in model.wv.vocab and word in tfidf_feature:
                vec = model.wv[word]
                tf_idf = tfidf_list[word] * (desc.count(word) / len(desc))
                sent_vec += (vec * tf_idf)
                weight_sum += tf_idf
        if weight_sum != 0:
            sent_vec /= weight_sum
            tfidf_vectors.append(sent_vec)
        else: 
            tfidf_vectors.append(sent_vec)
        line += 1


In [25]:
tfdifvectors()

### 3 Defining a content based recommender 

In [26]:
def recommendations(title,cosine_similarities):
    indices = pd.Series(df1.index, index = df1['title']).drop_duplicates()
    idx = indices[title]
    sim_scores = list(enumerate(cosine_similarities[idx]))
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)
    sim_scores = sim_scores[1:6]
    movie_indices = [i[0] for i in sim_scores]
    recommend = df1['title'].iloc[movie_indices]
    return recommend
    

#### 3.1 Recommending 5 movies similar to "GoldenEye" movie using word embedding 

In [27]:
# here we find the cosine similariy matrix of word_embeddings,a matrix of shape (13715, 13715)

cosine_similarities_we =cosine_similarity(word_embeddings, word_embeddings)

In [28]:
title=df1['title'][0]
print(title)
print(recommendations(title,cosine_similarities_we))

Toy Story
460                                  Toy Story 2
623     Harry Potter and the Philosopher's Stone
1936                                       Selma
755                                          Elf
1358                                 Toy Story 3
Name: title, dtype: object


#### 3.2 Recommending 5 movies similar to "GoldenEye" movie using tfdifvector

In [29]:
cosine_similarities_tf=cosine_similarity( tfidf_vectors,  tfidf_vectors)

In [30]:
print(recommendations(title,cosine_similarities_tf))

460                Toy Story 2
1358               Toy Story 3
913     The 40 Year Old Virgin
716        The Matrix Reloaded
1936                     Selma
Name: title, dtype: object


### 4 Defining a hybrid recommender

Here we define a hybrid recommender using learned svd model and knn model in collborative file.

In [31]:
def recommender_hybrid(title,cosine_similarities,model,user_Id,df_meta):
    indices = pd.Series(df_meta.index, index = df_meta['title']).drop_duplicates()
    idx = indices[title]
    sim_scores = list(enumerate(cosine_similarities[idx]))
    sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)
    sim_scores=sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    data=df_meta[df_meta['id'].isin(movie_indices)]
    data['prediction']=data['id'].apply(lambda x: model.predict(uid=user_Id,iid=x).est)
    data= data.sort_values('prediction', ascending=False)
    recommend = data.iloc[:5]['title']
    return recommend

In [32]:
svd_model = pickle.load(open('recommender-svd', 'rb'))

In [33]:
recommender_hybrid(title,cosine_similarities_we,svd_model,user_Id=24,df_meta=df1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['prediction']=data['id'].apply(lambda x: model.predict(uid=user_Id,iid=x).est)


1002                   Crank
15       From Dusk Till Dawn
259     Seven Years in Tibet
143      A Fish Called Wanda
614         Mulholland Drive
Name: title, dtype: object

In [34]:
knn_model = pickle.load(open('recommender-knn', 'rb'))

In [35]:
recommender_hybrid(title,cosine_similarities_we,knn_model,user_Id=24,df_meta=df1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['prediction']=data['id'].apply(lambda x: model.predict(uid=user_Id,iid=x).est)


259     Seven Years in Tibet
143      A Fish Called Wanda
1002                   Crank
15       From Dusk Till Dawn
614         Mulholland Drive
Name: title, dtype: object