In [120]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

# Content Filtering Recommendation System
A simple content filtering recommendation system to better understand the concepts. \
Source: https://github.com/BadreeshShetty/Learnings-to-make-Recommedations 

In [184]:
df_mov = pd.read_csv('simple_rec_sys_data/movies.csv', index_col=0)

In [185]:
df_mov.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji,Adventure|Children|Fantasy
3,Grumpier Old Men,Comedy|Romance
4,Waiting to Exhale,Comedy|Drama|Romance
5,Father of the Bride Part II,Comedy


In [186]:
df_mov.shape

(9742, 2)

In [187]:
df_mov.dtypes

title     object
genres    object
dtype: object

## Correct Data Structure
- Correct genres string structure
- Determine null values
- Determine if duplicates exist

Split the genres string into a list

In [188]:
df_mov['genres'] = df_mov['genres'].str.split('|')

In [189]:
df_mov['genres'] = df_mov['genres'].fillna("").astype('str')

In [190]:
df_mov.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy..."
2,Jumanji,"['Adventure', 'Children', 'Fantasy']"
3,Grumpier Old Men,"['Comedy', 'Romance']"
4,Waiting to Exhale,"['Comedy', 'Drama', 'Romance']"
5,Father of the Bride Part II,['Comedy']


In [191]:
df_mov['title'] = df_mov['title'].apply(str)

In [192]:
df_mov.dtypes

title     object
genres    object
dtype: object

Remove any null or empty values. 

In [193]:
df_mov.isna().any()

title     False
genres    False
dtype: bool

Within the dataframe there are no null values

Remove any duplicates

In [194]:
duplicates = df_mov[df_mov.duplicated(subset=['title'], keep='first')]

In [195]:
duplicates.shape

(286, 2)

In [196]:
duplicates.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
915,Sabrina,"['Comedy', 'Romance']"
1344,Cape Fear,"['Crime', 'Drama', 'Thriller']"
1460,That Darn Cat,"['Children', 'Comedy', 'Mystery']"
1873,Misrables Les,"['Crime', 'Drama', 'Romance', 'War']"
1941,Hamlet,['Drama']


In [197]:
df_mov[df_mov['title'] == 'Sabrina']

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
7,Sabrina,"['Comedy', 'Romance']"
915,Sabrina,"['Comedy', 'Romance']"


Remove the duplicates

In [198]:
df_mov.drop_duplicates(subset=['title'], keep='first', inplace=True)

In [199]:
df_mov.shape

(9456, 2)

Re-index the dataframe

In [200]:
df_mov = df_mov.reset_index()

In [201]:
df_mov.tail()

Unnamed: 0,movieId,title,genres
9451,193581,Black Butler Book of the Atlantic,"['Action', 'Animation', 'Comedy', 'Fantasy']"
9452,193583,No Game No Life Zero,"['Animation', 'Comedy', 'Fantasy']"
9453,193585,Flint,['Drama']
9454,193587,Bungo Stray Dogs Dead Apple,"['Action', 'Animation']"
9455,193609,Andrew Dice Clay Dice Rules,['Comedy']


## Text to vector form based on a Tfid Transformer

Commonly used tool in NLP feature extraction and text mining. 
It stands fro Term Frequency-Inverse Document Frequency Vectorizer. 
Converts a collection of raw text into a matrix of TF-IDF features. A numerical representation suitable for machine learning algs. 

tf(t) = Number of times term t appears in a document/ Total number of terms in the document \
idf(t) = log(Total number of documents)/ Number of documents with term t in it \
`Tf-idf = tf * idf`

In [202]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0.0, stop_words='english')

In [203]:
tfidf_matrix = tf.fit_transform(df_mov['genres'])

In [204]:
len(tf.vocabulary_)

186

In [205]:
tfidf_matrix.shape

(9456, 186)

## Similarity Measurement
This is the measure of how close the individual words in the vocabulary are to one another based on their vector form. 
Testing both cosine similarity and euclidian distance

In [206]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [207]:
cosine_sim[:4, :4]

array([[1.        , 0.3147038 , 0.0607625 , 0.0523509 ],
       [0.3147038 , 1.        , 0.        , 0.        ],
       [0.0607625 , 0.        , 1.        , 0.35194245],
       [0.0523509 , 0.        , 0.35194245, 1.        ]])

In [208]:
eu_sim = euclidean_distances(tfidf_matrix, tfidf_matrix)

In [209]:
eu_sim[:4, :4]

array([[0.        , 1.17072302, 1.37057469, 1.3766983 ],
       [1.17072302, 0.        , 1.41421356, 1.41421356],
       [1.37057469, 1.41421356, 0.        , 1.13847051],
       [1.3766983 , 1.41421356, 1.13847051, 0.        ]])

## Recommendation System
Based on a given movie title, recommend the most similar movies based on the genres. 

In [210]:
movie_titles = pd.Series(df_mov['title'].values, index=df_mov.index)

In [211]:
movie_titles.head()

0                      Toy Story
1                        Jumanji
2               Grumpier Old Men
3              Waiting to Exhale
4    Father of the Bride Part II
dtype: object

In [212]:
indx = movie_titles[movie_titles == 'Dark Knight'].index[0]

In [213]:
indx

6554

In [214]:
test = list(enumerate(cosine_sim[1]))

In [215]:
def genre_recommendations(similarity_matrix, title, movie_titles):
    indx = movie_titles[movie_titles == title].index[0]  # Retrieve the movie titles index
    ind_score_pair = list(enumerate(similarity_matrix[indx]))
    sorted_ind_score_pair = sorted(ind_score_pair, key=lambda x: x[1], reverse=True)
    top_ten = sorted_ind_score_pair[1:10]
    top_ten_indices = [i[0] for i in top_ten]
    return movie_titles[top_ten_indices]

In [216]:
genre_recommendations(cosine_sim, 'Dark Knight', movie_titles)

8157                     Need for Speed
7931    Grandmaster The Yi dai zong shi
123                           Apollo 13
7813                         Life of Pi
8166                               Noah
38                      Dead Presidents
341                         Bad Company
347           Faster Pussycat Kill Kill
430                   Menace II Society
dtype: object