# Recommendation system

## Imports and db connection

In [1]:
import sqlite3
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import linear_kernel
import pandas as pd
import numpy as np
#import streamlit as st
import time
import matplotlib.pyplot as plt

In [2]:
connexion = sqlite3.connect("../database/imdb/imdb.db")
cursor = connexion.cursor()

In [3]:
# creating dataframe for movie details table
cursor.execute("""
    SELECT tb.tconst, tb.primaryTitle, tb.genres, tc.directors, tr.averageRating, tr.numVotes
    FROM title_basics tb, title_crew tc, title_ratings tr 
    ON tb.tconst = tc.tconst AND tb.tconst = tr.tconst
    where tb.isAdult = 0 and tb.titleType = 'movie'
""")
data = cursor.fetchall()
col = [description[0] for description in cursor.description]

df_movies = pd.DataFrame.from_records(data=data, columns=col)

In [4]:
# creating dataframe for actors details table
cursor.execute("""
    SELECT tp.tconst, tp.nconst
    FROM title_principals tp
    WHERE tp.category = 'actor' or tp.category = 'actress'
""")
data = cursor.fetchall()
col = [description[0] for description in cursor.description]

df_actors = pd.DataFrame.from_records(data=data, columns=col)

In [5]:
print(f"movie df shape is : {df_movies.shape}")
print(f"actors df shape is : {df_actors.shape}")

movie df shape is : (219235, 6)
actors df shape is : (20275166, 2)


In [6]:
df_actors.head()

Unnamed: 0,tconst,nconst
0,tt0000005,nm0443482
1,tt0000005,nm0653042
2,tt0000007,nm0179163
3,tt0000007,nm0183947
4,tt0000008,nm0653028


In [7]:
# merge actors with same movie id in same column
groupby = df_actors.groupby('tconst')['nconst'].apply(','.join).reset_index()
groupby.head()

Unnamed: 0,tconst,nconst
0,tt0000005,"nm0443482,nm0653042"
1,tt0000007,"nm0179163,nm0183947"
2,tt0000008,nm0653028
3,tt0000009,"nm0063086,nm0183823,nm1309758"
4,tt0000011,nm3692297


In [8]:
df = pd.merge(df_movies, groupby, how='left', on='tconst')
df.shape

(219235, 7)

In [9]:
df = df.drop('tconst', axis=1)
df = df.rename(columns={"nconst": "actors"})

In [10]:
df.head()

Unnamed: 0,primaryTitle,genres,directors,averageRating,numVotes,actors
0,Miss Jerry,Romance,nm0085156,5.3,200,"nm0063086,nm0183823,nm1309758"
1,Bohemios,\N,nm0063413,4.2,14,"nm0215752,nm0252720"
2,The Story of the Kelly Gang,"Action,Adventure,Biography",nm0846879,6.0,797,"nm0846887,nm0846894,nm1431224,nm3002376"
3,The Prodigal Son,Drama,nm0141150,5.1,20,"nm0906197,nm0332182,nm1323543,nm1759558"
4,Robbery Under Arms,Drama,nm0533958,4.3,23,"nm3071427,nm0581353,nm0888988,nm0240418,nm0346..."


In [11]:
df.isna().sum()

primaryTitle         0
genres               0
directors            0
averageRating        0
numVotes             0
actors           30587
dtype: int64

## Exploratory analysis

In [12]:
df.dtypes

primaryTitle     object
genres           object
directors        object
averageRating    object
numVotes         object
actors           object
dtype: object

In [13]:
# convert data types
df['averageRating'] = df['averageRating'].astype(float)
df['numVotes'] = df['numVotes'].astype(int)

In [14]:
# apply filters to reduce dataset
df_above5 = df[df['averageRating'] >= 5.0]
df_above5

Unnamed: 0,primaryTitle,genres,directors,averageRating,numVotes,actors
0,Miss Jerry,Romance,nm0085156,5.3,200,"nm0063086,nm0183823,nm1309758"
2,The Story of the Kelly Gang,"Action,Adventure,Biography",nm0846879,6.0,797,"nm0846887,nm0846894,nm1431224,nm3002376"
3,The Prodigal Son,Drama,nm0141150,5.1,20,"nm0906197,nm0332182,nm1323543,nm1759558"
7,The Fairylogue and Radio-Plays,"Adventure,Fantasy","nm0091767,nm0877783",5.2,66,"nm0000875,nm0122665,nm0933446,nm2924919"
17,Amor gitano,\N,nm0159015,5.1,16,
...,...,...,...,...,...,...
219230,The Changin' Times of Ike White,"Documentary,Music",nm1427130,6.7,216,
219231,A Classic Tour of Scotland: Footloose Special,Documentary,nm1645815,7.0,6,nm1644256
219232,Manoharam,"Comedy,Drama",nm7011994,6.8,831,"nm2068971,nm10375007,nm7243877,nm1428724"
219233,Footloose in the Cotswolds: Part 1,Documentary,nm1645815,7.0,10,nm1644256


In [15]:
# apply filters to reduce dataset
df_clean = df_above5[df_above5['numVotes'] >= 300]
df_clean

Unnamed: 0,primaryTitle,genres,directors,averageRating,numVotes,actors
2,The Story of the Kelly Gang,"Action,Adventure,Biography",nm0846879,6.0,797,"nm0846887,nm0846894,nm1431224,nm3002376"
51,Cleopatra,"Drama,History",nm0309130,5.1,536,"nm0906610,nm0306947,nm0801774,nm0276160,nm0733..."
52,Dante's Inferno,"Adventure,Drama,Fantasy","nm0078205,nm0655824,nm0209738",7.0,2988,"nm0660139,nm0685283,nm0209738,nm3942815"
55,From the Manger to the Cross,"Biography,Drama",nm0646058,5.8,596,"nm0446092,nm0087381,nm0245769,nm0310155,nm0391..."
61,Passion,"Biography,Drama,Romance",nm0523932,6.6,911,"nm0624470,nm0417837,nm0509573,nm0903235"
...,...,...,...,...,...,...
219222,Ott Tänak: The Movie,"Documentary,Sport",nm4942142,8.1,488,
219223,Ottam,Drama,nm10533890,6.6,566,"nm10533891,nm10379526,nm8166767,nm4425773"
219224,Pengalila,Drama,nm0151535,8.0,678,"nm0482309,nm7785804,nm4710376,nm1230844"
219232,Manoharam,"Comedy,Drama",nm7011994,6.8,831,"nm2068971,nm10375007,nm7243877,nm1428724"


In [16]:
# convert data types
df_clean['genres'] = df_clean['genres'].astype("string")
df_clean['directors'] = df_clean['directors'].astype("string")
df_clean['actors'] = df_clean['actors'].astype("string")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['genres'] = df_clean['genres'].astype("string")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['directors'] = df_clean['directors'].astype("string")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['actors'] = df_clean['actors'].astype("string")


In [17]:
df_clean.dtypes

primaryTitle      object
genres            string
directors         string
averageRating    float64
numVotes           int64
actors            string
dtype: object

In [18]:
df_clean.shape

(44112, 6)

## Data processing

In [19]:
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [20]:
# Apply clean_data function to your features.
features = ['directors', 'genres', 'actors']

for feature in features:
    df_clean[feature] = df_clean[feature].apply(clean_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean[feature] = df_clean[feature].apply(clean_data)


In [21]:
def create_soup(x):
    return ' '.join(x['directors']) + ' ,' + ' '.join(x['genres']) + ' ,' + ' '.join(x['actors'])

In [22]:
# Create a new soup column representing the features
df_clean['soup'] = df_clean.apply(create_soup, axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['soup'] = df_clean.apply(create_soup, axis=1)


In [23]:
df_clean['soup'].head()
df_clean['soup']

2         n m 0 8 4 6 8 7 9 ,a c t i o n , a d v e n t u...
51        n m 0 3 0 9 1 3 0 ,d r a m a , h i s t o r y ,...
52        n m 0 0 7 8 2 0 5 , n m 0 6 5 5 8 2 4 , n m 0 ...
55        n m 0 6 4 6 0 5 8 ,b i o g r a p h y , d r a m...
61        n m 0 5 2 3 9 3 2 ,b i o g r a p h y , d r a m...
                                ...                        
219222    n m 4 9 4 2 1 4 2 ,d o c u m e n t a r y , s p...
219223    n m 1 0 5 3 3 8 9 0 ,d r a m a ,n m 1 0 5 3 3 ...
219224    n m 0 1 5 1 5 3 5 ,d r a m a ,n m 0 4 8 2 3 0 ...
219232    n m 7 0 1 1 9 9 4 ,c o m e d y , d r a m a ,n ...
219234    n m 1 1 9 3 3 4 6 ,f a n t a s y , h o r r o r...
Name: soup, Length: 44112, dtype: object

In [24]:
# clean the soup column
features = ['soup']

for feature in features:
    df_clean[feature] = df_clean[feature].apply(clean_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean[feature] = df_clean[feature].apply(clean_data)


In [25]:
print(df_clean['soup'])

2         nm0846879,action,adventure,biography,nm0846887...
51        nm0309130,drama,history,nm0906610,nm0306947,nm...
52        nm0078205,nm0655824,nm0209738,adventure,drama,...
55        nm0646058,biography,drama,nm0446092,nm0087381,...
61        nm0523932,biography,drama,romance,nm0624470,nm...
                                ...                        
219222                         nm4942142,documentary,sport,
219223    nm10533890,drama,nm10533891,nm10379526,nm81667...
219224    nm0151535,drama,nm0482309,nm7785804,nm4710376,...
219232    nm7011994,comedy,drama,nm2068971,nm10375007,nm...
219234    nm1193346,fantasy,horror,mystery,nm2933542,nm0...
Name: soup, Length: 44112, dtype: object


In [26]:
# Function to convert all strings to lower case and strip names of spaces
df_clean['soup'] = df_clean['soup'].replace(',', ' ', regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['soup'] = df_clean['soup'].replace(',', ' ', regex=True)


In [27]:
df_clean.shape

(44112, 7)

In [28]:
# convert data types
df_clean['primaryTitle'] = df_clean['primaryTitle'].astype("string")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['primaryTitle'] = df_clean['primaryTitle'].astype("string")


In [29]:
df_clean = df_clean.reset_index()

In [30]:
df_clean.dtypes

index              int64
primaryTitle      string
genres            object
directors         object
averageRating    float64
numVotes           int64
actors            object
soup              object
dtype: object

## Create 1 vector for each movie and calculate their similarities

In [31]:
# Import CountVectorizer and one vector per movie for the soup column
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df_clean['soup'])

In [32]:
type(count_matrix)

scipy.sparse._csr.csr_matrix

In [33]:
count_matrix[0]

<1x89963 sparse matrix of type '<class 'numpy.int64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [34]:
count_matrix.shape

(44112, 89963)

In [35]:
nbr_movies = count_matrix.shape[0]
nbr_movies

44112

In [36]:
count_matrix[0].dot(count_matrix.T)

<1x44112 sparse matrix of type '<class 'numpy.int64'>'
	with 10842 stored elements in Compressed Sparse Row format>

In [37]:
#test_array = np.empty([nbr_movies, nbr_movies])
#
#for vector in count_matrix:
#    result_vector = vector.dot(count_matrix.T).toarray()
#    np.concatenate((test_array, result_vector), axis=0)

In [38]:
#len(test_list)

In [39]:
#vector1 = test_list[0]
#vector1

In [40]:
#test_list[0][0]

In [41]:
#test_array = np.array(test_list)
#test_array

In [42]:
#test_array

In [43]:
#squeezed_array = np.squeeze(test_array)

In [44]:
#squeezed_array

In [45]:
#Compute the Cosine Similarity matrix for every vector
#similarities_array = np.empty([nbr_movies, nbr_movies])
#
#for vector in count_matrix:
#    result_vector = vector.dot(count_matrix.T).toarray()
#    np.concatenate((similarities_array, result_vector), axis=0)

In [46]:
# Compute the Cosine Similarity matrix for every vector
#similarities_list = []
#for vector in count_matrix:
#    similarities_list.append(vector.dot(count_matrix.T).todense()[0])
    #.argsort())    

In [47]:
#similarities_list[0:3]

In [48]:
#np.squeeze(np.array(similarities_list[0]))

In [49]:
#similarities_array = np.array(similarities_list[0])
#similarities = np.squeeze(similarities_array)
#similarities

In [50]:
#print(type(similarities))
#print(similarities.shape)
#print(similarities)

In [51]:
# Function that takes in a list of movie titles as input and outputs 10 most similar movies
#def get_recommendations_list(liked_movies_list, cosine_sim=similarities_list):
#    # Get the index of the movie that matches the title
#    
#    movie_id_list=[]
#    for movie_title in liked_movies_list:
#        
#        movie_id = df_clean.index[df_clean['primaryTitle'] == movie_title]
#        movie_id_list.append(movie_id[0])
#
#    df_sim_list=pd.DataFrame()
#    print(movie_id_list)
#    for movie_id in movie_id_list:
#        sim_scores = np.squeeze(np.array(cosine_sim[movie_id]))
#        df_sim=pd.DataFrame(sim_scores.reshape(-1),columns=[movie_id])
#        df_sim_list[movie_id]=df_sim
#
#    df_sim_list['average']=df_sim_list.mean(numeric_only=True, axis=1)
#
#    rslt_df = df_sim_list.sort_values(by='average',ascending=False)
#
#    # print(rslt_df[0:11])
#    rec_list=rslt_df.reset_index()
#    list1=[]
#    list1=rec_list['index'][len(liked_movies_list):(len(liked_movies_list)+10)].values
#
#    # Return the top 10 most similar movies
#    return df_clean[['primaryTitle', 'genres']].iloc[list1]


In [52]:
# Function that takes in a list of movie titles as input and outputs most similar movies ON THE FLY
def get_recommendations_list(liked_movies_list, similar_movies=10, vectorized_count_matrix=count_matrix):
    # Get the index of the movies that matches the title
    movie_id_list=[]
    for movie_title in liked_movies_list:
        movie_id = df_clean.index[df_clean['primaryTitle'] == movie_title]
        movie_id_list.append(movie_id[0])

    df_sim_list=pd.DataFrame()
    print(movie_id_list)
    for movie_id in movie_id_list:
        vector = vectorized_count_matrix[movie_id]
        simil = vector.dot(count_matrix.T).todense()[0]
        sim_scores = np.squeeze(np.array(simil))
        df_sim=pd.DataFrame(sim_scores.reshape(-1),columns=[movie_id])
        df_sim_list[movie_id]=df_sim

    df_sim_list['average']=df_sim_list.mean(numeric_only=True, axis=1)

    rslt_df = df_sim_list.sort_values(by='average',ascending=False)

    # print(rslt_df[0:11])
    rec_list=rslt_df.reset_index()
    list1=[]
    list1=rec_list['index'][len(liked_movies_list):(len(liked_movies_list)+similar_movies)].values

    # Return the top 10 most similar movies
    return df_clean[['primaryTitle', 'genres']].iloc[list1]


In [61]:
liked_movies_list=['The Little Mermaid','The Lion King', 'Pocahontas']

recommendations = get_recommendations_list(liked_movies_list)
print(recommendations)

[2747, 9449, 10039]
                            primaryTitle                     genres
42371  Apollo 10½: A Space Age Childhood  adventure,animation,drama
28084                              Belle  adventure,animation,drama
32583                 Big Fish & Begonia  adventure,animation,drama
7072                             The BFG  adventure,animation,drama
26069                     A Whisker Away  adventure,animation,drama
29959                      Drifting Home  adventure,animation,drama
4641                    The Last Unicorn  adventure,animation,drama
4266               The Fox and the Hound  adventure,animation,drama
31725                  The Little Prince  adventure,animation,drama
41348             The Summit of the Gods  adventure,animation,drama
