# Recommendation system

## Imports and db connection

In [1]:
import sqlite3
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
#import streamlit as st
import time
import matplotlib.pyplot as plt

In [2]:
connexion = sqlite3.connect("database/imdb/imdb.db")
cursor = connexion.cursor()

In [3]:
# creating dataframe for movie details table
cursor.execute("""
    SELECT tb.primaryTitle, tb.startYear, tb.genres, tc.directors, tr.averageRating, tr.numVotes
    FROM title_basics tb, title_crew tc, title_ratings tr 
    ON tb.tconst = tc.tconst AND tb.tconst = tr.tconst
""")
data = cursor.fetchall()
col = [description[0] for description in cursor.description]

df = pd.DataFrame.from_records(data=data, columns=col)

In [4]:
df.shape

(1011940, 6)

In [5]:
df.head()

Unnamed: 0,primaryTitle,startYear,genres,directors,averageRating,numVotes
0,Carmencita,1894,"Documentary,Short",nm0005690,5.7,1924
1,Le clown et ses chiens,1892,"Animation,Short",nm0721526,5.8,259
2,Pauvre Pierrot,1892,"Animation,Comedy,Romance",nm0721526,6.5,1737
3,Un bon bock,1892,"Animation,Short",nm0721526,5.6,174
4,Blacksmith Scene,1893,"Comedy,Short",nm0005690,6.2,2550


In [6]:
df['averageRating'] = df['averageRating'].astype(float)
df['numVotes'] = df['numVotes'].astype(int)

In [7]:
df_above5 = df[df['averageRating'] >= 8.0]
df_above5

Unnamed: 0,primaryTitle,startYear,genres,directors,averageRating,numVotes
289,A Trip to the Moon,1902,"Action,Adventure,Comedy",nm0617588,8.2,50411
380,Salaviinanpolttajat,1907,"Comedy,Short","nm0700930,nm0817086",8.7,14
476,"A Visit to the Seaside at Brighton Beach, England",1910,"Documentary,Short",nm0808310,8.3,19
584,The Message,1909,"Drama,Short",nm0000428,8.1,22
660,We Must Do Our Best,1909,"Comedy,Short",nm0111753,8.2,38
...,...,...,...,...,...,...
1011933,Professionalism,2018,"Comedy,Short",\N,8.3,31
1011934,Workplace Safety,2018,"Comedy,Short",\N,8.3,31
1011935,Confidentiality,2018,"Comedy,Short",\N,8.7,32
1011936,Morale,2018,"Comedy,Short",\N,8.2,32


In [8]:
df_clean = df_above5[df_above5['numVotes'] >= 1000]
df_clean

Unnamed: 0,primaryTitle,startYear,genres,directors,averageRating,numVotes
289,A Trip to the Moon,1902,"Action,Adventure,Comedy",nm0617588,8.2,50411
2672,The Cabinet of Dr. Caligari,1920,"Horror,Mystery,Thriller",nm0927468,8.0,64645
2970,One Week,1920,"Comedy,Short","nm0166836,nm0000036",8.1,8274
3179,The Kid,1921,"Comedy,Drama,Family",nm0000122,8.3,126941
3184,The Phantom Carriage,1921,"Drama,Fantasy,Horror",nm0803705,8.0,12789
...,...,...,...,...,...,...
1011676,Vaulter,2019,Drama,nm0661238,8.5,3581
1011677,Hunting,2019,Drama,nm0661238,9.0,4262
1011707,The Winter Line,2020,"Drama,Mystery,Sci-Fi",nm0003557,8.2,9150
1011762,Kaithi,2019,"Action,Adventure,Crime",nm7992231,8.5,32445


In [9]:
df_clean['genres'] = df_clean['genres'].astype("string")
df_clean['directors'] = df_clean['directors'].astype("string")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['genres'] = df_clean['genres'].astype("string")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['directors'] = df_clean['directors'].astype("string")


In [10]:
df_clean.dtypes

primaryTitle      object
startYear         object
genres            string
directors         string
averageRating    float64
numVotes           int64
dtype: object

In [11]:
df_clean.head()

Unnamed: 0,primaryTitle,startYear,genres,directors,averageRating,numVotes
289,A Trip to the Moon,1902,"Action,Adventure,Comedy",nm0617588,8.2,50411
2672,The Cabinet of Dr. Caligari,1920,"Horror,Mystery,Thriller",nm0927468,8.0,64645
2970,One Week,1920,"Comedy,Short","nm0166836,nm0000036",8.1,8274
3179,The Kid,1921,"Comedy,Drama,Family",nm0000122,8.3,126941
3184,The Phantom Carriage,1921,"Drama,Fantasy,Horror",nm0803705,8.0,12789


In [12]:
# Function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''


In [13]:
# Apply clean_data function to your features.
features = ['directors', 'genres']

for feature in features:
    df_clean[feature] = df_clean[feature].apply(clean_data)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean[feature] = df_clean[feature].apply(clean_data)


In [14]:
def create_soup(x):
    return ' '.join(x['directors']) + ' ,' + ' '.join(x['genres'])


In [15]:
# Create a new soup feature
df_clean['soup'] = df_clean.apply(create_soup, axis=1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['soup'] = df_clean.apply(create_soup, axis=1)


In [16]:
df_clean['soup'].head()
print(df_clean['soup'][0])

KeyError: 0

In [17]:
features = ['soup']

for feature in features:
    df_clean[feature] = df_clean[feature].apply(clean_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean[feature] = df_clean[feature].apply(clean_data)


In [18]:
print(df_clean['soup'][0])

KeyError: 0

In [19]:
# Function to convert all strings to lower case and strip names of spaces
df_clean['soup'] = df_clean['soup'].replace(',', ' ', regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['soup'] = df_clean['soup'].replace(',', ' ', regex=True)


In [20]:
df_clean.head()

Unnamed: 0,primaryTitle,startYear,genres,directors,averageRating,numVotes,soup
289,A Trip to the Moon,1902,"action,adventure,comedy",nm0617588,8.2,50411,nm0617588 action adventure comedy
2672,The Cabinet of Dr. Caligari,1920,"horror,mystery,thriller",nm0927468,8.0,64645,nm0927468 horror mystery thriller
2970,One Week,1920,"comedy,short","nm0166836,nm0000036",8.1,8274,nm0166836 nm0000036 comedy short
3179,The Kid,1921,"comedy,drama,family",nm0000122,8.3,126941,nm0000122 comedy drama family
3184,The Phantom Carriage,1921,"drama,fantasy,horror",nm0803705,8.0,12789,nm0803705 drama fantasy horror


In [32]:
df_clean['primaryTitle'] = df_clean['primaryTitle'].astype("string")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['primaryTitle'] = df_clean['primaryTitle'].astype("string")


In [44]:
df_clean = df_clean.reset_index()

In [47]:
df_clean.dtypes

index              int64
primaryTitle      string
startYear         object
genres            object
directors         object
averageRating    float64
numVotes           int64
soup              object
dtype: object

In [21]:
# Import CountVectorizer and create the count matrix
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(df_clean['soup'])

In [22]:
count_matrix.shape

(14891, 10044)

In [None]:
# 

In [54]:
# Compute the Cosine Similarity matrix based on the count_matrix
cosine_sim2 = cosine_similarity(count_matrix[0], count_matrix)


In [56]:
cosine_sim2.shape

(1, 14891)

In [59]:
# Function that takes in movie title as input and outputs most similar movies
def get_recommendations(title, cosine_sim=cosine_sim2):
    # Get the index of the movie that matches the title
    idx = df_clean.index[df_clean['primaryTitle'] == title]
    print(f"1. idx : {idx}")

    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_similarity(count_matrix[idx], count_matrix)))
    print(f"2. sim_scores : {sim_scores}")

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    print(f"3. sorted sim_scores : {sim_scores}")

    # Get the scores of the 10 most similar movies
    sim_scores = sim_scores.iloc[1:11]
    print(f"4. top 10 sim_scores : {sim_scores}")

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    print(f"5. movie_indices : {movie_indices}")

    # Return the top 10 most similar movies
    return df_clean['primaryTitle'].iloc[movie_indices]


In [60]:
var = get_recommendations('The Kid')
print(var)

1. idx : Int64Index([3], dtype='int64')
2. sim_scores : [(0, array([0.25     , 0.       , 0.25     , ..., 0.2236068, 0.       ,
       0.       ]))]
3. sorted sim_scores : [(0, array([0.25     , 0.       , 0.25     , ..., 0.2236068, 0.       ,
       0.       ]))]


AttributeError: 'list' object has no attribute 'iloc'