## A Possible Solution to the Cold-Start Problem

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the movie metadata (movieId, title, genres) and the per-user ratings
# (userId, movieId, rating, timestamp) from CSV files.
# The paths are relative, so they resolve against the kernel's working directory.
movie_titles = pd.read_csv('../../data/movies.csv')
ratings = pd.read_csv('../../data/ratings.csv')

In [3]:
# Preview the first rows of the movie metadata to confirm the columns loaded as expected.
movie_titles.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Preview the first rows of the ratings table (one row per user/movie rating).
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [5]:
# Attach title and genres to every rating row by matching on the shared movieId key.
# An inner join (the default) keeps only ratings whose movieId appears in the metadata.
data = pd.merge(ratings, movie_titles, how='inner', on='movieId')

In [6]:
# Preview the merged frame: each rating row now also carries the movie's title and genres.
data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [7]:
# Sanity-check the merged frame's dimensions as (rows, columns).
data.shape

(100836, 6)

Create a dataframe that gives us the average rating for every movie

In [8]:
# Mean rating per movie title, kept as a one-column DataFrame ('rating') indexed by title
# so that further per-title statistics can be added as extra columns.
average_ratings = data.groupby('title')['rating'].mean().to_frame()
average_ratings.head(10)

Unnamed: 0_level_0,rating
title,Unnamed: 1_level_1
'71 (2014),4.0
'Hellboy': The Seeds of Creation (2004),4.0
'Round Midnight (1986),3.5
'Salem's Lot (2004),5.0
'Til There Was You (1997),4.0
'Tis the Season for Love (2015),1.5
"'burbs, The (1989)",3.176471
'night Mother (1986),3.0
(500) Days of Summer (2009),3.666667
*batteries not included (1987),3.285714


Add the number of ratings that each movie received to our dataframe

In [9]:
# Count the ratings recorded for each title and store them next to the mean rating.
# The count Series is indexed by title, so it aligns with average_ratings on assignment.
average_ratings['total ratings'] = data.groupby('title')['rating'].count()
average_ratings.head()

Unnamed: 0_level_0,rating,total ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
'71 (2014),4.0,1
'Hellboy': The Seeds of Creation (2004),4.0,1
'Round Midnight (1986),3.5,2
'Salem's Lot (2004),5.0,1
'Til There Was You (1997),4.0,2


##### Create a ratings table
- The individual users make up the Index
- The movies are the columns
- The values are the ratings that the individual user gave to each movie

In [10]:
# User-by-movie ratings matrix: one row per userId, one column per title.
# Cells hold the user's rating (averaged if a user/title pair occurs more than once)
# and are NaN where the user has not rated the movie.
user_ratings = pd.pivot_table(
    data,
    values='rating',
    index='userId',
    columns='title',
    aggfunc='mean',
)

In [11]:
# Preview the first 15 users of the ratings matrix; most cells are NaN because
# each user has rated only a small fraction of the movies.
user_ratings.head(15)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,1.0,,,
10,,,,,,,,,,,...,,,,,,,,,,


There are so many movies in the world that most individuals cannot be expected to have watched, let alone rated, every one of them. This is why most entries in the table above are missing (NaN).

In [13]:
import warnings

# Silence every warning for the rest of the kernel session (no category or module filter is given).
# NOTE(review): presumably added to hide the noisy warnings raised by the recommendation step
# below — confirm, and consider narrowing the filter so unrelated warnings stay visible.
warnings.filterwarnings('ignore')

In [25]:
def new_user_recommendations(input, n=5, min_ratings=100, ratings_matrix=None, rating_counts=None):
    """Recommend popular movies whose ratings correlate with a movie the user likes.

    Intended for brand-new users (the cold-start case): they name one movie they
    enjoy and get back the movies whose user ratings are most positively
    correlated with it, restricted to widely rated titles.

    Parameters
    ----------
    input : str
        Exact title of the seed movie; it must be a column of the ratings matrix.
    n : int, default 5
        Maximum number of recommendations to return.
    min_ratings : int, default 100
        Only movies with strictly more than this many ratings are recommended.
    ratings_matrix : pandas.DataFrame, optional
        User-by-title ratings matrix (NaN where unrated). Defaults to the
        notebook-level ``user_ratings``.
    rating_counts : pandas.Series, optional
        Number of ratings per title, indexed by title. Defaults to
        ``average_ratings['total ratings']``.

    Returns
    -------
    pandas.DataFrame
        Columns ``title`` and ``correlation`` (rounded to 2 decimals), sorted by
        descending correlation and indexed by rank starting at 1.

    Raises
    ------
    KeyError
        If ``input`` is not a title present in the ratings matrix.
    """
    # Fall back to the notebook-level tables when no explicit data is supplied.
    if ratings_matrix is None:
        ratings_matrix = user_ratings
    if rating_counts is None:
        rating_counts = average_ratings['total ratings']

    if input not in ratings_matrix.columns:
        raise KeyError(f"Movie title not found in the ratings data: {input!r}")

    # Correlate every movie's rating column with the ratings of the selected movie.
    correlations = ratings_matrix.corrwith(ratings_matrix[input])

    # One row per title: its correlation with the seed movie plus how many ratings it
    # received. Titles whose correlation could not be computed (NaN) are dropped.
    recommendations = (
        correlations.rename('correlation')
        .dropna()
        .rename_axis('title')
        .to_frame()
        .join(rating_counts.rename('total ratings'))
    )

    # Remove the seed movie by label rather than by position: it is not guaranteed to
    # be the first row (it may fall below the popularity threshold, or tie with
    # another movie at correlation 1.0).
    recommendations = recommendations.drop(index=input, errors='ignore')

    # Keep only widely rated movies and put the most correlated ones at the top.
    is_popular = recommendations['total ratings'] > min_ratings
    user_recs = (
        recommendations[is_popular]
        .sort_values('correlation', ascending=False)
        .head(n)
        .reset_index()
    )

    user_recs['correlation'] = user_recs['correlation'].round(decimals=2)

    # Rank labels start at 1 so the displayed table reads as a top-n list.
    user_recs.index = pd.RangeIndex(1, len(user_recs) + 1)

    return user_recs[['title', 'correlation']]

The function above lets a new user, for whom we have no rating data yet, enter the title of a movie they really enjoy and then receive a chosen number of similar movies. We only recommend movies that have more than 100 ratings, because suggesting generally popular movies makes it more likely that a new customer will keep using our service.

In [26]:
# Read a movie title from standard input and show its top recommendations (default n=5).
# NOTE(review): input() blocks a non-interactive "Run All", and the typed title must match
# a column of user_ratings exactly, e.g. "Aladdin (1992)".
new_user_recommendations(input())

Aladdin (1992)


Unnamed: 0,title,correlation
1,Toy Story (1995),0.61
2,"Lion King, The (1994)",0.59
3,Beauty and the Beast (1991),0.58
4,"Truman Show, The (1998)",0.56
5,Finding Nemo (2003),0.54


In [27]:
# Second interactive query; in the recorded run the same title as in the previous cell
# was entered, so the output is identical.
new_user_recommendations(input())

Aladdin (1992)


Unnamed: 0,title,correlation
1,Toy Story (1995),0.61
2,"Lion King, The (1994)",0.59
3,Beauty and the Beast (1991),0.58
4,"Truman Show, The (1998)",0.56
5,Finding Nemo (2003),0.54
