In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
import sklearn
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors

In [2]:
# Load the three input tables (movies, ratings, users) from CSV files in the
# working directory, in that order.
movies, ratings, users = map(
    pd.read_csv,
    ("movies_new.csv", "ratings_new.csv", "users_new.csv"),
)

In [3]:
# Preview the first five rows of the movies table.
movies.head(n=5)

Unnamed: 0,serial_no,movie_id,movie_title,movie_genre
0,0,1,Toy Story (1995),Animation|Children's|Comedy
1,1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,2,3,Grumpier Old Men (1995),Comedy|Romance
3,3,4,Waiting to Exhale (1995),Comedy|Drama
4,4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Preview the first five rows of the ratings table.
ratings.head(n=5)

Unnamed: 0,serial_no,user_id,movie_id,rating,timestamp
0,0,1,1193,5,978300760
1,1,1,661,3,978302109
2,2,1,914,3,978301968
3,3,1,3408,4,978300275
4,4,1,2355,5,978824291


In [5]:
# Preview the first five rows of the users table.
users.head(n=5)

Unnamed: 0,serial_no,user_id,gender,age,occupation,zipcode
0,0,1,F,1,10,48067
1,1,2,M,56,16,70072
2,2,3,M,25,15,55117
3,3,4,M,45,7,2460
4,4,5,M,25,20,55455


In [6]:
# Attach movie metadata to every rating: inner join of movies and ratings on movie_id.
combine_movie_rating = movies.merge(ratings, how='inner', on='movie_id')

In [7]:
# Preview the first five rows of the merged movie/rating table.
combine_movie_rating.head(n=5)

Unnamed: 0,serial_no_x,movie_id,movie_title,movie_genre,serial_no_y,user_id,rating,timestamp
0,0,1,Toy Story (1995),Animation|Children's|Comedy,40,1,5,978824268
1,0,1,Toy Story (1995),Animation|Children's|Comedy,469,6,4,978237008
2,0,1,Toy Story (1995),Animation|Children's|Comedy,581,8,4,978233496
3,0,1,Toy Story (1995),Animation|Children's|Comedy,711,9,5,978225952
4,0,1,Toy Story (1995),Animation|Children's|Comedy,837,10,5,978226474


In [8]:
# Discard the two serial-number columns left over from the merge and the
# rating timestamp; none of them is used by the later cells.
# NOTE: this rebinds combine_movie_rating, so re-running the cell without
# re-running the merge raises KeyError (the columns are already gone).
unused_columns = ['serial_no_x', 'serial_no_y', 'timestamp']
combine_movie_rating = combine_movie_rating.drop(columns=unused_columns)

In [9]:
# Keep only the rows that actually have a movie title.
has_title = combine_movie_rating['movie_title'].notnull()
combine_movie_rating = combine_movie_rating[has_title]

In [10]:
# Count the non-null ratings each title received and expose the result as a
# two-column frame: movie_title, TotalRatingCount.
ratings_per_title = combine_movie_rating.groupby('movie_title')['rating'].count()
movie_ratingCount = ratings_per_title.rename('TotalRatingCount').reset_index()
movie_ratingCount.head()

Unnamed: 0,movie_title,TotalRatingCount
0,"$1,000,000 Duck (1971)",37
1,'Night Mother (1986),70
2,'Til There Was You (1997),52
3,"'burbs, The (1989)",303
4,...And Justice for All (1979),199


In [11]:
# Put each title's total rating count next to every individual rating of it
# (inner join on movie_title; the count columns come first).
combine_movie_rating_with_rating_count = movie_ratingCount.merge(
    combine_movie_rating,
    how='inner',
    on='movie_title',
)

In [12]:
# Preview the first five rows of the ratings annotated with per-title counts.
combine_movie_rating_with_rating_count.head(n=5)

Unnamed: 0,movie_title,TotalRatingCount,movie_id,movie_genre,user_id,rating
0,"$1,000,000 Duck (1971)",37,2031,Children's|Comedy,216,2
1,"$1,000,000 Duck (1971)",37,2031,Children's|Comedy,494,5
2,"$1,000,000 Duck (1971)",37,2031,Children's|Comedy,714,4
3,"$1,000,000 Duck (1971)",37,2031,Children's|Comedy,869,1
4,"$1,000,000 Duck (1971)",37,2031,Children's|Comedy,1034,3


In [13]:
# The genre column is not used by the later cells; drop it.
# NOTE: rebinding the same name makes this cell fail with KeyError if it is
# run a second time without re-running the merge that creates the frame.
combine_movie_rating_with_rating_count = combine_movie_rating_with_rating_count.drop(
    columns=['movie_genre']
)

In [14]:
def _format_three_decimals(value):
    """Render a float with exactly three digits after the decimal point."""
    return '%.3f' % value


# Show floats with three decimals from here on, then print summary statistics
# of how many ratings each movie received.
pd.set_option('display.float_format', _format_three_decimals)
total_rating_counts = movie_ratingCount['TotalRatingCount']
print(total_rating_counts.describe())

count   3706.000
mean     269.889
std      384.048
min        1.000
25%       33.000
50%      123.500
75%      350.000
max     3428.000
Name: TotalRatingCount, dtype: float64


In [15]:
# Inspect the upper tail of the rating-count distribution: the 0.90 ... 0.99
# quantiles in steps of 0.01.
# np.linspace with an explicit number of points is used instead of
# np.arange(.9, 1, .01): with a non-integer step the length of an arange is
# subject to floating-point rounding and can gain a stray value next to 1.0.
top_decile_quantiles = np.linspace(0.90, 0.99, 10)
print(movie_ratingCount['TotalRatingCount'].quantile(top_decile_quantiles))

0.900    729.500
0.910    773.550
0.920    825.000
0.930    887.300
0.940    971.400
0.950   1051.500
0.960   1133.800
0.970   1268.100
0.980   1446.600
0.990   1784.900
Name: TotalRatingCount, dtype: float64


In [16]:
# Keep only ratings of "popular" movies, i.e. titles with at least
# popularity_threshold ratings in total (1784 sits just below the 0.99
# quantile, 1784.9, of the rating counts).
popularity_threshold = 1784
is_popular = combine_movie_rating_with_rating_count['TotalRatingCount'] >= popularity_threshold
rating_popular_movie = combine_movie_rating_with_rating_count[is_popular]
rating_popular_movie.head()

Unnamed: 0,movie_title,TotalRatingCount,movie_id,user_id,rating
27878,Alien (1979),2024,1214,4,4
27879,Alien (1979),2024,1214,10,4
27880,Alien (1979),2024,1214,17,5
27881,Alien (1979),2024,1214,19,2
27882,Alien (1979),2024,1214,22,4


In [17]:
# Join the popular-movie ratings with the users table on user_id, then drop
# the user attributes that are not needed so that only zipcode is added.
rating_popular_movie_user_location = (
    rating_popular_movie
    .merge(users, on='user_id')
    .drop(columns=['serial_no', 'gender', 'age', 'occupation'])
)
rating_popular_movie_user_location.head()

Unnamed: 0,movie_title,TotalRatingCount,movie_id,user_id,rating,zipcode
0,Alien (1979),2024,1214,4,4,2460
1,E.T. the Extra-Terrestrial (1982),2269,1097,4,4,2460
2,Jurassic Park (1993),2672,480,4,4,2460
3,Raiders of the Lost Ark (1981),2514,1198,4,5,2460
4,Saving Private Ryan (1998),2653,2028,4,5,2460


In [18]:
# Keep only ratings from users whose ZIP code lies in 10000-40000 (inclusive).
# The first run of digits in each ZIP code is parsed as a number so that the
# range check is numeric. Comparing against the strings '10000'/'40000' orders
# values character by character (e.g. '2460' sorts after '10000') and does not
# work at all when pandas loads the column with a numeric dtype.
# Anything after the first digit run (such as a '-1234' extension) is ignored;
# values without digits become NaN and are filtered out.
zipcode_numeric = pd.to_numeric(
    rating_popular_movie_user_location['zipcode']
    .astype(str)
    .str.extract(r'(\d+)', expand = False),
    errors = 'coerce',
)
rating_popular_movie_user_location_filtered = rating_popular_movie_user_location[
    zipcode_numeric.between(10000, 40000)
]
rating_popular_movie_user_location_filtered.head()

Unnamed: 0,movie_title,TotalRatingCount,movie_id,user_id,rating,zipcode
153,Alien (1979),2024,1214,28,5,14607
154,Aliens (1986),1820,1200,28,4,14607
155,American Beauty (1999),3428,2858,28,4,14607
156,Being John Malkovich (1999),2241,2997,28,4,14607
157,E.T. the Extra-Terrestrial (1982),2269,1097,28,3,14607


In [19]:
# Number of ratings that survived the ZIP-code filter.
rating_popular_movie_user_location_filtered.shape[0]

21341

In [20]:
# Reshape the long ratings table into a movie-by-user grid: one row per
# movie_title, one column per user_id, cells hold the rating.
# pivot() raises if a (movie_title, user_id) pair occurs more than once.
title_by_user = rating_popular_movie_user_location_filtered.pivot(
    index = 'movie_title',
    columns = 'user_id',
    values = 'rating',
)
# A missing cell means the user did not rate the movie; store it as 0.
rating_popular_movie_user_location_filtered_pivot = title_by_user.fillna(0)

# Sparse (CSR) copy of the same grid, used to fit the neighbour model.
dense_ratings = rating_popular_movie_user_location_filtered_pivot.values
rating_popular_movie_user_location_filtered_matrix = csr_matrix(dense_ratings)

In [23]:
# Brute-force nearest-neighbour search over the movie rows, using cosine
# distance between their rating vectors.
knn_settings = {'metric': 'cosine', 'algorithm': 'brute'}
model_knn = NearestNeighbors(**knn_settings)
model_knn.fit(rating_popular_movie_user_location_filtered_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=None, n_neighbors=5, p=2, radius=1.0)

In [37]:
# Pick one movie (a row of the title-by-user pivot) and list the movies whose
# rating vectors are closest to it under the fitted cosine-distance model.
RANDOM_SEED = 42  # fixed so a fresh run reproduces the output; set to None for a new pick on every run
N_RECOMMENDATIONS = 5

n_movies = rating_popular_movie_user_location_filtered_pivot.shape[0]
query_index = np.random.RandomState(RANDOM_SEED).choice(n_movies)

# Ask for one neighbour more than needed because the query movie is itself
# part of the fitted data, but never for more neighbours than there are movies.
distances, indices = model_knn.kneighbors(
    rating_popular_movie_user_location_filtered_pivot.iloc[query_index, :].values.reshape(1, -1),
    n_neighbors = min(N_RECOMMENDATIONS + 1, n_movies),
)

# Remove the query movie by its row index instead of assuming it is the first
# hit: when distances tie, it is not guaranteed to come back in position 0.
neighbours = [
    (movie_index, distance)
    for movie_index, distance in zip(indices.flatten(), distances.flatten())
    if movie_index != query_index
][:N_RECOMMENDATIONS]

print('Recommendations for {0}:\n'.format(rating_popular_movie_user_location_filtered_pivot.index[query_index]))
for rank, (movie_index, distance) in enumerate(neighbours, start = 1):
    print('{0}: {1}, with distance of {2}:'.format(rank, rating_popular_movie_user_location_filtered_pivot.index[movie_index], distance))

Recommendations for Total Recall (1990):

1: Terminator 2: Judgment Day (1991), with distance of 0.2708903015258485:
2: Matrix, The (1999), with distance of 0.30353566244295704:
3: Men in Black (1997), with distance of 0.307087872483639:
4: Jurassic Park (1993), with distance of 0.31145311760784145:
5: Terminator, The (1984), with distance of 0.33987467369852553:
