In [None]:
# Ignore  the warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')
import time
# data visualisation and manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
 
# Plot configuration.
# Render matplotlib figures inline, directly below the cell that produces them.
%matplotlib inline  
# Apply the 'fivethirtyeight' matplotlib style sheet, then the seaborn
# 'whitegrid' theme (seaborn is configured last).
style.use('fivethirtyeight')
sns.set(style='whitegrid',color_codes=True)

#model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix,roc_curve,roc_auc_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

In [None]:


# Users file: pipe-separated; column names are supplied explicitly.
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(
    'u.user',
    sep='|',
    names=u_cols,
    encoding='latin-1',
)

# Ratings file: tab-separated.
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv(
    'u.data',
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)

# Items file: pipe-separated; descriptive columns first, then the remaining
# columns (presumably one flag per genre - confirm against the dataset README).
item_info_cols = ['movie id', 'movie title', 'release date', 'video release date', 'IMDb URL']
genre_cols = [
    'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
i_cols = item_info_cols + genre_cols
items = pd.read_csv(
    'u.item',
    sep='|',
    names=i_cols,
    encoding='latin-1',
)

# With everything loaded, report the dimensions of each table.

# Users: the recorded run shows 943 users with 5 features each
# (user_id, age, sex, occupation, zip_code).
print("\nUser Data :")
print("shape : ", users.shape)

# Ratings: the recorded run shows 100k ratings over user/movie combinations.
print("\nRatings Data :")
print("shape : ", ratings.shape)

# Items: the movie catalogue.
print("\nItem Data :")
print("shape : ", items.shape)


User Data :
shape :  (943, 5)

Ratings Data :
shape :  (100000, 4)

Item Data :
shape :  (1682, 24)


In [None]:
# Preview the first rows of the users table.
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [None]:
# Preview the first rows of the ratings table.
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [None]:
# Convert the unix timestamp of a rating to a calendar date.
from datetime import datetime, timezone


def time_stamp(k):
    """Return the UTC calendar date of a unix timestamp as a 'dd-mm-YYYY' string.

    Parameters
    ----------
    k : int or float
        Seconds since the unix epoch.

    Returns
    -------
    str
        The date formatted with '%d-%m-%Y'.

    The conversion is pinned to UTC so the resulting date does not depend on
    the timezone of the machine running the notebook.
    """
    return datetime.fromtimestamp(k, tz=timezone.utc).strftime('%d-%m-%Y')
# Derive a 'dd-mm-YYYY' date string for every rating and store it as a new column.
rating_dates = ratings['unix_timestamp'].apply(time_stamp)
ratings['rating_date'] = rating_dates
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp,rating_date
0,196,242,3,881250949,04-12-1997
1,186,302,3,891717742,04-04-1998
2,22,377,1,878887116,07-11-1997
3,244,51,2,880606923,27-11-1997
4,166,346,1,886397596,02-02-1998


In [None]:
# First and last calendar year in which ratings were given.
# The strings are day-first ('dd-mm-YYYY'), so the format is spelled out
# instead of letting pandas guess between day-first and month-first; the
# column is also parsed once rather than once per print.
rating_years = pd.to_datetime(ratings['rating_date'], format='%d-%m-%Y').dt.year
print(rating_years.min())
print(rating_years.max())

1997
1998


In [None]:
# Number of days between a rating and a fixed reference date ('01-11-1998').
date_format = "%d-%m-%Y"
# Parsed once here instead of once per row inside sub_dates.
REFERENCE_DATE = datetime.strptime('01-11-1998', date_format)


def sub_dates(a, reference=REFERENCE_DATE):
    """Return the whole number of days from `a` up to `reference`.

    Parameters
    ----------
    a : datetime
        The date the rating was given.
    reference : datetime, optional
        The date to measure against; defaults to '01-11-1998' parsed with
        `date_format`.

    Returns
    -------
    int
        `(reference - a).days`; negative when `a` lies after `reference`.
    """
    return (reference - a).days
def dat_strp(a):
    """Parse the date string `a` using the module-level `date_format` and return a datetime."""
    parsed = datetime.strptime(a, date_format)
    return parsed
# Parse the date strings, then measure each rating's distance to the reference date.
parsed_dates = ratings['rating_date'].apply(dat_strp)
ratings['new_date'] = parsed_dates
ratings['days_diff'] = ratings['new_date'].apply(sub_dates)

# Express the gap in years (365-day years, rounded to two decimals).
ratings['years_diff'] = (ratings['days_diff'] / 365).round(2)
ratings.head()

In [None]:
# Drop the raw timestamp and the intermediate parsed-date column.
# Rebinding the result (instead of inplace=True) together with errors='ignore'
# lets this cell be re-run without raising a KeyError for columns that were
# already removed.
ratings = ratings.drop(columns=['unix_timestamp', 'new_date'], errors='ignore')
ratings.head()

Unnamed: 0,user_id,movie_id,rating,rating_date,days_diff,years_diff
0,196,242,3,04-12-1997,332,0.91
1,186,302,3,04-04-1998,211,0.58
2,22,377,1,07-11-1997,359,0.98
3,244,51,2,27-11-1997,339,0.93
4,166,346,1,02-02-1998,272,0.75


In [None]:
# Summary statistics of the rating age in years (its range informs m and n below).
ratings['years_diff'].describe()

In [None]:
# Linear age weight: weighted_diff = m * years_diff + n.
# m = 0.425 and n = 0.25 were picked by the author from the minimum and
# maximum of years_diff.
m = 0.425
n = 0.25
ratings['weighted_diff'] = ratings['years_diff'].mul(m).add(n)
ratings.head()

In [None]:
# Time-based (temporal) rating: the raw rating divided by its age weight,
# rounded to two decimals.
temporal_rating = ratings['rating'].div(ratings['weighted_diff'])
ratings['final_ratings'] = temporal_rating.round(2)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,rating_date,days_diff,years_diff,weighted_diff,final_ratings
0,196,242,3,04-12-1997,332,0.91,0.63675,4.71
1,186,302,3,04-04-1998,211,0.58,0.4965,6.04
2,22,377,1,07-11-1997,359,0.98,0.6665,1.5
3,244,51,2,27-11-1997,339,0.93,0.64525,3.1
4,166,346,1,02-02-1998,272,0.75,0.56875,1.76


In [None]:
# Summary statistics of the time-weighted ratings.
ratings['final_ratings'].describe()

In [None]:
# One row per distinct user and one column per distinct movie, initialised to 0.
n_users = len(ratings['user_id'].unique())
n_items = len(ratings['movie_id'].unique())
data_matrix = np.zeros(shape=(n_users, n_items))

In [None]:
# Fill cell (user_id - 1, movie_id - 1) with that rating's final_ratings value.
# Columns are addressed by name rather than by tuple position, so the result
# no longer depends on the column order of `ratings`.
# Like the matrix dimensions above, this relies on ids running 1..n without gaps.
# NOTE(review): assumes each (user, movie) pair occurs at most once; with
# duplicates NumPy does not guarantee which value ends up in the cell.
user_rows = ratings['user_id'].values - 1
movie_cols = ratings['movie_id'].values - 1
data_matrix[user_rows, movie_cols] = ratings['final_ratings'].values
data_matrix

array([[6.93, 4.31, 5.96, ..., 0.  , 0.  , 0.  ],
       [7.42, 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       ...,
       [6.93, 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 9.35, 0.  , ..., 0.  , 0.  , 0.  ]])

# A Recommender Model Using KNN

In [None]:
# Attach each movie's title to its ratings.
# Select and rename in one chained expression: renaming in place on a column
# slice of `items` raises pandas' SettingWithCopyWarning.
items_new = items[['movie id', 'movie title']].rename(columns={"movie id": "movie_id"})
df = pd.merge(ratings, items_new, on=['movie_id'], how='inner')
df.head()

Unnamed: 0,user_id,movie_id,rating,rating_date,days_diff,years_diff,weighted_diff,final_ratings,movie title
0,196,242,3,04-12-1997,332,0.91,0.63675,4.71,Kolya (1996)
1,63,242,3,01-10-1997,396,1.08,0.709,4.23,Kolya (1996)
2,226,242,5,04-01-1998,301,0.82,0.5985,8.35,Kolya (1996)
3,154,242,3,10-11-1997,356,0.98,0.6665,4.5,Kolya (1996)
4,306,242,5,10-10-1997,387,1.06,0.7005,7.14,Kolya (1996)


In [None]:
# Count missing values per column of the merged frame.
df.isnull().sum()

In [None]:
# Number of (non-null) ratings each movie title received.
ratings_per_title = df.groupby(by=['movie title'])['rating'].count()
movie_ratingCount = ratings_per_title.rename('totalRatingCount').reset_index()
movie_ratingCount.head()

In [None]:
# Add each title's total rating count to every one of its rating rows.
ratings_with_RatingCounts = pd.merge(
    df,
    movie_ratingCount,
    how='left',
    on='movie title',
)
ratings_with_RatingCounts.head()

Unnamed: 0,user_id,movie_id,rating,rating_date,days_diff,years_diff,weighted_diff,final_ratings,movie title,totalRatingCount
0,196,242,3,04-12-1997,332,0.91,0.63675,4.71,Kolya (1996),117
1,63,242,3,01-10-1997,396,1.08,0.709,4.23,Kolya (1996),117
2,226,242,5,04-01-1998,301,0.82,0.5985,8.35,Kolya (1996),117
3,154,242,3,10-11-1997,356,0.98,0.6665,4.5,Kolya (1996),117
4,306,242,5,10-10-1997,387,1.06,0.7005,7.14,Kolya (1996),117


In [None]:
# Distribution of ratings-per-movie (used to choose the popularity threshold below).
print(movie_ratingCount['totalRatingCount'].describe())

In [None]:
# Keep only movies that received at least 55 ratings.
popularity_threshold = 55
is_popular = ratings_with_RatingCounts['totalRatingCount'].ge(popularity_threshold)
rating_popular_movie = ratings_with_RatingCounts.loc[is_popular]
rating_popular_movie.head()

In [None]:
# Rows and columns left after the popularity filter.
rating_popular_movie.shape

(82407, 10)

In [None]:
# Pivot to a movie-title x user_id table of final_ratings; combinations
# without a rating become 0.
movie_user_ratings = rating_popular_movie.pivot_table(
    values='final_ratings',
    index='movie title',
    columns='user_id',
)
movie_features_df = movie_user_ratings.fillna(0)
movie_features_df.head()

user_id,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,...,904,905,906,907,908,909,910,911,912,913,914,915,916,917,918,919,920,921,922,923,924,925,926,927,928,929,930,931,932,933,934,935,936,937,938,939,940,941,942,943
movie title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
101 Dalmatians (1996),2.98,0.0,0.0,0.0,2.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.23,0.0,4.53,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.34,0.0,0.0,...,0.0,0.0,0.0,7.7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.99,0.0,3.99,0.0,0.0,2.87,7.99,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),7.45,0.0,0.0,0.0,0.0,6.59,7.99,0.0,0.0,7.32,0.0,0.0,6.41,0.0,0.0,7.32,0.0,4.59,0.0,0.0,0.0,0.0,0.0,6.97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2 Days in the Valley (1996),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,5.12,6.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.24,0.0,0.0,0.0,0.0,0.0,0.0,6.16,5.35,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.14,0.0,0.0,0.0,0.0,0.0,0.0,2.8
"20,000 Leagues Under the Sea (1954)",4.47,0.0,0.0,0.0,0.0,0.0,9.99,0.0,0.0,0.0,0.0,0.0,3.93,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.98,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey (1968),5.57,0.0,0.0,0.0,5.61,8.24,9.99,0.0,0.0,7.32,8.13,0.0,8.01,0.0,0.0,5.85,0.0,4.59,0.0,0.0,0.0,0.0,0.0,0.0,5.24,0.0,0.0,0.0,0.0,8.73,6.32,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.24,0.0,2.03,0.0,0.0,0.0,3.99,0.0,0.0,0.0,0.0,0.0,6.24,7.8,0.0,0.0,9.99,5.54,7.99,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.99,0.0


In [None]:
# Store the movie x user table as a CSR sparse matrix and fit a brute-force
# nearest-neighbour index on it using cosine distance.
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

movie_features_df_matrix = csr_matrix(movie_features_df.values)
model_knn = NearestNeighbors(algorithm='brute', metric='cosine')
model_knn.fit(movie_features_df_matrix)


NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [None]:
# (number of movies, number of users) in the feature table.
movie_features_df.shape

(573, 943)

In [None]:
# Select the movie whose similar movies will be shown.
# A seeded, local random generator makes the pick identical on every
# "Restart & Run All"; change SEED to sample a different movie.
SEED = 42
select_index = np.random.RandomState(SEED).choice(movie_features_df.shape[0])
print(select_index)

# Ask for 6 neighbours: the printing cell treats the first hit as the selected
# movie itself and lists the remaining 5 as recommendations.
query_vector = movie_features_df.iloc[select_index, :].values.reshape(1, -1)
distances, indices = model_knn.kneighbors(query_vector, n_neighbors=6)

387


In [None]:
# Row positions (in movie_features_df) of the returned neighbours, as a 1-D array.
indices.flatten()

In [None]:
# Print the selected movie as a header, then its neighbours with their distances.
neighbour_ids = indices.flatten()
neighbour_distances = distances.flatten()

if len(neighbour_distances) > 0:
    print('Recommendations for {0}:\n'.format(movie_features_df.index[select_index]))

# Position 0 is skipped: it is presumably the selected movie itself.
for rank in range(1, len(neighbour_distances)):
    print('{0}: {1}, with distance of {2}:'.format(
        rank,
        movie_features_df.index[neighbour_ids[rank]],
        neighbour_distances[rank],
    ))

Recommendations for Philadelphia (1993):

1: Quiz Show (1994), with distance of 0.42301895071069096:
2: Forrest Gump (1994), with distance of 0.4295194341881575:
3: Field of Dreams (1989), with distance of 0.43039839664095536:
4: Dances with Wolves (1990), with distance of 0.43385506253718853:
5: When Harry Met Sally... (1989), with distance of 0.44594911323197384:


Hence, the movies nearest to the selected movie have been recommended by the KNN algorithm.