### Movie dataset from https://grouplens.org/datasets/movielens/latest/

In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

### Read Movie dataset

In [2]:
# Directory holding the MovieLens "ml-latest-small" CSV files. The default is the
# original author's local path; set the MOVIELENS_DIR environment variable to
# run the notebook on another machine without editing this cell.
data_dir = Path(os.environ.get(
    "MOVIELENS_DIR",
    "/Users/vishalbarad/Desktop/Python/ML/Recommender_system/collaborative_based/Model_based/ml-latest-small",
))
movies_df = pd.read_csv(data_dir / "movies.csv")
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


### Drop the 'genres' column: it is not needed because collaborative filtering uses only the user–movie ratings (the model below compares movies by the ratings users gave them)

In [3]:
# Remove the 'genres' column. Reassigning (instead of inplace=True) together with
# errors='ignore' keeps this cell idempotent: re-running it after the column is
# already gone no longer raises a KeyError.
movies_df = movies_df.drop(columns='genres', errors='ignore')

In [4]:
# Preview the first rows to confirm the 'genres' column is gone.
movies_df.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


### Read Rating dataset

In [5]:
# Directory holding the MovieLens "ml-latest-small" CSV files. The default is the
# original author's local path; set the MOVIELENS_DIR environment variable to
# run the notebook on another machine without editing this cell.
data_dir = Path(os.environ.get(
    "MOVIELENS_DIR",
    "/Users/vishalbarad/Desktop/Python/ML/Recommender_system/collaborative_based/Model_based/ml-latest-small",
))
rating_df = pd.read_csv(data_dir / "ratings.csv")
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


### Drop 'timestamp' column

In [6]:
# Remove the 'timestamp' column. Reassigning (instead of inplace=True) together
# with errors='ignore' keeps this cell idempotent: re-running it after the
# column is already gone no longer raises a KeyError.
rating_df = rating_df.drop(columns="timestamp", errors="ignore")

In [7]:
# Preview the first rows to confirm the 'timestamp' column is gone.
rating_df.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


### Merge the 'movie' and 'rating' datasets on the 'movieId' column

In [8]:
# Join every rating with its movie's title via the shared 'movieId' key
# (default inner join).
df = movies_df.merge(rating_df, on='movieId')

In [9]:
# Each row of the merged frame is one rating: movieId, title, userId, rating.
df.head()

Unnamed: 0,movieId,title,userId,rating
0,1,Toy Story (1995),1,4.0
1,1,Toy Story (1995),5,4.0
2,1,Toy Story (1995),7,4.5
3,1,Toy Story (1995),15,2.5
4,1,Toy Story (1995),17,4.5


### Count the total number of ratings each movie has and drop the movies whose count is not above 50, i.e. keep only the popular movies

In [10]:
# Number of ratings per title, as a two-column frame: 'title' and
# 'totalRatingCount'.
movie_ratingCount = (
    df.groupby('title')['rating']
    .count()
    .reset_index(name='totalRatingCount')
)
movie_ratingCount.head()

Unnamed: 0,title,totalRatingCount
0,'71 (2014),1
1,'Hellboy': The Seeds of Creation (2004),1
2,'Round Midnight (1986),2
3,'Salem's Lot (2004),1
4,'Til There Was You (1997),2


In [11]:
# Attach each title's total rating count to every one of its rating rows.
rating_with_totalRatingCount = df.merge(movie_ratingCount, on='title')

In [12]:
# Every rating row now also carries the total number of ratings of its movie.
rating_with_totalRatingCount.head()

Unnamed: 0,movieId,title,userId,rating,totalRatingCount
0,1,Toy Story (1995),1,4.0,215
1,1,Toy Story (1995),5,4.0,215
2,1,Toy Story (1995),7,4.5,215
3,1,Toy Story (1995),15,2.5,215
4,1,Toy Story (1995),17,4.5,215


In [13]:
# (rows, columns) before the rarely rated movies are filtered out.
rating_with_totalRatingCount.shape

(100836, 5)

In [14]:
# Keep only the movies that have more than `movierating_threshold` ratings.
# The threshold is defined once here and used in the filter, so changing it in
# one place is enough.
movierating_threshold = 50
popular_movie = rating_with_totalRatingCount[
    rating_with_totalRatingCount['totalRatingCount'] > movierating_threshold
]

In [15]:
# Preview the rating rows that survived the popularity filter.
popular_movie.head()

Unnamed: 0,movieId,title,userId,rating,totalRatingCount
0,1,Toy Story (1995),1,4.0,215
1,1,Toy Story (1995),5,4.0,215
2,1,Toy Story (1995),7,4.5,215
3,1,Toy Story (1995),15,2.5,215
4,1,Toy Story (1995),17,4.5,215


In [16]:
# (rows, columns) after the popularity filter.
popular_movie.shape

(40712, 5)

### Create a pivot table with index='title', columns='userId' and values='rating', so that each movie becomes a row vector of user ratings on which cosine similarity can be computed

In [17]:
# Movie-by-user rating matrix: one row per title, one column per userId.
# Cells are NaN where a user has not rated the movie.
movie_feature_df = popular_movie.pivot_table(
    values='rating',
    index='title',
    columns='userId',
)
movie_feature_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You (1999),,,,,,,,,,,...,,,3.0,,5.0,,,,,
12 Angry Men (1957),,,,5.0,,,,,,,...,5.0,,,,,,,,,
2001: A Space Odyssey (1968),,,,,,,4.0,,,,...,,,5.0,,,5.0,,3.0,,4.5
28 Days Later (2002),,,,,,,,,,,...,,,,,,,,3.5,,5.0
300 (2007),,,,,,,,,,3.0,...,,,,,3.0,,,5.0,,4.0


In [18]:
# Replace every NaN (movie not rated by that user) with 0 so each row is a
# fully numeric vector.
movie_feature_df = movie_feature_df.fillna(0)

In [19]:
# Preview the matrix after filling: unrated (movie, user) pairs are now 0.
movie_feature_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Things I Hate About You (1999),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0
12 Angry Men (1957),0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2001: A Space Odyssey (1968),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,5.0,0.0,0.0,5.0,0.0,3.0,0.0,4.5
28 Days Later (2002),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.5,0.0,5.0
300 (2007),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,5.0,0.0,4.0


### Create a sparse matrix from the above (movie_feature_df) dataframe, because the classes in 'sklearn.neighbors' can handle either NumPy arrays or scipy.sparse matrices as input.
### Sparse matrix = a matrix in which most elements are zero, i.e. it contains very few non-zero elements.

In [20]:
from scipy.sparse import csr_matrix
# Compressed-sparse-row version of the movie-by-user rating matrix; row order
# is the same as in movie_feature_df.
movie_feature_df_matrix = csr_matrix(movie_feature_df.to_numpy())

### Now we'll use the unsupervised 'NearestNeighbors' algorithm with the 'cosine' distance metric

In [21]:
from sklearn.neighbors import NearestNeighbors

In [22]:
# Brute-force nearest-neighbour search using cosine distance between rows
# (each row is one movie's rating vector).
model = NearestNeighbors(metric='cosine',algorithm='brute')
model.fit(movie_feature_df_matrix)  # Fit on the sparse movie-by-user matrix

NearestNeighbors(algorithm='brute', metric='cosine')

In [23]:
# Movie titles in row order: a[i] is the title of row i of movie_feature_df.
a = movie_feature_df.index
a

Index(['10 Things I Hate About You (1999)', '12 Angry Men (1957)',
       '2001: A Space Odyssey (1968)', '28 Days Later (2002)', '300 (2007)',
       '40-Year-Old Virgin, The (2005)', 'A.I. Artificial Intelligence (2001)',
       'Abyss, The (1989)', 'Ace Ventura: Pet Detective (1994)',
       'Ace Ventura: When Nature Calls (1995)',
       ...
       'Wild Wild West (1999)', 'Willy Wonka & the Chocolate Factory (1971)',
       'Wizard of Oz, The (1939)', 'Wolf of Wall Street, The (2013)',
       'X-Men (2000)', 'X-Men: The Last Stand (2006)',
       'X2: X-Men United (2003)', 'Young Frankenstein (1974)',
       'Zombieland (2009)', 'Zoolander (2001)'],
      dtype='object', name='title', length=437)

In [24]:
# Lookup table from movie title (index) to its 0-based row number in
# movie_feature_df, stored in the 'asso_num' column.
movie_ind = pd.DataFrame(
    {'asso_num': list(range(len(movie_feature_df)))},
    index=a,
)

In [25]:
# Spot-check a few random title -> row-number pairs (no seed is set, so the
# sample differs on every run).
movie_ind.sample(5)

Unnamed: 0_level_0,asso_num
title,Unnamed: 1_level_1
"Christmas Story, A (1983)",91
Army of Darkness (1993),31
Wallace & Gromit: The Wrong Trousers (1993),416
Romancing the Stone (1984),328
Avatar (2009),36


In [26]:
# Ask for the movie to base the recommendations on. The text is used as-is as
# an index label when looking up movie_ind, so it must match a title exactly,
# e.g. "Avatar (2009)".
b = input("Enter movie name : ")

Enter movie name : Avatar (2009)


In [27]:
# Row number of the entered title; raises KeyError if the title is not in the
# lookup table.
moive_index = movie_ind.loc[b, 'asso_num']

In [28]:
moive_index  # Row number associated with the entered movie (36 for 'Avatar (2009)')

36

In [29]:
# Rating vector of the selected movie: one entry per user column, 0 where the
# user has not rated it.
movie_feature_df.iloc[moive_index,:].values

array([0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 2.5, 0. , 0. , 0. ,
       0. , 3. , 0. , 0. , 4. , 0. , 0. , 4. , 3.5, 0. , 0. , 0. , 0. ,
       0. , 5. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 0. , 0. , 0. , 0. , 0. , 2.5, 0. , 0. , 2. , 0. , 4.5,
       0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 4. , 0. , 0. , 0. , 0. ,
       0. , 5. , 0. , 0. , 0. , 0. , 3.5, 0. , 0. , 0. , 0. , 0. , 0. ,
       4. , 0. , 0. , 4.5, 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 3. , 3. , 4. ,
       5. , 0. , 0. , 0. , 0. , 2. , 4. , 0. , 0. , 0. , 0. , 0. , 0. ,
       5. , 0. , 0. , 4.5, 0. , 0. , 4.5, 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 0. , 0. , 0. , 0. , 0. , 1.5, 0. , 2.5, 0. , 0. , 0. ,
       0. , 0. , 0. , 4. , 0. , 0. , 0. , 0. , 1. , 5. , 0. , 0. , 0. ,
       0. , 4. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. , 0. ,
       0. , 0. , 0. , 0. , 0. , 2.5, 0. , 0. , 0. , 0. , 0. , 0.

### Now we'll use 'kneighbors()' of the 'NearestNeighbors' model (which returns distances and indices) to find the movies most similar to the selected one

In [30]:
# Query the fitted model with the selected movie's rating vector, reshaped to a
# single-row 2D array. Both results have shape (1, 10): cosine distances and the
# row numbers of the nearest movies.
distance, indices = model.kneighbors(
    movie_feature_df.iloc[moive_index].to_numpy().reshape(1, -1),
    n_neighbors=10,
)

In [31]:
# Row labels (movie titles); used to translate row numbers back into titles.
movie_feature_df.index

Index(['10 Things I Hate About You (1999)', '12 Angry Men (1957)',
       '2001: A Space Odyssey (1968)', '28 Days Later (2002)', '300 (2007)',
       '40-Year-Old Virgin, The (2005)', 'A.I. Artificial Intelligence (2001)',
       'Abyss, The (1989)', 'Ace Ventura: Pet Detective (1994)',
       'Ace Ventura: When Nature Calls (1995)',
       ...
       'Wild Wild West (1999)', 'Willy Wonka & the Chocolate Factory (1971)',
       'Wizard of Oz, The (1939)', 'Wolf of Wall Street, The (2013)',
       'X-Men (2000)', 'X-Men: The Last Stand (2006)',
       'X2: X-Men United (2003)', 'Young Frankenstein (1974)',
       'Zombieland (2009)', 'Zoolander (2001)'],
      dtype='object', name='title', length=437)

In [32]:
# Sanity check: the row number maps back to the entered title.
movie_feature_df.index[moive_index]

'Avatar (2009)'

### Here we'll use NumPy's flatten() to convert the 2D arrays into 1D arrays

In [33]:
# Print the selected movie followed by its nearest neighbours, closest first.
# kneighbors() was called with a single query row, so both result arrays are
# 2D with one row; flatten them once instead of on every loop iteration.
neighbor_rows = indices.flatten()
neighbor_distances = distance.flatten()

print("Recommondation for movie '{}' are\n".format(movie_feature_df.index[moive_index]))

# The query movie is part of the fitted data, so it normally comes back as its
# own nearest neighbour. Exclude it by row number rather than by assuming it is
# always at position 0 (a tie with an identical rating vector could reorder it).
recommendations = [
    (row, dist)
    for row, dist in zip(neighbor_rows, neighbor_distances)
    if row != moive_index
]
for rank, (row, dist) in enumerate(recommendations, start=1):
    print("{0}: '{1}', with distance of {2}:".format(rank, movie_feature_df.index[row], dist))

Recommondation for movie 'Avatar (2009)' are

1: 'Up (2009)', with distance of 0.36544462382709086:
2: 'Iron Man (2008)', with distance of 0.3757322316601438:
3: 'District 9 (2009)', with distance of 0.38800147272775876:
4: 'WALL·E (2008)', with distance of 0.3971347162739116:
5: 'Hangover, The (2009)', with distance of 0.3982949120164926:
6: 'Kung Fu Panda (2008)', with distance of 0.41309164032650225:
7: 'Harry Potter and the Half-Blood Prince (2009)', with distance of 0.42631786369091795:
8: 'Avengers, The (2012)', with distance of 0.44044654900512115:
9: 'Sherlock Holmes (2009)', with distance of 0.44427210599550426:
