In [48]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [49]:
def convert_onehot_to_category(df, id_col, one_hot_columns, category_col='category'):
    df_concat = pd.DataFrame(columns=[id_col, category_col])
    for col in one_hot_columns:
        df_each = df[df[col] >= 1][[id_col, col]]
        df_each[col] = col

        df_each.columns = [id_col, category_col]
        df_concat = pd.concat([df_concat, df_each], axis=0)
    df_concat = df_concat.drop_duplicates().reset_index(drop=True)
    
    return df_concat

In [50]:
df_rating = pd.read_csv("C:/Users/91995/Downloads/archive/ml-100k/u.data", sep="\t")
df_rating.columns = ['user_id', 'movie_id', 'rating', 'timestamp']
df_rating = df_rating.drop(['timestamp'], axis=1)

df_user = pd.read_csv('C:/Users/91995/Downloads/archive/ml-100k/u.user', sep='|', header=None)
df_user.columns = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
from sklearn.preprocessing import LabelEncoder
en = LabelEncoder()
df_user['gender'] = en.fit_transform(df_user['gender'])


item_cols = ['movie_id', 'movie_title', 'release_date', 'video_release_date', 'imdb_url']
genre_cols = ['unknown', 'action', 'adventure', 'animation', 'children', 'comedy', 'crime', 'documentary',
              'drama','fantasy', 'film_noir', 'horror', 'musical', 'mystery', 'romance', 'sci_fi', 'thriller', 
              'war','western']

df_item_genre = pd.read_csv('C:/Users/91995/Downloads/archive/ml-100k/u.item', sep='|',encoding='latin-1')
df_item_genre.columns = item_cols + genre_cols


df_item = df_item_genre[item_cols]
use_item_cols = ['movie_id', 'movie_title', 'release_date']
df_item = df_item[use_item_cols]

df_genre = df_item_genre[['movie_id'] + genre_cols]

use_genre_cols = ['action', 'adventure', 'animation', 'children', 'comedy', 'crime', 'drama', 'horror', 'musical', 'mystery', 'romance', 'sci_fi', 'thriller']
df_genre2 = df_genre[['movie_id'] + use_genre_cols]
df_genre2 = convert_onehot_to_category(df_genre, 'movie_id', use_genre_cols, category_col='genre')
df_genre2 = df_genre2.groupby('movie_id').agg(list).reset_index()

df2 = pd.merge(df_rating, df_item, on='movie_id')
df2 = pd.merge(df2, df_genre2, on='movie_id')
df2 = pd.merge(df2, df_user, on='user_id')



links = pd.read_csv("C:/Users/91995/Downloads/archive/ml-100k/links.csv")
links = links.dropna()


movies = pd.read_csv("C:/Users/91995/Downloads/archive/ml-100k/movies.csv")
movies.columns = ['movie_id2', 'movie_title', 'genres']


df2 = df2.join(movies.set_index('movie_title'), on='movie_title')
df2 = df2.drop(['occupation', 'zip_code', 'genres'], axis=1)
df2 = df2.dropna()
df2 = df2[df2['movie_id'].isin(links['movieId'].values)]

In [51]:
matrix_id = df2.pivot_table(index='user_id', columns='movie_id', values='rating')

In [58]:
from surprise import SVD
from surprise import Dataset, Reader, NormalPredictor
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_rating, reader)
trainData, testData = train_test_split(data, test_size=0.2, random_state=1)

model = SVD()
model.fit(trainData)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2a104ae8b20>

In [59]:
import math

def recommend(uid):
    movies_unwatched = []
    temp = matrix_id.loc[uid]
    
    for a, b in temp.iteritems():
        if math.isnan(b):
            movies_unwatched.append([a, model.predict(uid, a).est])
    
    movies_unwatched = sorted(movies_unwatched, key = lambda x: x[1], reverse=True)
    
    return movies_unwatched

In [60]:
recommendations = []
movies_unwatched = recommend(2)
for i in range(15):
    print(movies_unwatched[i][0])
    recommendations.append(links[links['movieId']==movies_unwatched[i][0]].tmdbId)

print(recommendations)

318
173
483
98
64
963
22
520
427
187
56
114
1137
178
134
[314    278.0
Name: tmdbId, dtype: float64, 171    9482.0
Name: tmdbId, dtype: float64, 478    34024.0
Name: tmdbId, dtype: float64, 96    45549.0
Name: tmdbId, dtype: float64, 63    19760.0
Name: tmdbId, dtype: float64, 942    40206.0
Name: tmdbId, dtype: float64, 21    1710.0
Name: tmdbId, dtype: float64, 515    8005.0
Name: tmdbId, dtype: float64, 422    18215.0
Name: tmdbId, dtype: float64, 185    36196.0
Name: tmdbId, dtype: float64, 55    124057.0
Name: tmdbId, dtype: float64, 112    71754.0
Name: tmdbId, dtype: float64, 1109    44497.0
Name: tmdbId, dtype: float64, 176    15730.0
Name: tmdbId, dtype: float64, 132    124636.0
Name: tmdbId, dtype: float64]
