In [1]:
import numpy as np
import pandas as pd

In [2]:
# Reading data
# MovieID, Title, Genres
movie_data = pd.DataFrame([movie.replace('\n','').split('::') for movie in open('movies.txt', encoding="ISO-8859-1").readlines()],
                          columns=['MovieID', 'Title', 'Genres'])

# UserID, Gender, Age, Occupation, Zip-code
user_data = pd.DataFrame([user.replace('\n','').split('::') for user in open('users.txt', encoding="ISO-8859-1").readlines()],
                         columns=['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'])
user_data['Age'] = user_data['Age'].astype(int)

# UserID, MovieID, Rating, Timestamp
rating_data = pd.DataFrame([rating.replace('\n','').split('::') for rating in open('ratings.txt', encoding="ISO-8859-1").readlines()],
                           columns=['UserID', 'MovieID', 'Rating', 'Timestamp'])
rating_data['Rating'] = rating_data['Rating'].astype(int)
rating_data['Timestamp'] = pd.to_datetime(rating_data['Timestamp'], unit='s')


### Popular Movies for New Users

In [3]:
def weighted_rating(v,m,R,C):
    '''
    Calculate the weighted rating
    
    Args:
    v -> average rating for each item (float)
    m -> minimum votes required to be classified as popular (float)
    R -> average rating for the item (pd.Series)
    C -> average rating for the whole dataset (pd.Series)
    
    Returns:
    pd.Series
    '''
    return ( (v / (v + m)) * R) + ( (m / (v + m)) * C )

In [4]:
def assign_popular_based_score(rating_df, item_df, user_col, item_col, rating_col):
    '''
    Assigned popular based score based on the IMDB weighted average.
    
    Args:
    rating -> pd.DataFrame contains ['item_id', 'rating'] for each user.
    
    Returns
    popular_items -> pd.DataFrame contains item and IMDB weighted score.
    '''
    
    # pre processing
    vote_count = (
        rating_df
        .groupby(item_col,as_index=False)
        .agg( {user_col:'count', rating_col:'mean'} )
        )
    vote_count.columns = [item_col, 'vote_count', 'avg_rating']
    
    # calcuate input parameters
    C = np.mean(vote_count['avg_rating'])
    m = np.percentile(vote_count['vote_count'], 80)
    vote_count = vote_count[vote_count['vote_count'] >= m]
    R = vote_count['avg_rating']
    v = vote_count['vote_count']
    vote_count['weighted_rating'] = weighted_rating(v,m,R,C)
    
    # post processing
    vote_count = vote_count.merge(item_df, on = [item_col], how = 'left')
    popular_items = vote_count.loc[:,[item_col, 'vote_count', 'avg_rating', 'weighted_rating']]
    
    return popular_items

In [5]:
USER_COL = 'UserID'
ITEM_COL = 'MovieID'
RATING_COL = 'Rating'

pop_items = assign_popular_based_score(rating_data, movie_data, USER_COL, ITEM_COL, RATING_COL)
pop_items = pop_items.sort_values('weighted_rating', ascending = False)

In [6]:
PopularMovies = pd.merge(pop_items, movie_data, how = 'left', on = 'MovieID')

In [16]:
PopularMovies.to_csv("PopularMovies.csv", index = None)

In [7]:
!pip install scikit-surprise -q

In [8]:
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
from collections import defaultdict

from surprise import SVD
from surprise import Dataset

In [9]:
def get_top_n(predictions, n=10):
    """Return the top-N recommendation for each user from a set of predictions.
    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.
    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n

### SVD using Surprise

In [10]:
# Selecting Data
# validation set (5 users)
# user_validation = np.random.choice(rating_data['UserID'], 5)
#user_validation = ['2484', '4448', '2106', '5702', '1018']
# dataset for training and testing
#rating_data_selected = rating_data[~rating_data['UserID'].isin(user_validation)]
rating_data['Rank_Latest'] = rating_data.groupby(['UserID', 'Rating'])['Timestamp'].rank(method='first',ascending=False)

# dataset 1: training dataset
train = rating_data[rating_data['Rank_Latest'] != 1].drop(['Rank_Latest'], axis=1)
# dataset 2: testing dataset
test = rating_data[rating_data['Rank_Latest'] == 1].drop(['Rank_Latest'], axis=1)

In [11]:
train['Rating'] = pd.to_numeric(train['Rating'])
test['Rating'] = pd.to_numeric(test['Rating'])
train.to_csv("train_1.csv",index = None)
test.to_csv("test_1.csv",index = None)

In [12]:
import os
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import PredefinedKFold

# path to dataset folder
files_dir = os.path.expanduser('C://Users//thari//Desktop//Summer 2022//DS4A_Women//DS4A Project//')

# This time, we'll use the built-in reader.
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)


# folds_files is a list of tuples containing file paths:
# [(u1.base, u1.test), (u2.base, u2.test), ... (u5.base, u5.test)]
train_file = files_dir + 'train_%d.csv'
test_file = files_dir + 'test_%d.csv'
folds_files = [(train_file % i, test_file % i) for i in (1,)]

data = Dataset.load_from_folds(folds_files, reader=reader)
pkf = PredefinedKFold()


In [13]:
algo = SVD()

for trainset, testset in pkf.split(data):

    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)

RMSE: 1.2740


In [14]:
top_n = get_top_n(predictions, n=10)
for uid, user_ratings in top_n.items():
    if uid in ['2484', '4448', '2106', '5702', '1018']:
        print(uid, [iid for (iid, _) in user_ratings])

1018 ['318', '2313', '1379', '288']
2106 ['778', '2232', '3568', '1816', '3948']
2484 ['3256', '2058', '2259', '2459', '546']
4448 ['3235', '808', '2004', '1998', '2053']
5702 ['1274', '2081', '3636', '1907', '783']


In [21]:
movie_rec = ['318', '2313', '1379', '288', '778', '2232', '3568', '1816', '3948', '3256', '2058', '2259', '2459', '546', '3235', '808', '2004', '1998', '2053', '1274', '2081', '3636', '1907', '783']
movie_data[movie_data['MovieID'].isin(movie_rec)]

Unnamed: 0,MovieID,Title,Genres
285,288,Natural Born Killers (1994),Action|Thriller
315,318,"Shawshank Redemption, The (1994)",Drama
542,546,Super Mario Bros. (1993),Action|Adventure|Children's|Sci-Fi
768,778,Trainspotting (1996),Drama
773,783,"Hunchback of Notre Dame, The (1996)",Animation|Children's|Musical
798,808,Alaska (1996),Adventure|Children's
1254,1274,Akira (1988),Adventure|Animation|Sci-Fi|Thriller
1358,1379,Young Guns II (1990),Action|Comedy|Western
1751,1816,Two Girls and a Guy (1997),Comedy|Drama
1838,1907,Mulan (1998),Animation|Children's
