<a href="https://colab.research.google.com/github/vaguiar/CodingSamples/blob/master/hw2/hw2_collab_filtering_exp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display, Markdown, HTML
import os

In [4]:
display(Markdown("### Mounting GDrive in Colab" ))
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

### Mounting GDrive in Colab

Mounted at /content/drive


In [5]:
def get_data(file):
  data = pd.read_csv(f"/content/drive/My Drive/Personalization/hw2/data/ml-latest/{file}")
  display(Markdown(f"{file} has %s columns & %s rows" % ( len(data.columns), len(data.index) ) ))
  return data

display(Markdown("### Reading data"))
movies = get_data("movies.csv")
ratings = get_data("ratings.csv")

### Reading data

movies.csv has 3 columns & 58098 rows

ratings.csv has 4 columns & 27753444 rows

In [6]:
def get_popularity(ratings_df, id_col):
    # count the number of ratings 
    popularity_df = ratings_df[id_col].value_counts().to_frame().reset_index()
    popularity_df.columns = [id_col, 'count']

    # assign weights
    popularity_df['weights'] = 1/(1-popularity_df['count'].map(popularity_df['count'].value_counts(normalize = True)))

    return popularity_df

def sampling_id(popularity_df, id_col, threshold, num, random_state=123):
    # filter by threshold
    valid_df = popularity_df[popularity_df['count'] >= threshold] 
    # sample by weights 
    sampled_id = valid_df.sample(n = num, weights = 'weights', random_state=random_state)[id_col].tolist()
    
    return sampled_id

In [7]:
# initiate constants
MOVIE_THRESHOLD = 50
USER_THRESHOLD = 50

# get movie rating counts df & sampled movie_ids
popularity_movie_df = get_popularity(ratings, id_col='movieId')
sampled_movieId = sampling_id(popularity_movie_df, id_col='movieId', threshold=MOVIE_THRESHOLD, num=1500)

# get user rating counts df & sampled user_ids
popularity_user_df = get_popularity(ratings, id_col='userId')
sampled_userId = sampling_id(popularity_user_df, id_col='userId', threshold=USER_THRESHOLD, num=25000)

In [8]:
display(Markdown("#### Sampled data"))

# select by sampled movieId
sampled_df = ratings[ratings['movieId'].isin(sampled_movieId)]
# select by sampled userId
sampled_df = sampled_df[sampled_df['userId'].isin(sampled_userId)]

#### Sampled data

In [9]:
sampled_df

Unnamed: 0,userId,movieId,rating,timestamp
2122,31,608,5.0,968886879
2135,31,1240,4.0,968887413
2144,31,1304,3.0,968870967
2154,31,2396,5.0,968886749
2158,31,2716,4.0,968871098
...,...,...,...,...
27752635,283215,276,4.0,862414494
27752642,283215,494,5.0,854982148
27752648,283215,608,3.0,854982147
27752664,283215,788,5.0,854982198


### Surprise - KNN User based recommendations

In [10]:
# Only once per session
!pip install scikit-surprise



In [12]:
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split, cross_validate
from surprise import KNNWithMeans
from surprise import accuracy

# Step 1 = Data Import & Prep
# ratings_full = get_data("ratings.csv")
ratings_full = sampled_df

# 20% of the dataset
ratings = ratings_full.iloc[:150000]

# try out the full sample set
# ratings = sampled_df

reader = Reader(rating_scale=(0.5,5))

data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

In [13]:
# Create the train test split
from surprise.model_selection import train_test_split

trainset, testset = train_test_split(data, test_size=0.2, shuffle=True)

In [14]:
## If we had the use data as a training chunk
# # trainsetfull = data.build_full_trainset()

display(Markdown("## Training Set:"))
print('Number of ratings: ', trainset.n_ratings)
print('Number of users: ', trainset.n_users)
print('Number of items: ', trainset.n_items)
print('Global Mean:', trainset.global_mean )

display(Markdown("## Test Set:"))
print('Number of ratings: ', len(testset), '\n')

## Training Set:

Number of ratings:  120000
Number of users:  5763
Number of items:  1468
Global Mean: 3.5425375


## Test Set:

Number of ratings:  30000 



In [15]:
# Step 2 - Cross-Validation

my_k = 25
my_min_k = 5    # if min_k is not satisfied the global average will be used for estimates. (default 1)
my_sim_option = {
    'name':'pearson', 'user_based':True
    }


algo = KNNWithMeans(
  k = my_k, 
  min_k = my_min_k, 
  sim_options = my_sim_option, 
  verbose = True
  )
  
results = cross_validate(
    algo = algo, 
    data = data, 
    measures=['RMSE', 'MAE', 'MSE'], 
    cv=5, 
    return_train_measures=True
    )
    
print("RMSE: ", results['test_rmse'].mean())
print("MAE: ", results['test_mae'].mean())
print("MSE: ", results['test_mse'].mean())

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE:  0.9109500024804905
MAE:  0.696665342386623
MSE:  0.829849221183655


In [16]:
def algo_name(algo):
  return str(algo).split(".")[-1].split("'")[0]

def try_algo(algo_fn, k_val, min_k_val=5, similarity_method='pearson', user_based=True):

  my_k = k_val
  my_min_k = min_k_val
  my_sim_option = {
      'name':similarity_method, 
      'user_based':user_based
      }

  algo = algo_fn(
    k = my_k, 
    min_k = my_min_k, 
    sim_options = my_sim_option, 
    verbose = True
    )
  
  display(Markdown("#### Running '%s' algo (user based=%s) '%s' similarity" %(algo_name(algo_fn), user_based, similarity_method)))

  results = cross_validate(
      algo = algo, 
      data = data, 
      measures=['RMSE', 'MAE', 'MSE'], 
      cv=5, 
      return_train_measures=True
      )

  display(Markdown("#### Results:"))
  print("RMSE: ", results['test_rmse'].mean(), ", MAE: ", results['test_mae'].mean(), ", MSE: ", results['test_mse'].mean())

  return results

display(Markdown("### A single movie in the sampled dataset has atleast 30 user ratings. Hence while finding computing the KNN similarities, with can use a k with value upto 30. I will use 25"))

### A single movie in the sampled dataset has atleast 30 user ratings. Hence while finding computing the KNN similarities, with can use a k with value upto 30. I will use 25

In [None]:
from surprise import KNNWithMeans, KNNWithZScore, KNNBaseline, KNNBasic

algos = [ KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline ]
sims = ['cosine', 'msd', 'pearson', 'pearson_baseline']
user_based_opts = [ True, False ]
k_val = 25

display(Markdown("## Cross Validating to find best Memory Based option"))
display(Markdown("k-NN based nearest neighbor algorithm with k value of '%s'" %(str(k_val))))

userBasedOp = pd.DataFrame(index=[algo_name(i) for i in algos], columns=sims)
itemBasedOp = pd.DataFrame(index=[algo_name(i) for i in algos], columns=sims)

for algo in algos:
  for sim in sims:
    for user_based_opt in user_based_opts:

      rslt = try_algo(algo, k_val=k_val, similarity_method=sim, user_based=user_based_opt)
      
      if user_based_opt:
        userBasedOp.at[algo_name(algo), sim] = rslt['test_rmse'].mean()
      else:
        itemBasedOp.at[algo_name(algo), sim] = rslt['test_rmse'].mean()

In [None]:
userBasedOp

In [None]:
itemBasedOp

In [18]:
# Train model and predicts with train/test data cuts separately.

display(Markdown("#### Our best RMSE is with 'KNNBaseline', UserBased and 'msd' similarity"))

algo = KNNBaseline(
    k=25,
    min_k=5,
    sim_options={
      'name':'msd', 'user_based':True
    },
    verbose=True
)

algo.fit(trainset)
predictions = algo.test(testset)

#### Our best RMSE is with 'KNNBaseline', UserBased and 'msd' similarity

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


In [19]:
accuracy.rmse(predictions)

RMSE: 0.8802


0.8802459297173804

In [20]:
print(testset)

[(63231, 165, 5.0), (7551, 333, 4.5), (55318, 193, 1.0), (21768, 81229, 4.0), (40771, 2927, 4.5), (26372, 53468, 3.0), (36072, 8644, 4.0), (13707, 5349, 2.0), (48491, 8011, 4.0), (26372, 2420, 2.5), (5319, 442, 3.0), (46579, 550, 1.0), (9129, 6373, 3.5), (23728, 1958, 4.0), (45531, 356, 4.0), (42255, 2716, 3.0), (39760, 1370, 3.0), (7263, 104241, 3.5), (33153, 1588, 3.0), (28030, 276, 3.0), (46177, 122882, 4.0), (25338, 1079, 4.0), (29197, 1527, 5.0), (23728, 1269, 3.5), (28227, 778, 3.5), (59455, 504, 1.0), (7272, 924, 5.0), (15811, 61240, 3.5), (40092, 3289, 4.0), (30353, 1240, 5.0), (3882, 585, 4.0), (48213, 41569, 4.0), (22857, 926, 3.0), (9808, 8957, 0.5), (4027, 3969, 4.0), (9685, 166486, 3.0), (33081, 1036, 5.0), (52305, 6934, 2.5), (4796, 48744, 3.0), (38328, 4643, 3.5), (18082, 356, 4.0), (54594, 4226, 4.0), (55157, 1527, 1.0), (53651, 3623, 0.5), (42586, 6333, 1.5), (59455, 6059, 4.0), (27859, 119145, 5.0), (26529, 5349, 3.5), (47675, 4223, 4.0), (10614, 1717, 2.5), (21680, 8

In [21]:
def test_user_mov(user, movie):
  print(algo.predict(uid = user, iid = movie))
  print("User seen in training:", trainset.knows_user(user))
  print("Movie seen in training:", trainset.knows_item(movie))

In [22]:
test_user_mov(7551, 333)

user: 7551       item: 333        r_ui = None   est = 3.53   {'actual_k': 25, 'was_impossible': False}
User seen in training: False
Movie seen in training: True


In [23]:
test_user_mov(20047, 5)

user: 20047      item: 5          r_ui = None   est = 3.50   {'was_impossible': False}
User seen in training: False
Movie seen in training: True


In [24]:
test_user_mov(3344, 5015)

user: 3344       item: 5015       r_ui = None   est = 3.88   {'actual_k': 25, 'was_impossible': False}
User seen in training: True
Movie seen in training: False


In [28]:
display(Markdown("### Computing Baseline"))

# print(ratings.head())
display(Markdown("#### Our goal is the most popular movies"))

uniq_mov_rated = pd.Series.unique(ratings["movieId"].dropna())
print(f"Total movies rated: {len(uniq_mov_rated)} out of total {len(ratings)}")

mov_by_rating = ratings.groupby(["movieId"], dropna=True).agg({"rating": ['min', 'max', 'mean', 'count']})

mov_by_rating.columns = mov_by_rating.columns.get_level_values(1)
mov_by_rating.sort_values("count", ascending=False, inplace=True)

mov_by_rating

### Computing Baseline

#### Our goal is the most popular movies

Total movies rated: 1480 out of total 150000


Unnamed: 0_level_0,min,max,mean,count
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
356,0.5,5.0,4.034784,3608
2858,0.5,5.0,4.087581,2609
608,0.5,5.0,4.085303,2327
364,0.5,5.0,3.803767,2097
4226,0.5,5.0,4.107804,1999
...,...,...,...,...
6572,2.5,2.5,2.500000,1
6406,2.5,2.5,2.500000,1
6111,3.5,3.5,3.500000,1
6009,3.0,3.0,3.000000,1


In [29]:
mov_deets = movies.join(mov_by_rating, on="movieId")
mov_deets.dropna(inplace=True)
mov_deets

Unnamed: 0,movieId,title,genres,min,max,mean,count
11,12,Dracula: Dead and Loving It (1995),Comedy|Horror,0.5,5.0,2.486339,183.0
22,23,Assassins (1995),Action|Crime|Thriller,0.5,5.0,3.106481,216.0
27,28,Persuasion (1995),Drama|Romance,0.5,5.0,3.975000,160.0
37,38,It Takes Two (1995),Children|Comedy,0.5,5.0,2.846154,39.0
47,48,Pocahontas (1995),Animation|Children|Drama|Musical|Romance,0.5,5.0,2.917749,693.0
...,...,...,...,...,...,...,...
54776,185927,Wild Wild Country (2018),Documentary,4.0,4.0,4.000000,3.0
54809,185997,Life of the Party (2018),Comedy,1.5,4.0,3.000000,6.0
55222,186863,Beirut (2018),Action|Thriller,3.0,3.5,3.250000,2.0
55318,187067,Super Troopers 2 (2018),Comedy,2.0,3.5,2.875000,4.0


In [30]:
top_10_most_rated = mov_deets.sort_values("count", ascending=False, inplace=False)
top_10_highly_rated = mov_deets.sort_values(["mean", "count"], ascending=False, inplace=False)

In [31]:
display(Markdown("## Top 10 most rated movies"))
baseline = top_10_most_rated.head(10)
baseline

## Top 10 most rated movies

Unnamed: 0,movieId,title,genres,min,max,mean,count
352,356,Forrest Gump (1994),Comedy|Drama|Romance|War,0.5,5.0,4.034784,3608.0
2773,2858,American Beauty (1999),Drama|Romance,0.5,5.0,4.087581,2609.0
602,608,Fargo (1996),Comedy|Crime|Drama|Thriller,0.5,5.0,4.085303,2327.0
360,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,0.5,5.0,3.803767,2097.0
4133,4226,Memento (2000),Mystery|Thriller,0.5,5.0,4.107804,1999.0
1212,1240,"Terminator, The (1984)",Action|Sci-Fi|Thriller,0.5,5.0,3.839879,1986.0
1645,1704,Good Will Hunting (1997),Drama|Romance,0.5,5.0,4.046049,1911.0
1017,1036,Die Hard (1988),Action|Crime|Thriller,0.5,5.0,3.900498,1809.0
1169,1193,One Flew Over the Cuckoo's Nest (1975),Drama,0.5,5.0,4.21527,1742.0
6765,6874,Kill Bill: Vol. 1 (2003),Action|Crime|Thriller,0.5,5.0,3.812945,1684.0


In [32]:
display(Markdown("## Top 10 highly rated movies"))
top_10_highly_rated.head(10)

## Top 10 highly rated movies

Unnamed: 0,movieId,title,genres,min,max,mean,count
15654,79333,Watch Out for the Automobile (Beregis avtomobi...,Comedy|Crime|Romance,4.5,5.0,4.9,5.0
25542,116989,Tales of the Grim Sleeper (2014),Crime|Documentary,4.5,5.0,4.666667,3.0
30769,131389,Sabrina the Teenage Witch (1996),Children|Comedy|Fantasy|Sci-Fi,4.0,5.0,4.625,4.0
37025,145994,Formula of Love (1984),Comedy,4.0,5.0,4.5,4.0
8670,26181,"War Wagon, The (1967)",Action|Western,4.0,5.0,4.5,3.0
17574,88022,Hot Coffee (2011),Documentary,3.5,5.0,4.5,3.0
8140,8823,"Sting II, The (1983)",Comedy|Crime,4.5,4.5,4.5,2.0
11215,47148,Mrs. Palfrey at the Claremont (2005),Comedy|Drama,4.5,4.5,4.5,2.0
16510,82934,Most Dangerous Man in America: Daniel Ellsberg...,Documentary,4.0,5.0,4.5,2.0
7525,7878,Straight to Hell (1987),Comedy|Crime|Western,4.5,4.5,4.5,1.0


In [33]:
def recommend_top_n_movies_to_user (userId, n, model, sampled_df):

    display(Markdown("## Recommendations for User: %s" %userId))

    if userId not in sampled_df.userId.unique():
        print("user does not exist in sample")
        return None
    else:
        unique_m_all = movies.movieId.unique()
        unique_m_user = sampled_df.loc[sampled_df['userId'] == userId].movieId.unique()

        #get all movies that user hasn't watched
        diff = np.setdiff1d(unique_m_all, unique_m_user)

        predicted_ratings = []
        for m in diff:
            predicted_ratings.append((m,model.predict(userId,m)[3]))

        predicted_ratings_df = pd.DataFrame(predicted_ratings,columns = ['movieId','p_ratings'])
        predicted_ratings_with_mnames_df= pd.merge(predicted_ratings_df,movies,on='movieId',how="left")
        predicted_ratings_with_mnames_df.sort_values('p_ratings', ascending=False, inplace=True)
        top_10_rec_movies = predicted_ratings_with_mnames_df.head(n)

        return top_10_rec_movies

In [34]:
recommend_top_n_movies_to_user(3344, 10, algo, ratings)

## Recommendations for User: 3344

Unnamed: 0,movieId,p_ratings,title,genres
47675,170705,4.931273,Band of Brothers (2001),Action|Drama|War
10623,42217,4.864984,Late Spring (Banshun) (1949),Drama
5944,6123,4.810477,Sunless (Sans Soleil) (1983),Documentary
3740,3894,4.783104,Solas (1999),Drama
26023,118890,4.763407,Bill Hicks: Relentless (1992),Comedy
814,850,4.734411,Cyclo (Xich lo) (1995),Crime|Drama
15538,79333,4.734007,Watch Out for the Automobile (Beregis avtomobi...,Comedy|Crime|Romance
884,922,4.728474,Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),Drama|Film-Noir|Romance
316,326,4.713722,To Live (Huozhe) (1994),Drama
45544,166024,4.709983,Whiplash (2013),(no genres listed)


In [68]:
baseline

Unnamed: 0,movieId,title,genres,min,max,mean,count
352,356,Forrest Gump (1994),Comedy|Drama|Romance|War,0.5,5.0,4.119238,499.0
2773,2858,American Beauty (1999),Drama|Romance,0.5,5.0,4.147059,340.0
602,608,Fargo (1996),Comedy|Crime|Drama|Thriller,0.5,5.0,4.08363,281.0
360,364,"Lion King, The (1994)",Adventure|Animation|Children|Drama|Musical|IMAX,1.0,5.0,3.94,275.0
1212,1240,"Terminator, The (1984)",Action|Sci-Fi|Thriller,0.5,5.0,3.832075,265.0
4133,4226,Memento (2000),Mystery|Thriller,0.5,5.0,4.127451,255.0
1645,1704,Good Will Hunting (1997),Drama|Romance,2.0,5.0,4.133065,248.0
1017,1036,Die Hard (1988),Action|Crime|Thriller,0.5,5.0,3.82636,239.0
1480,1527,"Fifth Element, The (1997)",Action|Adventure|Comedy|Sci-Fi,1.0,5.0,3.725,220.0
2631,2716,Ghostbusters (a.k.a. Ghost Busters) (1984),Action|Comedy|Sci-Fi,0.5,5.0,3.743182,220.0
