## 1. Importing packages

In [1]:
# Ignore warnings
import warnings
warnings.simplefilter(action='ignore')

# Install Prerequisites
# import sys
# !{sys.executable} -m pip install scikit-learn scikit-surprise
# !pip install git+https://github.com/gbolmier/funk-svd

# Exploratory Data Analysis
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Data Preprocessing
import random
from time import time
import cufflinks as cf
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.ticker import NullFormatter
from sklearn.preprocessing import StandardScaler
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

# Models
from surprise import Reader, Dataset
from surprise import SVD, NormalPredictor, BaselineOnly, NMF, SlopeOne, CoClustering
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

# Performance Evaluation
from surprise import accuracy
from sklearn.metrics import mean_squared_error
from surprise.model_selection import GridSearchCV, cross_validate, train_test_split

# Display
%matplotlib inline
sns.set(font_scale=1)
sns.set_style("white")
pd.set_option('display.max_columns', 37)

In [None]:
!pip install comet_ml

In [None]:
# import comet_ml at the top of your file
from comet_ml import Experiment

In [None]:
# Create an experiment. The API key is a secret, so it is read from the
# COMET_API_KEY environment variable instead of being stored in the notebook.
# NOTE(review): the key that used to be hard-coded here should be revoked.
import os

experiment = Experiment(
    api_key=os.environ.get("COMET_API_KEY"),
    project_name="edsa-recommender",
    workspace="janleg"
)

## 2. Loading Data

In [2]:
# Load the genome scores dataset (genome_scores.csv) and preview the first rows
genom_score = pd.read_csv("genome_scores.csv")
genom_score.head()

FileNotFoundError: ignored

In [None]:
# Load the genome tags dataset (genome_tags.csv) and preview the first rows
genom_tags = pd.read_csv("genome_tags.csv")
genom_tags.head()

In [None]:
# Load the IMDB dataset (imdb_data.csv) and preview the first rows
imdb = pd.read_csv("imdb_data.csv")
imdb.head()

In [None]:
# Load the links dataset (links.csv) and preview the first rows
links = pd.read_csv("links.csv")
links.head()

In [None]:
# Load the movies dataset (movies.csv) and preview the first rows
movies = pd.read_csv("movies.csv")
movies.head()

In [None]:
# Load the tags dataset (tags.csv) and preview the first rows
tags = pd.read_csv("tags.csv")
tags.head()

In [None]:
# Load the test dataset (test.csv) and preview the first rows
test = pd.read_csv("test.csv")
test.head()

In [None]:
# Load the train dataset (train.csv) and preview the first rows
train = pd.read_csv("train.csv")
train.head()

In [None]:
# Work on a copy of the ratings without the timestamp column.
# `columns=` replaces the positional axis argument, which pandas 2.0 no longer accepts.
train_df = train.drop(columns="timestamp")
train_df.head()

In [None]:
# Attach the movie columns to every rating. With no `on=` argument, pd.merge joins on
# the column names the two frames share -- presumably only movieId; TODO confirm.
train_main = pd.merge(train_df, movies)
train_main.head()

## 3. Exploratory Data Analysis

In [None]:
# Mean rating of each title, highest first (top five shown)
(
    train_main
    .groupby('title')['rating']
    .mean()
    .sort_values(ascending=False)
    .head()
)

In [None]:
# Number of ratings received by each title, most-rated first (top five shown)
(
    train_main
    .groupby('title')['rating']
    .count()
    .sort_values(ascending=False)
    .head()
)

In [None]:
# One row per title holding its mean rating (column name: 'rating')
ratings = train_main.groupby('title')['rating'].mean().to_frame()
ratings.head()

In [None]:
# Add the number of ratings per title; the assignment aligns on the title index
ratings['num_of_ratings'] = train_main.groupby('title')['rating'].count()
ratings.head()

In [None]:
# Histogram of how many ratings each title received.
# An explicit Axes with a title and axis labels lets the figure stand on its own.
fig, ax = plt.subplots(figsize=(10, 4))
ratings['num_of_ratings'].hist(bins=70, ax=ax)
ax.set(title='Number of ratings per title',
       xlabel='Number of ratings',
       ylabel='Number of titles');

In [None]:
# Distribution of the mean rating per title.
# An explicit Axes with a title and axis labels lets the figure stand on its own.
fig, ax = plt.subplots(figsize=(20, 8))
ratings['rating'].hist(bins=40, ax=ax)
ax.set(title='Mean rating per title',
       xlabel='Mean rating',
       ylabel='Number of titles');

In [None]:
# Distribution of the individual ratings.
# `catplot` is the current name of the removed `factorplot`, and recent seaborn
# versions require `x` to be passed by keyword.
with sns.axes_style('white'):
    g = sns.catplot(x="rating", data=train_main, aspect=2.0, kind='count')
    g.set_ylabels("Total number of ratings")
print(f'Average rating in dataset : {np.mean(train_main["rating"])}')

In [None]:
# Print the inclusive row ranges of consecutive 5000-row chunks of train_main.
# Only chunks followed by another chunk start are printed, so the final
# (possibly partial) chunk is not listed.
chunk_size = 5000
chunks = list(range(0, train_main.shape[0], chunk_size))

for chunk_start, next_start in zip(chunks, chunks[1:]):
    print(chunk_start, next_start - 1)

In [None]:
# Distribution of movie genres: count how many movies list each genre
plt.figure(figsize=(20, 7))
geners_count = {}
generlist = movies['genres'].apply(lambda genres_field: str(genres_field).split("|"))

for movie_genres in generlist:
    for genre in movie_genres:
        # start at 0 for a genre seen for the first time
        geners_count[genre] = geners_count.get(genre, 0) + 1
#geners_count.pop("(No genre listed)")
plt.bar(geners_count.keys(), geners_count.values(), color='m')

In [None]:
# Per-user count and mean of the numeric columns.
# The numeric columns are selected explicitly: aggregating the string columns
# (title, genres) with a mean raises on current pandas, and string aggregator
# names avoid the deprecation warning for passing NumPy callables.
ratings_grouped_by_users = (
    train_main
    .groupby('userId')[['movieId', 'rating']]
    .agg(['size', 'mean'])
)

In [None]:
# Display the per-user aggregates
ratings_grouped_by_users

In [None]:
# Bar chart of the ten users with the largest number of ratings
(
    ratings_grouped_by_users['rating']['size']
    .sort_values(ascending=False)
    .head(10)
    .plot(kind='bar', figsize=(10, 5))
)

In [None]:
# Per-movie mean of userId and rating.
# The original call passed `np.size` as a stray positional argument after the
# aggregator list; it is removed. The numeric columns are selected explicitly so
# that the string columns (title, genres) are not averaged.
ratings_grouped_by_movies = (
    train_main
    .groupby('movieId')[['userId', 'rating']]
    .agg(['mean'])
)
ratings_grouped_by_movies

In [None]:
# The mean of userId carries no information, so that column group is removed
ratings_grouped_by_movies = ratings_grouped_by_movies.drop(columns='userId')

In [None]:
# Display the per-movie aggregates after dropping the userId columns
ratings_grouped_by_movies

In [None]:
# Bar chart of the 20 movies with the highest mean rating
(
    ratings_grouped_by_movies['rating']['mean']
    .sort_values(ascending=False)
    .head(20)
    .plot(kind='bar', figsize=(7, 6))
);

In [None]:
# Boolean mask selecting movies whose mean rating is below 1.5
low_rated_movies_filter = ratings_grouped_by_movies['rating']['mean'].lt(1.5)

In [None]:
# Keep only the rows flagged by the low-rating mask
low_rated_movies = ratings_grouped_by_movies.loc[low_rated_movies_filter]

In [None]:
# Bar chart of the first 20 low-rated movies
low_rated_movies.head(20).plot.bar(figsize=(7, 5));

In [None]:
# Preview the first ten low-rated movies
low_rated_movies.head(10)

In [None]:
# Mean rating and number of ratings per title, with title as a regular column
agg_ratings = (
    train_main
    .groupby('title')
    .agg(mean_rating=('rating', 'mean'), number_of_ratings=('rating', 'count'))
    .reset_index()
)

In [None]:
# Keep titles with more than 100 ratings.
# NOTE(review): the variable name says 5000 but the threshold applied is 100.
agg_ratings_5000 = agg_ratings.loc[agg_ratings['number_of_ratings'].gt(100)]

In [None]:
# Display the filtered per-title aggregates
agg_ratings_5000

In [None]:
# Most popular titles: the five with the largest number of ratings
(
    agg_ratings_5000
    .sort_values(by='number_of_ratings', ascending=False)
    .head()
)

In [None]:
# Create a publish-year column from the "(YYYY)" suffix of each title and print
# the number of observations without a publish year (stored as 0).
# The year is matched with an anchored pattern instead of slicing the last
# characters blindly inside a bare `except`, so only a real four-digit year in
# parentheses is accepted and trailing whitespace does not hide it.
years = (
    train_main['title']
    .str.extract(r'\((\d{4})\)\s*$', expand=False)
    .fillna(0)
    .astype(int)
    .tolist()
)

train_main['moviePubYear'] = years
print(len(train_main[train_main['moviePubYear'] == 0]))

## 4. Data Cleaning

In [None]:
# Print the structure of every dataset.
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print(str(...)) added a spurious "None" line after each report.
# The tags dataset is intentionally left out, as in the original cell.
frames_to_inspect = {
    "Train": train,
    "Test": test,
    "Movies": movies,
    "Links": links,
    "IMDB": imdb,
    "Genome score": genom_score,
    "Genome tags": genom_tags,
}

for position, (label, frame) in enumerate(frames_to_inspect.items()):
    if position > 0:
        print("============")
    print(f"{label}: ")
    frame.info()

In [None]:
# Print the number of missing values per column for every dataset.
# The tags dataset is intentionally left out.
null_report_frames = [
    ("Train", train),
    ("Test", test),
    ("Movies", movies),
    ("Links", links),
    ("IMDB", imdb),
    ("Genome score", genom_score),
    ("Genome tags", genom_tags),
]

for position, (label, frame) in enumerate(null_report_frames):
    if position > 0:
        print("============")
    print(f"{label}: ")
    print(frame.isnull().sum())

In [None]:
# Drop rows with missing values.
# Rebinding the name instead of mutating in place keeps the cell free of
# hidden in-place state changes.
links = links.dropna(axis=0)

### Scaling Scores

In [None]:
# Standardise a small random sample (0.01% of the rows) of the genome scores.
# This might not be necessary as the scores are between 0 and 1 already.
# A fixed random_state makes the sample, and everything derived from it, reproducible.
# NOTE(review): every column of genom_score is scaled here, not only the score column.
scaler_mds = StandardScaler()
mds_genome = scaler_mds.fit_transform(genom_score.sample(frac=0.0001, random_state=42))

In [None]:
# Three-dimensional t-SNE; random_state is fixed so the embedding is reproducible.
# NOTE(review): learning_rate=0.1 is far below the values usually recommended
# for t-SNE -- confirm it is intentional.
tsne = TSNE(n_components=3, n_jobs=-1, verbose=2, perplexity=10,
            learning_rate=0.1, random_state=42)

In [None]:
# Fit t-SNE on the scaled sample; later cells read the result from tsne.embedding_
tsne.fit(mds_genome)

In [None]:
# 3D scatter plot of the t-SNE embedding.
# The bare `Axes3D` statement was a no-op and is removed; the import in the
# first cell is what makes the '3d' projection available on older matplotlib.
fig = plt.figure(figsize=(15, 8))

ax = fig.add_subplot(projection='3d')
ax.scatter(tsne.embedding_[:,0], tsne.embedding_[:,1], tsne.embedding_[:,2], color='#4D17A0')
ax.set_title('t-SNE embedding of the sampled genome scores')
ax.set_xlabel('t-SNE 1')
ax.set_ylabel('t-SNE 2')
ax.set_zlabel('t-SNE 3')
plt.show()

In [None]:
# 2D view of the t-SNE embedding; the marker size encodes the third dimension
embedding = tsne.embedding_
fig = plt.figure(figsize=(8, 6))
sns.scatterplot(x=embedding[:, 0], y=embedding[:, 1], size=embedding[:, 2], color='#4DA017')
plt.show()

In [None]:
# Manually pivot table as data is too large for in-built functions
def pivot_(df):
    """
    Pivot long-format tag relevance scores into one row per movie.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format scores with 'tagId' and 'relevance' columns. The movie id
        is read from the 'movieId' column when present, otherwise from the index.

    Returns
    -------
    pandas.DataFrame
        A 'movieId' column (sorted) followed by one column per tag id, named
        with the tag id as a string. A (movie, tag) pair absent from `df`
        yields NaN.
    """
    # The original always read the movie ids from df.index, which is a plain
    # RangeIndex when the frame comes straight from read_csv.
    if 'movieId' in df.columns:
        movie_key = df['movieId'].to_numpy()
    else:
        movie_key = df.index.to_numpy()
    movie_ids = sorted(set(movie_key))

    tag_key = df['tagId'].to_numpy()
    relevance = pd.Series(df['relevance'].to_numpy(), index=movie_key)

    pivoted = {'movieId': movie_ids}
    for tag_id in sorted(set(tag_key)):
        # Align on the movie id rather than relying on row order, and use the
        # real tag ids rather than assuming they run contiguously from 1.
        tag_scores = relevance[tag_key == tag_id]
        pivoted[f'{tag_id}'] = tag_scores.reindex(movie_ids).to_numpy()
    # Building the frame once avoids inserting hundreds of columns one by one.
    return pd.DataFrame(pivoted)

In [None]:
# Pivot the genome scores with pivot_ and index the result by movieId
pca_data_pivoted = pivot_(genom_score).set_index('movieId')

In [None]:
# (rows, columns) of the pivoted table
pca_data_pivoted.shape

In [None]:
# Label the pivoted columns with the tag names.
# NOTE(review): assumes the rows of genom_tags are in the same order as the
# pivoted tag columns -- confirm.
pca_data_pivoted.columns = genom_tags['tag'].tolist()

In [None]:
# Preview the pivoted table with its tag-name columns
pca_data_pivoted.head()

In [None]:
# Every column of the pivoted table is used as a feature
features = list(pca_data_pivoted.columns)

In [None]:
# Boxplot of a random selection of unscaled features.
# The column positions are drawn from the actual number of columns: the
# hard-coded range(0, 1129) could produce a position past the last column.
# A seeded generator keeps the selection reproducible.
cf.set_config_file(offline=True, world_readable=True, theme='white')
n_feature_columns = pca_data_pivoted.shape[1]
columns = random.Random(42).sample(range(n_feature_columns), min(20, n_feature_columns))
pca_data_pivoted.iloc[:,columns].iplot(kind='box', title="Boxplots of Features (Unscaled)")

In [None]:
# Standardisation helper
def scaler(df):
    """
    Return `df` transformed by a freshly fitted StandardScaler.

    The scaler is created with with_std=True, fitted on `df`, and the
    transformed data is returned.
    """
    return StandardScaler(with_std=True).fit_transform(df)

In [None]:
# Standardise the pivoted table with the scaler() helper
pca_scaled = scaler(pca_data_pivoted)

In [None]:
# Wrap the scaled values in a data frame that reuses the index and column
# labels of the pivoted table
scaled_pca = pd.DataFrame(pca_scaled, index = pca_data_pivoted.index, columns = pca_data_pivoted.columns)

In [None]:
# Boxplot of scaled features
cf.set_config_file(offline=True, world_readable=True, theme='white')
# Plot with plotly; `columns` holds the column positions sampled for the
# unscaled boxplot, so both plots show the same features
scaled_pca.iloc[:,columns].iplot(kind='box', title="Boxplots of Features (Scaled)")

### Principal Component Analysis

In [None]:
# Define the PCA object (all components kept)
pca = PCA()

# Fit the PCA model to the data and project it onto the principal components.
# NOTE(review): the unscaled table is used here, not scaled_pca -- confirm intent.
prin_comp = pca.fit_transform(pca_data_pivoted[features])

# Data frame of principal components. The columns are named PC1..PCn: the
# components are combinations of all tags, so reusing the tag names was
# misleading, and it failed whenever fewer components than tags were returned.
pca_df = pd.DataFrame(data = prin_comp,
                      index=pca_data_pivoted.index,
                      columns=[f'PC{position + 1}' for position in range(prin_comp.shape[1])]
                     )

# Line graph of cumulative variance explained
plt.plot(np.cumsum(pca.explained_variance_ratio_),color='#4D17A0')
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance');

In [None]:
# Keep as many principal components as are needed to explain 85% of the
# variance (a float n_components is a variance target, not a feature count)
pca_85 = PCA(.85)
pca_85_df = pca_85.fit_transform(pca_data_pivoted)
print(round(pca_85.explained_variance_ratio_.sum()*100, 1),
      "% of variance explained by",
      pca_85.n_components_,
      "components.")

In [None]:
# Wrap the retained principal components in a data frame indexed like the
# pivoted table; the columns get default integer labels
pca_85_df = pd.DataFrame(pca_85_df, index = pca_data_pivoted.index)
pca_85_df.head()

### Within-Cluster Sum of Squares (WCSS)

In [None]:
# Manual implementation of the within-cluster sum of squares (WCSS)
def within_cluster_variation(df, label_col='cluster_label'):
    """
    Sum of squared distances between every row and the mean of its cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus a column holding the cluster label of each row.
    label_col : str
        Name of the cluster-label column.

    Returns
    -------
    The squared deviations summed over all features, rows and clusters.
    """
    centroids = df.groupby(label_col).mean()
    feature_values = df.drop(label_col, axis=1)
    labels = df[label_col]
    return sum(
        (feature_values[labels == label] - centre).pow(2).sum(axis=1).sum()
        for label, centre in centroids.iterrows()
    )

In [None]:
# Try every k between 2 and 18, where 18 is the number of genres
n_clusters = np.arange(2, 19)

# WCSS obtained for each value of k
errors = []

# Cluster on the PCA components only. The original loop wrote 'cluster_label'
# into pca_85_df, so from the second iteration on k-means was fitted with the
# previous labels as an extra feature.
pca_features = pd.DataFrame(pca_85_df).drop(columns='cluster_label', errors='ignore')

for k in n_clusters:
    print(f'training model with {k} clusters')
    # perform k-means clustering
    km = KMeans(n_clusters=k, n_init=10, max_iter=300, random_state=42)
    km.fit(pca_features)

    # measure WCSS on a labelled copy, leaving the features untouched
    print(f'evaluating model with {k} clusters')
    y_preds = km.predict(pca_features)
    labelled = pca_features.assign(cluster_label=y_preds)
    errors.append(within_cluster_variation(labelled, 'cluster_label'))
    print(errors[-1])

In [None]:
# NOTE(review): this cell repeats the WCSS search of the preceding cell and is
# a candidate for removal.
# Try every k between 2 and 18, where 18 is the number of genres
n_clusters = np.arange(2, 19)

# WCSS obtained for each value of k
errors = []

# Cluster on the PCA components only; a 'cluster_label' column left by an
# earlier run must not be fed back into k-means as a feature.
pca_features = pd.DataFrame(pca_85_df).drop(columns='cluster_label', errors='ignore')

for k in n_clusters:
    print(f'training model with {k} clusters')
    # perform k-means clustering
    km = KMeans(n_clusters=k, n_init=10, max_iter=300, random_state=42)
    km.fit(pca_features)

    # measure WCSS on a labelled copy, leaving the features untouched
    print(f'evaluating model with {k} clusters')
    y_preds = km.predict(pca_features)
    labelled = pca_features.assign(cluster_label=y_preds)
    errors.append(within_cluster_variation(labelled, 'cluster_label'))
    print(errors[-1])

### Between-Cluster Sum of Squares (BCSS)

In [None]:
# Between-cluster sum of squares (BCSS)
def between_cluster_variation(df, label_col='label'):
    """
    Size-weighted squared distance between each cluster mean and the overall mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus a column holding the cluster label of each row.
    label_col : str
        Name of the cluster-label column.

    Returns
    -------
    For every cluster, the squared distance from its mean to the mean of all
    rows, multiplied by the number of rows in the cluster, summed over clusters.
    """
    clusters = df.groupby(label_col)
    overall_mean = df.drop(label_col, axis=1).mean()
    squared_offsets = (clusters.mean() - overall_mean) ** 2
    weighted_distances = clusters.size() * squared_offsets.sum(axis=1)
    return weighted_distances.sum()

In [None]:
# Try every k between 2 and 18
n_clusters = np.arange(2, 19)

# BCSS obtained for each value of k
errors = []

# Cluster on the PCA components only. The original loop wrote 'cluster_label'
# into pca_85_df, so k-means was fitted with the previous labels as an extra feature.
pca_features = pd.DataFrame(pca_85_df).drop(columns='cluster_label', errors='ignore')

for k in n_clusters:
    # perform k-means clustering
    km = KMeans(n_clusters=k, n_init=10, max_iter=300, random_state=42)
    km.fit(pca_features)

    # measure BCSS on a labelled copy, leaving the features untouched
    print(f'evaluating model with {k} clusters')
    y_preds = km.predict(pca_features)
    labelled = pca_features.assign(cluster_label=y_preds)
    errors.append(between_cluster_variation(labelled, 'cluster_label'))
    print(errors[-1])

In [None]:
# BCSS against the number of clusters: markers joined by a line
plt.figure(figsize=(12, 8))
plt.scatter(n_clusters, errors)
plt.plot(n_clusters, errors)
plt.xticks(n_clusters)
plt.title('Elbow Method for Determining Optimal Value of k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Between-Cluster Sum of Squares (BCSS)')
plt.show()


### Calinski-Harabasz (CH) Index

In [None]:
def ch_index(df, label_col='label'):
    """
    CH index: between-cluster variation per (K - 1) divided by within-cluster
    variation per (n - K), where n is the number of rows and K the number of
    distinct labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus a column holding the cluster label of each row.
    label_col : str
        Name of the cluster-label column.
    """
    n_rows = len(df)
    n_labels = df[label_col].nunique()
    between_term = between_cluster_variation(df, label_col) / (n_labels - 1)
    within_term = within_cluster_variation(df, label_col) / (n_rows - n_labels)
    return between_term / within_term

In [None]:
# Try every k between 2 and 18, where 18 is the number of genres
n_clusters = np.arange(2, 19)

# CH index obtained for each value of k
errors = []

# Cluster on the PCA components only. The original loop wrote 'cluster_label'
# into pca_85_df, so k-means was fitted with the previous labels as an extra feature.
pca_features = pd.DataFrame(pca_85_df).drop(columns='cluster_label', errors='ignore')

for k in n_clusters:
    print(f'training model with {k} clusters')
    # perform k-means clustering
    km = KMeans(n_clusters=k, n_init=10, max_iter=300, random_state=42)
    km.fit(pca_features)

    # measure CH on a labelled copy, leaving the features untouched
    print(f'evaluating model with {k} clusters')
    y_preds = km.predict(pca_features)
    labelled = pca_features.assign(cluster_label=y_preds)
    errors.append(ch_index(labelled, 'cluster_label'))
    print(errors[-1])

In [None]:
# CH index against the number of clusters: markers joined by a line
plt.figure(figsize=(12, 8))
plt.scatter(n_clusters, errors, color="#4DA017")
plt.plot(n_clusters, errors)
plt.xticks(n_clusters)
plt.title('Elbow Method for Determining Optimal Value of k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('CH index')
#plt.axvline(x=3, color='#4D17A0', lw=2)
plt.show()

In [None]:
K = 3
# Remove any 'cluster_label' column left behind by the model-selection loops so
# that k-means is fitted on the PCA components only. This also makes the cell
# safe to re-run after labels have been assigned.
pca_85_df = pd.DataFrame(pca_85_df).drop(columns='cluster_label', errors='ignore')
# The random state is set for reproducibility; n_init is pinned to the value
# used in the model-selection loops because its default differs between
# scikit-learn releases.
km = KMeans(n_clusters=K, n_init=10, verbose=0, random_state=42)
print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(pca_85_df)
print("done in %0.3fs" % (time() - t0))

In [None]:
# Obtain cluster memberships for each item in the data.
# km.labels_ holds the label of every row the model was fitted on, so nothing
# has to be predicted again; calling predict on pca_85_df breaks once the frame
# carries a 'cluster_label' column (for example when this cell is re-run).
y_preds = km.labels_
pca_85_df['cluster_label'] = y_preds
centers = km.cluster_centers_

In [None]:
# Scatter of the first two PCA components, one colour per cluster
plt.figure(dpi=120)
for cluster_id in range(K):
    members = pca_85_df.loc[pca_85_df['cluster_label'] == cluster_id]
    plt.scatter(members[0], members[1], label="k = "+str(cluster_id+1), alpha=0.85)
# Show cluster centroid locations
plt.scatter(centers[:, 0], centers[:, 1], label="centroid")
plt.legend()
plt.title(f"K = {K}")
plt.xlabel('Feature 1')
plt.ylabel('Feature 2')
plt.show()

### Training Models on a Subset (100k Samples)

In [None]:
# Build the 100k-row subset used to compare models quickly.
# The timestamp column is dropped on a copy: the in-place drop mutated `train`
# and raised a KeyError when the cell was run a second time.
train_subset = train.drop(columns='timestamp', errors='ignore')[:100000]
reader = Reader(rating_scale=(train_subset['rating'].min(), train_subset['rating'].max()))
data = Dataset.load_from_df(train_subset[['userId', 'movieId', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=.25, random_state=42)

In [None]:
# SVD: fit on the training split, predict the held-out split, report RMSE
svd_test = SVD(n_epochs=30, n_factors=200, init_std_dev=0.05, random_state=42)
predictions = svd_test.fit(trainset).test(testset)
svd_rmse = accuracy.rmse(predictions)

In [None]:
# NormalPredictor: fit on the training split, predict the held-out split, report RMSE
np_test = NormalPredictor()
predictions = np_test.fit(trainset).test(testset)
np_rmse = accuracy.rmse(predictions)

In [None]:
# BaselineOnly with SGD baselines (40 epochs): fit, predict, report RMSE
bsl_options = dict(method='sgd', n_epochs=40)
blo_test = BaselineOnly(bsl_options=bsl_options)
predictions = blo_test.fit(trainset).test(testset)
blo_rmse = accuracy.rmse(predictions)

In [None]:
# NMF: fit on the training split, predict the held-out split, report RMSE
nmf_test = NMF()
predictions = nmf_test.fit(trainset).test(testset)
nmf_rmse = accuracy.rmse(predictions)

In [None]:
# SlopeOne: fit on the training split, predict the held-out split, report RMSE
slo_test = SlopeOne()
predictions = slo_test.fit(trainset).test(testset)
slo_rmse = accuracy.rmse(predictions)

In [None]:
# CoClustering: fit on the training split, predict the held-out split, report RMSE
cc_test = CoClustering(random_state=42)
predictions = cc_test.fit(trainset).test(testset)
cc_rmse = accuracy.rmse(predictions)

###  Content Based Filtering

In [None]:
def data_preprocessing(subset_size):
    """Prepare data for use within Content filtering algorithm.

    Adds a 'keyWords' column to the module-level `movies` frame, in which the
    '|' separators of 'genres' are replaced by spaces, then returns the first
    `subset_size` rows.

    Parameters
    ----------
    subset_size : int
        Number of movies to use within the algorithm.

    Returns
    -------
    Pandas Dataframe
        Subset of movies selected for content-based filtering.

    """
    # Split genre data into individual words.
    # regex=False makes '|' a literal separator on every pandas version; as a
    # regular expression '|' is an alternation of two empty patterns.
    movies['keyWords'] = movies['genres'].str.replace('|', ' ', regex=False)
    # Subset of the data
    movies_subset = movies[:subset_size]
    return movies_subset
 
def content_model(movie_list,top_n=10): 
    """Performs Content filtering based upon a list of movies supplied
       by the app user.

    Each candidate movie is scored by its highest genre cosine similarity to
    any of the chosen movies; the candidates are returned best first.

    Parameters
    ----------
    movie_list : list (str)
        Favorite movies chosen by the app user. Any number of titles is
        accepted; titles not found in the movie subset are skipped.
    top_n : int
        Number of top recommendations to return to the user.

    Returns
    -------
    list (str)
        Titles of the top-n movie recommendations to the user, most similar
        first. Empty when none of the supplied titles is found.

    """
    data = data_preprocessing(2000)
    titles = data['title']

    # Genre count matrix and pairwise cosine similarity, labelled with the
    # index of the movie subset
    count_matrix = CountVectorizer().fit_transform(data['keyWords'])
    cosine_sim = pd.DataFrame(cosine_similarity(count_matrix, count_matrix),
                              index=data.index, columns=data.index)

    # Index label of every chosen title found in the subset (first match,
    # duplicates removed, order kept)
    chosen = []
    for title in movie_list:
        matches = titles.index[titles == title]
        if len(matches) and matches[0] not in chosen:
            chosen.append(matches[0])
    if not chosen:
        return []

    # Best similarity to any chosen movie, with the chosen movies removed.
    # The original passed the candidates through np.setdiff1d, which sorts by
    # index and therefore threw the similarity ranking away.
    scores = cosine_sim[chosen].max(axis=1).drop(index=chosen)
    ranked = scores.sort_values(ascending=False, kind='stable')

    # Look titles up by index label: after rows are dropped from `movies`, a
    # positional lookup with these labels returns the wrong titles.
    return titles.loc[ranked.index[:top_n]].tolist()

### Movies Recommendation

In [None]:
# Drop movies with missing values, then request ten recommendations for three
# example titles
movies = movies.dropna()
movie_list = ['Grumpier Old Men (1995)','Ace Ventura: When Nature Calls (1995)','Father of the Bride Part II (1995)']
content_model(movie_list,top_n=10)

### Comparing *Models*

In [None]:
# Bar chart comparing the RMSE of the collaborative-filtering models
fig, axis = plt.subplots(figsize=(8, 5))
rmse_by_model = {
    'SVD': svd_rmse,
    'NormalPredictor': np_rmse,
    'BaselineOnly': blo_rmse,
    'NMF': nmf_rmse,
    'SlopeOne': slo_rmse,
    'CoClustering': cc_rmse,
}
rmse_x = list(rmse_by_model.keys())
rmse_y = list(rmse_by_model.values())
ax = sns.barplot(x=rmse_x, y=rmse_y, palette='brg', edgecolor='black')
plt.title('RMSE Value Per Collaborative-based Filtering Model', fontsize=14)
plt.xticks(rotation=90)
plt.ylabel('RMSE')
# Write the rounded RMSE above every bar
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_y() + bar_height,
            round(bar_height, 2), fontsize=12, ha="center", va='bottom')

plt.show()

### Cross Validation

In [None]:
# SVD configuration to be cross-validated
svd_test = SVD(n_epochs = 40, n_factors = 200, init_std_dev = 0.05, random_state=42)
# Run 5-fold cross-validation on `data` and print the RMSE results; the
# returned results are kept in `a`
a = cross_validate(svd_test, data, measures=['RMSE'], cv=5, verbose=True)

In [None]:
# BaselineOnly configuration (SGD, 40 epochs) to be cross-validated
bsl_options = {'method': 'sgd','n_epochs': 40}
blo_test = BaselineOnly(bsl_options=bsl_options)
# Run 5-fold cross-validation on `data` and print the RMSE results; the
# returned results are kept in `b`
b = cross_validate(blo_test, data, measures=['RMSE'], cv=5, verbose=True)

### Grid Search

In [None]:
# Grid of SVD hyper-parameters. Each list currently holds a single value; the
# wider ranges that were searched are kept in the trailing comments.
param_grid = {'n_epochs':[40], #[30,40,50],
              'n_factors':[400], #[100,200,300,400],
              'init_std_dev':[0.005], #[0.001,0.005,0.05,0.1],
              'random_state':[42]} 
# 5-fold grid search scored by RMSE, using all available cores
grid_SVD = GridSearchCV(SVD, cv=5, measures=['rmse'], param_grid=param_grid, n_jobs=-1)
grid_SVD.fit(data)
print('***Best score:***')
print(grid_SVD.best_score['rmse'])
print('***Best parameters:***')
print(grid_SVD.best_params['rmse'])

### Use Best Parameters to Train Model

In [None]:
# SVD with the grid-search parameters: fit, predict the held-out split, report RMSE
svd_test = SVD(n_epochs=40, n_factors=400, init_std_dev=0.005, random_state=42)
predictions = svd_test.fit(trainset).test(testset)
svd_rmse = accuracy.rmse(predictions)

In [None]:
# Predicted Target Values vs. Actual Target Values
# The frame is built in one call: adding rows one at a time with .loc re-allocates
# the frame on every insertion and is very slow for a test set of this size.
new_df = pd.DataFrame(list(testset), columns=['uid', 'iid', 'rating'])
true = new_df['rating']
# Estimated rating of every prediction, in the same order as the test set
pred = [prediction.est for prediction in predictions]
fig,axis = plt.subplots(figsize=(8, 5))
sns.boxplot(x=true, y=pred, palette="brg")
plt.title("Predicted Target Values vs. Actual Target Values", fontsize=14)
plt.xlabel("Actual Target Values")
plt.ylabel("Predicted Target Values")
plt.show()

### Training on the Whole Dataset

In [None]:
# Train model on whole dataset
reader = Reader(rating_scale=(train_df['rating'].min(), train_df['rating'].max()))
data = Dataset.load_from_df(train_df[['userId', 'movieId', 'rating']], reader)
trainset = data.build_full_trainset()
# NOTE(review): these hyper-parameters differ from the grid-search ones
# (n_epochs=40, n_factors=400) -- confirm which set is intended.
svd = SVD(n_epochs = 30, n_factors = 300, init_std_dev = 0.005, random_state=42, verbose=True)
svd.fit(trainset)

# Create Kaggle submission file
# `test_df` was never defined in this notebook; it is built from the `test`
# frame loaded at the top, on a copy so that `test` itself is not modified.
test_df = test.copy()
# `.est` is the estimated rating of a prediction (the field previously read as x[3])
predictions = [
    svd.predict(user_id, movie_id).est
    for user_id, movie_id in zip(test_df['userId'], test_df['movieId'])
]
test_df['Id'] = test_df['userId'].map(str) +'_'+ test_df['movieId'].map(str)
results = pd.DataFrame({"Id":test_df['Id'],"rating": predictions})
results.to_csv("ZF2_first_submission.csv", index=False)

### Modelling

In [None]:
def content_generate_rating_estimate(movie_id, user, rating_data, k=20, threshold=0.0):
    """Estimate the rating `user` would give `movie_id` from the user's own ratings.

    The estimate is the similarity-weighted average of the user's ratings over
    the (at most) `k` rated movies most similar to `movie_id`.

    Relies on two module-level objects that this function does not define:
    `indices` (maps a movie id to its position in the similarity matrix) and
    `cosine_sim_tfidf` (the similarity matrix).
    NOTE(review): neither is defined in the visible notebook -- confirm they
    are created before this function is called.

    Parameters
    ----------
    movie_id : key of `indices`
        Reference movie whose rating is estimated.
    user : value of the 'userId' column
        User for whom the rating is estimated.
    rating_data : pandas.DataFrame
        Ratings with 'userId', 'movieId' and 'rating' columns.
    k : int
        Maximum number of most-similar rated movies to use.
    threshold : float
        Similarities must be strictly above this value to be used.

    Returns
    -------
    The weighted-average rating, or the mean rating of `movie_id` in
    `rating_data` when no similarity weight is available.
    """
    # heapq was used without ever being imported in this notebook
    import heapq

    # Convert the movie id to a numeric index for our similarity matrix
    b_idx = indices[movie_id]
    pep = [] # <-- Stores our collection of similarity values 
     
    # Gather the similarity ratings between each movie the user has rated
    # and the reference movie 
    for index, row in rating_data[rating_data['userId']==user].iterrows():
        sim = cosine_sim_tfidf[b_idx-1, indices[row['movieId']]-1]
        pep.append((sim, row['rating']))
    # Select the top-N values from our collection
    k_pep = heapq.nlargest(k, pep, key=lambda t: t[0])

    # Compute the weighted average using similarity scores and 
    # user item ratings. 
    simTotal, weightedSum = 0, 0
    for (simScore, rating) in k_pep:
        # Ensure that similarity ratings are above a given threshold
        if (simScore > threshold):
            simTotal += simScore
            weightedSum += simScore * rating
    # An explicit check replaces the ZeroDivisionError handler: a zero total
    # held in a NumPy float yields nan with a warning instead of raising.
    if simTotal == 0:
        # Cold-start problem - No ratings given by user. 
        # We use the average rating for the reference item as a proxy in this case 
        pred_rate = np.mean(rating_data[rating_data['movieId']==movie_id]['rating'])
    else:
        pred_rate = weightedSum / simTotal
    return pred_rate

### Model Evaluation

In [None]:
# TODO: assign the true ratings to compare the estimates against. The statement
# was left unfinished (`actual = `), which is a SyntaxError; None is a placeholder.
actual = None

In [None]:
# RMSE between the actual and the estimated ratings.
# mean_squared_error takes y_true and y_pred as two separate arguments (they
# were passed as one tuple), and the square root is taken explicitly because
# the `squared` flag is not available in every scikit-learn release.
# NOTE(review): y_actual and pred_rate are not defined in the visible notebook
# -- confirm where they come from.
rmse = np.sqrt(mean_squared_error(y_actual, pred_rate))