In [1]:
import pyspark as ps
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from pyspark.ml.recommendation import ALS

In [2]:
movies = pd.read_table('data/movies.dat', delimiter='::', names=['movie', 'name', 'genre'], engine='python')
users = pd.read_table('data/users.dat', delimiter='::', names=['user', 'gender', 'age', 'occupation', 'zip_code'], engine='python')
requests = pd.read_csv('data/requests.csv')
training = pd.read_csv('data/training.csv')

In [3]:
def split_date(df_column):
    lst = []
    for string in df_column:
        x = string.split('(')
        sub_lst = []
        for substring in x:
            
            dropped_string = substring.replace(')','')
#             print(dropped_string)
            sub_lst.append(dropped_string)
#         print(sub_lst)
        lst.append(sub_lst)
    pd.DataFrame(sub_lst)
    
print(split_date(movies.name))

None


In [4]:
lst = []
for string in movies.name:
    x = string.split('(')
    sub_lst = []
    for substring in x:
        dropped_string = substring.replace(')','')
        sub_lst.append(dropped_string)
    lst.append(sub_lst)
lst
        
titles = pd.DataFrame(lst)
titles[1].value_counts()


1996              327
1995              321
1998              305
1997              298
1999              265
                 ... 
Le Proc�s           1
Sibak               1
Train De Vie        1
Xizhao              1
Le Grand Bleu       1
Name: 1, Length: 348, dtype: int64

In [5]:
mask = movies.name.apply(lambda x: True if (len(x.split('(')) > 2) else False)
movies[mask]

Unnamed: 0,movie,name,genre
29,30,Shanghai Triad (Yao a yao yao dao waipo qiao) ...,Drama
46,47,Seven (Se7en) (1995),Crime|Thriller
57,58,"Postino, Il (The Postman) (1994)",Drama|Romance
58,59,"Confessional, The (Le Confessionnal) (1995)",Drama|Mystery
67,68,French Twist (Gazon maudit) (1995),Comedy|Romance
...,...,...,...
3794,3864,Godzilla 2000 (Gojira ni-sen mireniamu) (1999),Action|Adventure|Sci-Fi
3797,3867,All the Rage (a.k.a. It's the Rage) (1999),Drama
3822,3892,Anatomy (Anatomie) (2000),Horror
3832,3902,Goya in Bordeaux (Goya en Bodeos) (1999),Drama


In [6]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

from pyspark.ml.recommendation import ALS
spark_df = spark.createDataFrame(training) 
als_model = ALS(
    itemCol='movie',
    userCol='user',
    ratingCol='rating',
    nonnegative=True,    
    regParam=0.1,
    rank=10) 

als = als_model.fit(spark_df)

In [7]:
genres = set()
for lst in list(movies.genre):
    lst = lst.split('|')
    
    for genre in lst:
        genre = genre.lower()
        genre = genre.strip()
        
        # genre = '_'.join(genre.split('-'))
        genre = genre.split('\'')[0]
        genres.add(genre)  
genres

def glst(row):
    row = row.lower()
    row = row.strip()
    row = row.replace('\'s', '')
    
    return row.split('|')

movies_df = movies.copy()
# make genre to a dummy variable 
def dummy(col, data):
    data[col] = data.genre.apply(lambda x: 1 if col in glst(x) else 0)

for c in list(genres):
    dummy(c, movies_df)

In [8]:
spark = ps.sql.SparkSession.builder.getOrCreate()
df = spark.createDataFrame(training.drop('timestamp', axis = 1))
train, test = df.randomSplit([0.8, 0.2], seed = 427471138)

In [11]:


# Make spark data frame and train and test split 
# Set up spark 
#### Setting Up Spark Session


# Create First model 
params = {'itemCol': 'movie',
          'userCol': 'user',
          'ratingCol': 'rating',
          'nonnegative': True,
          'regParam': 0.1,
          'rank': 10 }

als_model = ALS(**params)
recommender = als_model.fit(train)
test_pred = recommender.transform(test).toPandas()



In [12]:
test_pred


Unnamed: 0,user,movie,rating,prediction
0,3539,148,3,2.589459
1,4277,463,4,3.175510
2,5047,463,3,2.041967
3,2210,463,3,2.561360
4,2777,463,3,2.979098
...,...,...,...,...
160432,1625,3910,4,3.508585
160433,5927,3910,3,3.972936
160434,1936,3910,5,4.047053
160435,1173,3910,4,3.543314


In [13]:
# do first RMSE test 
mask = test_pred.prediction.isna()
temp = test_pred[~mask]
mean_squared_error(temp.rating, temp.prediction)

0.7690945756908956

In [16]:
def cv_search(df, regParams, ranks):
    score = []
    train, test = df.randomSplit([0.8, 0.2])
    
        
    for regParam in regParams:
        for rank in ranks:
            params = {'itemCol': 'movie',
            'userCol': 'user',
            'ratingCol': 'rating',
            'nonnegative': True,
            'regParam': regParam,
            'rank': rank }
            als_model = ALS(**params)
            recommender = als_model.fit(train)
            test_pred = recommender.transform(test).toPandas()
            mask = test_pred.prediction.isna()
            temp = test_pred[~mask]
            score.append(f'regparam: {regParam}, rank: {rank}' )
            score.append(mean_squared_error(temp.rating, temp.prediction))
            
    return score

In [19]:
# cv_search(df, [.001,.01, .1], [25, 50, 100, 200])

In [27]:
pvt_training = pd.pivot_table(training, index = 'user', columns = 'movie', values = 'rating')

In [23]:
import numpy as np
import pandas as pd
from scipy import sparse
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from time import time


class ItemItemRecommender(object):

    def __init__(self, neighborhood_size):
        self.neighborhood_size = neighborhood_size

    def fit(self, ratings_mat):
        self.ratings_mat = ratings_mat
        self.n_users = ratings_mat.shape[0]
        self.n_items = ratings_mat.shape[1]
        self.item_sim_mat = cosine_similarity(self.ratings_mat.T)
        self._set_neighborhoods()

    def _set_neighborhoods(self):
        least_to_most_sim_indexes = np.argsort(self.item_sim_mat, 1)
        self.neighborhoods = least_to_most_sim_indexes[:, -self.neighborhood_size:]

    def pred_one_user(self, user_id, report_run_time=False):
        start_time = time()
        items_rated_by_this_user = self.ratings_mat[user_id].nonzero()[1]
        # Just initializing so we have somewhere to put rating preds
        out = np.zeros(self.n_items)
        for item_to_rate in range(self.n_items):
            relevant_items = np.intersect1d(self.neighborhoods[item_to_rate],
                                            items_rated_by_this_user,
                                            assume_unique=True)  # assume_unique speeds up intersection op
        # note: self.ratings_mat has data type `sparse_lil_matrix`, while
        # self.items_sim_mat is a numpy array. Luckily for us, multiplication
        # between these two classes is defined, and even more luckily,
        # it is defined to as the dot product. So the numerator
        # in the following expression is an array of a single float
        # (not an array of elementwise products as you would expect
        #  if both things were numpy arrays)
            out[item_to_rate] = self.ratings_mat[user_id, relevant_items] * \
                self.item_sim_mat[item_to_rate, relevant_items] / \
                self.item_sim_mat[item_to_rate, relevant_items].sum()
        if report_run_time:
            print("Execution time: %f seconds" % (time()-start_time))
        cleaned_out = np.nan_to_num(out)
        return cleaned_out

    def pred_all_users(self, report_run_time=False):
        start_time = time()
        all_ratings = [
            self.pred_one_user(user_id) for user_id in range(self.n_users)]
        if report_run_time:
            print("Execution time: %f seconds" % (time()-start_time))
        return np.array(all_ratings)

    def top_n_recs(self, user_id, n=10):
        pred_ratings = self.pred_one_user(user_id)
        item_index_sorted_by_pred_rating = list(np.argsort(pred_ratings))
        items_rated_by_this_user = self.ratings_mat[user_id].nonzero()[1]
        unrated_items_by_pred_rating = [item for item in item_index_sorted_by_pred_rating
                                        if item not in items_rated_by_this_user]
        return unrated_items_by_pred_rating[-n:]


def get_ratings_data():
    ratings_contents = pd.read_table("./data/u.data",
        names=["user", "movie", "rating", "timestamp"])

    return ratings_contents

def load_movies():
    columns = """movie id | movie title | release date | video release date |          IMDb URL | unknown | Action | Adventure | Animation |
            Children's | Comedy | Crime | Documentary | Drama | Fantasy |
              Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi |
              Thriller | War | Western """
    columns = [word.strip() for word in columns.split('|')]
    columns = [word.replace(' ','_') for word in columns]
    movies = pd.read_table("./data/u.item", names= columns, sep='|', encoding='latin-1')
    movies = movies[['movie_id', 'movie_title']]
    return movies

In [32]:
# rec = ItemItemRecommender(neighborhood_size=75)
# rec.fit(pvt_training)
# rec.pred_one_user(user_id=600)
pvt_training

movie,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
636,,,,,,,,,,,...,,,,,,,,,,
637,5.0,,,,,,,,,,...,,,,,,,,,,
638,,,,,,,,,,,...,,,,,,,,,,
639,,,,,,,,,,,...,,,,,,,,,,
640,,,,,,4.0,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,,,,2.0,,3.0,,,,,...,,,,,,,,,,
6037,,,,,,,,,,,...,,,,,,,,,,
6038,,,,,,,,,,,...,,,,,,,,,,
6039,,,,,,,,,,,...,,,,,,,,,,


In [37]:
from sklearn.impute import KNNImputer
from sklearn.metrics.pairwise import cosine_similarity
imp = KNNImputer()
imp.fit(pvt_training)

X = imp.transform(pvt_training)

sim = cosine_similarity(X,X)

sim.argsort()
sim = cosine_similarity(X,X)

In [58]:
pvt_training = pvt_training.fillna(0)
sim = cosine_similarity(pvt_training,pvt_training)

In [56]:
pvt_training

movie,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
636,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
637,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
638,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
639,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
640,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,0.0,0.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [67]:
pd.DataFrame(sim)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5389,5390,5391,5392,5393,5394,5395,5396,5397,5398
0,1.000000,0.174665,0.170987,0.157088,0.000000,0.020390,0.131076,0.021000,0.126666,0.021551,...,0.111710,0.047174,0.183838,0.000000,0.213842,0.192785,0.125157,0.046980,0.069927,0.090936
1,0.174665,1.000000,0.235314,0.230834,0.046894,0.189858,0.100528,0.052834,0.111137,0.122242,...,0.152505,0.247358,0.214006,0.060986,0.165909,0.354850,0.291603,0.070144,0.166865,0.264876
2,0.170987,0.235314,1.000000,0.170137,0.026690,0.087494,0.073155,0.060311,0.132101,0.040049,...,0.243314,0.080705,0.125949,0.039858,0.127226,0.182618,0.117020,0.068208,0.068578,0.186783
3,0.157088,0.230834,0.170137,1.000000,0.065769,0.021692,0.124507,0.173628,0.154658,0.080744,...,0.200709,0.087696,0.401663,0.045211,0.133153,0.212165,0.116557,0.025466,0.049340,0.102460
4,0.000000,0.046894,0.026690,0.065769,1.000000,0.128755,0.098976,0.157864,0.006650,0.105305,...,0.086754,0.039279,0.000000,0.050674,0.096608,0.133385,0.193022,0.000000,0.006682,0.162138
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5394,0.192785,0.354850,0.182618,0.212165,0.133385,0.173980,0.103364,0.190972,0.212391,0.186790,...,0.131294,0.209843,0.186426,0.103431,0.267405,1.000000,0.341462,0.124174,0.219115,0.400968
5395,0.125157,0.291603,0.117020,0.116557,0.193022,0.268235,0.097265,0.108147,0.179449,0.297592,...,0.142309,0.276134,0.129985,0.118749,0.141676,0.341462,1.000000,0.049015,0.252146,0.428425
5396,0.046980,0.070144,0.068208,0.025466,0.000000,0.119295,0.000000,0.041260,0.012359,0.084686,...,0.108837,0.106897,0.040689,0.000000,0.063967,0.124174,0.049015,1.000000,0.161714,0.086536
5397,0.069927,0.166865,0.068578,0.049340,0.006682,0.160267,0.062908,0.041635,0.120970,0.167874,...,0.118776,0.250994,0.053750,0.102168,0.068399,0.219115,0.252146,0.161714,1.000000,0.210945


In [57]:
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3652,3653,3654,3655,3656,3657,3658,3659,3660,3661
0,4.0,4.0,4.0,3.6,3.4,4.2,4.0,3.4,3.2,4.2,...,2.6,2.0,1.4,1.8,3.4,4.0,5.0,3.0,4.2,4.2
1,5.0,3.8,2.8,3.4,3.4,3.8,3.4,3.4,2.8,3.8,...,3.0,2.0,1.0,2.0,3.0,3.0,5.0,3.2,4.2,4.4
2,4.4,4.0,3.6,2.2,2.8,4.2,4.0,3.6,3.6,3.8,...,2.8,2.0,1.6,2.2,3.2,4.2,5.0,3.4,4.4,3.4
3,3.8,3.4,2.6,3.2,2.6,4.4,3.8,2.8,3.0,3.8,...,2.6,2.0,1.4,2.2,3.8,3.6,4.8,3.4,4.4,3.8
4,4.4,3.6,3.2,2.6,3.0,4.0,3.6,2.4,2.4,4.0,...,2.0,2.0,1.6,2.0,4.2,3.8,4.4,3.6,4.0,4.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5394,5.0,2.6,2.6,2.0,2.2,3.0,2.8,2.6,2.2,3.0,...,3.0,2.0,1.0,2.4,4.0,3.8,4.4,3.4,4.4,3.6
5395,4.0,3.2,3.4,3.4,3.2,3.8,3.8,3.0,2.4,3.4,...,2.2,2.0,1.2,2.8,3.4,3.8,4.8,3.0,4.8,3.6
5396,4.0,2.4,3.4,3.2,3.2,4.8,4.0,2.8,2.6,4.4,...,2.6,2.0,2.0,2.0,3.2,4.0,4.2,3.6,4.4,3.4
5397,3.8,3.2,2.6,2.8,3.2,3.8,3.2,3.2,3.4,4.0,...,3.2,2.0,1.4,2.2,2.6,4.2,4.4,3.4,4.6,3.6


In [42]:

sim = cosine_similarity(X,X)

In [44]:
sim.argsort()

array([[ 514,  702, 2834, ..., 2814, 1498,    0],
       [ 702,  514, 2834, ..., 5159,  687,    1],
       [ 514,  702, 5044, ..., 3535, 3562,    2],
       ...,
       [ 702,  514, 5044, ..., 3401, 3088, 5396],
       [ 514,  702, 2834, ..., 3535,  288, 5397],
       [ 702, 2834,  514, ..., 3824, 4524, 5398]])

In [51]:
movies.name[sim.argsort]

TypeError: 'Series' object cannot be interpreted as an integer