In [1]:
# Dependencies
import matplotlib.pyplot as plt
%matplotlib inline

import os
import pandas as pd
import numpy as np
import tensorflow as tf
from math import pow, sqrt
from sklearn.model_selection import train_test_split

#os.environ['KMP_DUPLICATE_LIB_OK']='True'

from tensorflow import keras
import scipy.stats
import scipy.spatial
#from sklearn.model_selection import KFold
import random
from sklearn.metrics import mean_squared_error


In [2]:
# Reading movies dataset into a pandas dataframe object.
movies = pd.read_csv("data/ml-latest/movies.csv", low_memory=False, encoding='utf-8')
movies.head()


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
# Reading ratings dataset into a pandas dataframe object.
ratings = pd.read_csv("data/ml-latest/ratings.csv", low_memory=False, encoding='utf-8')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264


In [5]:
len(ratings.userId.unique())


283228

In [6]:
#ratings['userId'].value_counts()

In [7]:
# Getting number of users and movies from the dataset.
user_ids = ratings.userId.unique().tolist()
movie_ids = ratings.movieId.unique().tolist()
#tag_ids = tags.movieId.unique().tolist()
print('Number of Uniq-Users: {}'.format(len(user_ids)))
print('Number of Movies: {}'.format(len(movie_ids)))
#print('Number of Movies Tags: {}'.format(len(tag_ids)))


Number of Uniq-Users: 283228
Number of Movies: 53889


In [47]:
# Getting series of lists by applying split operation.
movies.genres = movies.genres.str.split('|')


In [48]:
# Getting distinct genre types for generating columns of genre type.
genre_columns = list(set([j for i in movies['genres'].tolist() for j in i]))


In [49]:
genre_columns


['Comedy',
 'IMAX',
 'Thriller',
 'Musical',
 'Drama',
 'Fantasy',
 'Adventure',
 'Animation',
 'Sci-Fi',
 'Romance',
 'Crime',
 'Documentary',
 'Action',
 'Children',
 'War',
 'Film-Noir',
 '(no genres listed)',
 'Mystery',
 'Western',
 'Horror']

In [50]:
len(genre_columns)

20

In [51]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]


In [52]:
for j in genre_columns:
    movies[j] = 0
movies.head()

Unnamed: 0,movieId,title,genres,Comedy,IMAX,Thriller,Musical,Drama,Fantasy,Adventure,...,Crime,Documentary,Action,Children,War,Film-Noir,(no genres listed),Mystery,Western,Horror
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),"[Comedy, Romance]",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Father of the Bride Part II (1995),[Comedy],0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
movies.shape[0]

58098

In [54]:
# Iterating over every list to create and fill values into columns.
       
for i in range(movies.shape[0]):
    if (i%10000)==0:
        print(i)
        print("--"*20)
    for j in genre_columns:
        if(j in movies['genres'].iloc[i]):
            movies.loc[i,j] = 1
#            print(f"found {j, 1}")

print("completed")

0
----------------------------------------
10000
----------------------------------------
20000
----------------------------------------
30000
----------------------------------------
40000
----------------------------------------
50000
----------------------------------------
completed


In [55]:
# Separting movie title and year part using split function.
split_values = movies['title'].str.split("(", n = 1, expand = True)
# split_values

In [56]:
# setting 'title' values to title part.
movies.title = split_values[0]

In [57]:
# creating 'release_year' column.
movies['release_year'] = split_values[1]


In [58]:
# Cleaning the release_year series.
movies['release_year'] = movies.release_year.str.replace(')','')


In [59]:
movies.head()


Unnamed: 0,movieId,title,genres,Comedy,IMAX,Thriller,Musical,Drama,Fantasy,Adventure,...,Documentary,Action,Children,War,Film-Noir,(no genres listed),Mystery,Western,Horror,release_year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1,0,0,0,0,1,1,...,0,0,1,0,0,0,0,0,0,1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",0,0,0,0,0,1,1,...,0,0,1,0,0,0,0,0,0,1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1995
4,5,Father of the Bride Part II,[Comedy],1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1995


In [60]:
movies.to_csv("output/movies_encoded.csv")


In [2]:
movies = pd.read_csv("output/movies_encoded.csv")
del movies['Unnamed: 0']
movies.head()


Unnamed: 0,movieId,title,genres,Comedy,IMAX,Thriller,Musical,Drama,Fantasy,Adventure,...,Documentary,Action,Children,War,Film-Noir,(no genres listed),Mystery,Western,Horror,release_year
0,1,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy...",1,0,0,0,0,1,1,...,0,0,1,0,0,0,0,0,0,1995
1,2,Jumanji,"['Adventure', 'Children', 'Fantasy']",0,0,0,0,0,1,1,...,0,0,1,0,0,0,0,0,0,1995
2,3,Grumpier Old Men,"['Comedy', 'Romance']",1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
3,4,Waiting to Exhale,"['Comedy', 'Drama', 'Romance']",1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1995
4,5,Father of the Bride Part II,['Comedy'],1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1995


In [3]:
# dropping 'genre' columns as it has already been one hot encoded.
del movies['genres']
movies.head()


Unnamed: 0,movieId,title,Comedy,IMAX,Thriller,Musical,Drama,Fantasy,Adventure,Animation,...,Documentary,Action,Children,War,Film-Noir,(no genres listed),Mystery,Western,Horror,release_year
0,1,Toy Story,1,0,0,0,0,1,1,1,...,0,0,1,0,0,0,0,0,0,1995
1,2,Jumanji,0,0,0,0,0,1,1,0,...,0,0,1,0,0,0,0,0,0,1995
2,3,Grumpier Old Men,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
3,4,Waiting to Exhale,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
4,5,Father of the Bride Part II,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1995


In [4]:
splited_years = movies['release_year'].str.split("(", n = 1, expand = True)
#splited_years.fillna(0)
splited_years.head()


Unnamed: 0,0,1
0,1995,
1,1995,
2,1995,
3,1995,
4,1995,


In [5]:
splited_years[1] = splited_years[1].replace('nan', np.nan).fillna(0)
splited_years.head()


Unnamed: 0,0,1
0,1995,0
1,1995,0
2,1995,0
3,1995,0
4,1995,0


In [6]:
splited_years[1][29]


'1995'

In [7]:
splited_years[0][29]


'Yao a yao yao dao waipo qiao '

In [8]:
print(splited_years[1][27])


0


In [9]:
print(splited_years[0][27])


1995


In [10]:
for i in range(len(splited_years)):
    if (splited_years[1][i] !=0):
#        print(i, splited_years[0][i], splited_years[1][i])
        splited_years[0][i]=splited_years[1][i]


In [11]:
splited_years[0][29]


'1995'

In [12]:
movies['release_year_cleaned'] = splited_years[0]
movies.head(100)


Unnamed: 0,movieId,title,Comedy,IMAX,Thriller,Musical,Drama,Fantasy,Adventure,Animation,...,Action,Children,War,Film-Noir,(no genres listed),Mystery,Western,Horror,release_year,release_year_cleaned
0,1,Toy Story,1,0,0,0,0,1,1,1,...,0,1,0,0,0,0,0,0,1995,1995
1,2,Jumanji,0,0,0,0,0,1,1,0,...,0,1,0,0,0,0,0,0,1995,1995
2,3,Grumpier Old Men,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1995,1995
3,4,Waiting to Exhale,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1995,1995
4,5,Father of the Bride Part II,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1995,1995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,97,Hate,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,"Haine, La (1995",1995
96,98,Shopping,0,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1994,1994
97,99,Heidi Fleiss: Hollywood Madam,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1995,1995
98,100,City Hall,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1996,1996


In [13]:
del movies['release_year']
#movies.head(100)


In [198]:
movies.to_csv("output/movies_encoded_v2.csv")


In [8]:
movies = pd.read_csv("output/movies_encoded_v2.csv")
del movies['Unnamed: 0']
movies.head()


Unnamed: 0,movieId,title,Comedy,IMAX,Thriller,Musical,Drama,Fantasy,Adventure,Animation,...,Documentary,Action,Children,War,Film-Noir,(no genres listed),Mystery,Western,Horror,release_year_cleaned
0,1,Toy Story,1,0,0,0,0,1,1,1,...,0,0,1,0,0,0,0,0,0,1995
1,2,Jumanji,0,0,0,0,0,1,1,0,...,0,0,1,0,0,0,0,0,0,1995
2,3,Grumpier Old Men,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
3,4,Waiting to Exhale,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
4,5,Father of the Bride Part II,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1995


In [9]:
# Getting the rating given by a user to a movie.
def get_rating_(userid, movieid):
    return (ratings.loc[(ratings.userId==userid) & (ratings.movieId == movieid),'rating'].iloc[0])


In [10]:
# Getting the list of all movie ids the specified user has rated.
def get_movieids_(userid):
    return (ratings.loc[(ratings.userId==userid),'movieId'].tolist())


In [11]:
# Getting the movie titles against the movie id.
def get_movie_title_(movieid):
    return (movies.loc[(movies.movieId == movieid),'title'].iloc[0])


In [12]:
def distance_similarity_score(user1,user2):
    '''
    user1 & user2 : user ids of two users between which similarity        score is to be calculated.
    '''
    # Count of movies watched by both the users.
    both_watch_count = 0
    for element in ratings.loc[ratings.userId==user1,'movieId'].tolist():
        if element in ratings.loc[ratings.userId==user2,'movieId'].tolist():
            both_watch_count += 1
    if both_watch_count == 0 :
        return 0
    
    # Calculating distance based similarity between both the users.
    distance = []
    for element in ratings.loc[ratings.userId==user1,'movieId'].tolist():
        if element in ratings.loc[ratings.userId==user2,'movieId'].tolist():
            rating1 = get_rating_(user1,element)
            rating2 = get_rating_(user2,element)
            distance.append(pow(rating1 - rating2, 2))
    total_distance = sum(distance)
    
    # Adding one to the denominator to avoid divide by zero error.
    return 1/(1+sqrt(total_distance))



In [13]:
print('Distance based similarity between user ids 101 & 3107: {} '.format(distance_similarity_score(101,3107)))


Distance based similarity between user ids 101 & 3107: 0.11787845042814334 


In [14]:
def pearson_correlation_score(user1,user2):
    '''
    user1 & user2 : user ids of two users between which similarity score is to be calculated.
    '''
    # A list of movies watched by both the users.
    both_watch_count = []
    
    # Finding movies watched by both the users.
    for element in ratings.loc[ratings.userId==user1,'movieId'].tolist():
        if element in ratings.loc[ratings.userId==user2,'movieId'].tolist():
            both_watch_count.append(element)
    
    # Returning '0' correlation for bo common movies.
    if len(both_watch_count) == 0 :
        return 0
    
    # Calculating Co-Variances.
    rating_sum_1 = sum([get_rating_(user1,element) for element in both_watch_count])
    rating_sum_2 = sum([get_rating_(user2,element) for element in both_watch_count])
    rating_squared_sum_1 = sum([pow(get_rating_(user1,element),2) for element in both_watch_count])
    rating_squared_sum_2 = sum([pow(get_rating_(user2,element),2) for element in both_watch_count])
    product_sum_rating = sum([get_rating_(user1,element) * get_rating_(user2,element) for element in both_watch_count])
    
    # Returning pearson correlation between both the users.
    numerator = product_sum_rating - ((rating_sum_1 * rating_sum_2) / len(both_watch_count))
    denominator = sqrt((rating_squared_sum_1 - pow(rating_sum_1,2) / len(both_watch_count)) * (rating_squared_sum_2 - pow(rating_sum_2,2) / len(both_watch_count)))
    
    # Handling 'Divide by Zero' error.
    if denominator == 0:
        return 0
    return numerator/denominator


In [15]:
print('Pearson Corelation between user ids 101 & 3107: {}'.format(pearson_correlation_score(101,3107)))


Pearson Corelation between user ids 101 & 3107: -0.11417114183449117


In [16]:
def most_similar_users_(user1, number_of_users, index, metric='pearson'):
    '''
    user1 : Targeted User
    number_of_users : number of most similar users you want to user1.
    metric : metric to be used to calculate inter-user similarity score. ('pearson' or else)
    '''
    # Getting distinct user ids.
    user_ids = ratings.userId.unique().tolist()
    
    sid=index
    eid=index+10
    
    # Getting similarity score between targeted and every other suer in the list(or subset of the list).
    if(metric == 'pearson'):
        similarity_score = [(pearson_correlation_score(user1,nth_user),nth_user) for nth_user in user_ids[sid:eid] if nth_user != user1]
    else:
        similarity_score = [(distance_similarity_score(user1,nth_user),nth_user) for nth_user in user_ids[sid:eid] if nth_user != user1]
    
    # Sorting in descending order.
    similarity_score.sort()
    similarity_score.reverse()
    
    # Returning the top most 'number_of_users' similar users. 
    return similarity_score[:number_of_users]


In [18]:
print("Two most similar users to the targetegetd u1=101 (scores of similarity and # of similar users)", most_similar_users_(101,2, 100))

Two most similar users to the targetegetd u1=101 (scores of similarity and # of similar users) [(0.6286185570937116, 107), (0.4221158824088695, 103)]


In [35]:
#print("5-similar users to u1=101 from 11 to 20 (scores_sim and # of sim_users)", most_similar_users_(101,5))


5-similar users to u1=101 from 11 to 20 (scores_sim and # of sim_users) [(1.0000000000000018, 15), (0.11558714512987596, 18), (0, 20), (0, 17), (0.0, 16)]


In [21]:
sim_users_ids = most_similar_users_(101, 2, 100)


In [22]:
print(sim_users_ids, type(sim_users_ids))


[(0.6286185570937116, 107), (0.4221158824088695, 103)] <class 'list'>


In [23]:
sim_users_ids[0]


(0.6286185570937116, 107)

In [24]:
sim_users_ids[0][1]


107

In [25]:
sindex = sim_users_ids[0][1]
print(sindex)


107


In [26]:
# user_ids = ratings.userId.unique().tolist()
# user = sindex
# print("most similar user_id from 10 to 20 is", user)


In [27]:
# llength = len(user_ids)
# #random_seed = 42
# sin = random.randint(0, llength)
# print(sin, llength)


In [28]:
def get_most_similar_users_(userid, index, user_ids):
#    user_ids = ratings.userId.unique().tolist()
    ratings_suser_ids = []
    llength = len(user_ids)
    sin = random.randint(0, llength-10)
#    user_ids = ratings.userId.unique().tolist()[sin:sin+index]
    
    for i in range(sin,sin+index,10): 
        print(f"at step# {i}, strating from {sin}")
        sim_users_ids = most_similar_users_(userid, 2, i)
        sindex = sim_users_ids[0][1]
        print("sim_score and it's user_id ", sim_users_ids[0][0], sindex)
        
        ratings_suser_ids.append(user_ids[sindex]-1)
    
    return ratings_suser_ids


In [30]:
user_ids = ratings.userId.unique().tolist()
user_list = get_most_similar_users_(101, 20, user_ids)
print("most similar user_ids for u1=101 are", user_list)


at step# 53324, strating from 53324
sim_score and it's user_id  0.42257712736425823 53333
at step# 53334, strating from 53324
sim_score and it's user_id  0.5095246653650684 53336
most similar user_ids for u1=101 are [53333, 53336]


In [31]:
# user_ids = ratings.userId.unique().tolist()
# user_list = get_most_similar_users_(101, 50, user_ids)
# print("most similar user_ids for u1=101 are", user_list)


In [32]:
# for i in range(len(user_list)):
# #    for user in user_ids[11:20]:
#     user = user_list[i]
#     print(user)

In [33]:
#get_most_similar_users_(userid, index, user_ids):
#llength = len(user_ids)
# sin = random.randint(0, llength-10)
# ratings.userId.unique().tolist()[sin:sin+10]


In [34]:
def get_recommendation_(userid, steps):
    
    user_ids = ratings.userId.unique().tolist()
    
    user_list = get_most_similar_users_(userid, steps*10, user_ids)
    
    print("length of list: ", len(user_list), user_list)
        
    total = {}
    similariy_sum = {}
    
    for user in user_list:
        print("user_id ", user)
        
        # not comparing the user to itself (obviously!)
        if user == userid:
            continue
        
        # Getting similarity score between the users.
        score = pearson_correlation_score(userid,user)
        print("score ", score)
        
        # not considering users having zero or less similarity score.
        if score <= 0:
            continue
        
        # Getting weighted similarity score and sum of similarities between both the users.
        for movieid in get_movieids_(user):
            # Only considering not watched/rated movies
            if movieid not in get_movieids_(userid) or get_rating_(userid,movieid) == 0:
       #         print("entered 'if' under main_ ")
                total[movieid] = 0
                total[movieid] += get_rating_(user,movieid) * score
                similariy_sum[movieid] = 0
                similariy_sum[movieid] += score
    
    # Normalizing ratings
    ranking = [(tot/similariy_sum[movieid],movieid) for movieid,tot in total.items()]
    ranking.sort()
    ranking.reverse()
    
    # Getting movie titles against the movie ids.
    recommendations = [get_movie_title_(movieid) for score,movieid in ranking]
    return recommendations[:10]


In [35]:
print("Found reccomendations for u1=5333 with 2 steps at random start: ", get_recommendation_(5333, 2))


at step# 204172, strating from 204172
sim_score and it's user_id  0.8427009716003844 204182
at step# 204182, strating from 204172
sim_score and it's user_id  1.0 204192
length of list:  2 [204182, 204192]
user_id  204182
score  0.8427009716003844
user_id  204192
score  1.0
Found reccomendations for u1=5333 with 2 steps at random start:  ['World Is Not Enough, The ', 'Sleepy Hollow ', 'Fight Club ', 'Sixth Sense, The ', 'Matrix, The ', 'Saving Private Ryan ', 'Mask of Zorro, The ', 'Rain Man ', 'Armageddon ', 'Hunt for Red October, The ']


In [46]:
steps=2
userin = int(input("Enter the user id to whom you want to recommend : "))

print(f"Found reccomendations for user {userin} with 2 steps at random start: {get_recommendation_(userin, 2)}")


Enter the user id to whom you want to recommend : 222
at step# 168769, strating from 168769
sim_score and it's user_id  1.0 168770
at step# 168779, strating from 168769
sim_score and it's user_id  0.4073091397134609 168782
length of list:  2 [168770, 168782]
user_id  168770
score  1.0
user_id  168782
score  0.4073091397134609
Found reccomendations for user 222 with 2 steps at random start: ['Grand Seduction, The ', 'Horrible Bosses ', 'Silent Hill ', 'Jacket, The ', 'The Butterfly Effect ', 'Pianist, The ', 'Equilibrium ', 'Beautiful Mind, A ', 'Scarface ', 'Dogma ']


In [42]:
#print(get_movieids_(77))


In [39]:
# Getting the list of all user_ids the specified movie has been rated by.
def get_userids_(movieid):
    return (ratings.loc[(ratings.movieId==movieid),'userId'].tolist())


In [40]:
# Getting the movie title with specified movieid.
def get_movietitle_(movieid):
    return (movies.loc[(movies.movieId==movieid),'title'].iloc[0])


In [41]:
print(get_movietitle_(1091))


Weekend at Bernie's 


# KFold Model

In [2]:
# Dependencies

import scipy.stats
import scipy.spatial

#from sklearn.cross_validation import KFold
from sklearn.model_selection import KFold
import random

from sklearn.metrics import mean_squared_error
#from math import sqrt
import math
import warnings
import sys

In [6]:
# Getting number of users and movies from the dataset.
user_ids = ratings.userId.unique().tolist()
movie_ids = ratings.movieId.unique().tolist()
tag_ids = tags.movieId.unique().tolist()
print('Number of Users: {}'.format(len(user_ids)))
print('Number of Movies: {}'.format(len(movie_ids)))
print('Number of Movies Tags: {}'.format(len(tag_ids)))

Number of Users: 283228
Number of Movies: 53889
Number of Movies Tags: 45981


In [7]:
warnings.simplefilter("error")

# users = 6040
# items = 3952
users = len(user_ids)
items = len(movie_ids)


In [8]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264


In [240]:
#ratings.to_csv("output/ratings_hs.csv", header=False, index=False)

In [241]:
# def readingFile(filename):
#     f = open(filename,"r")
# #    print(f[])
#     data = []
#     for row in f:
#         r = row.split(',')
#         print(r, r[0], r[1], r[2])
#         e = [int(r[0]), int(r[1]), int(r[2])]
#         data.append(e)
#     return data

#data[2][0]

In [9]:
r0 = ratings['userId'].astype('int').to_list()
r1 = ratings['movieId'].astype('int').to_list()
r2 = round(ratings['rating']).astype('int').to_list()
r3 = ratings['rating'].astype('float').to_list()

In [10]:
print(r2[2], r3[2])

2 1.5


In [24]:
data = [r0, r1, r2]
print(type(data), type(data[0]))

<class 'list'> <class 'list'>


In [25]:
data[2][0]

4

In [26]:
len(data[2])

27753444

In [27]:
len(data)

3

In [28]:

Mat = np.zeros((users,items))

In [29]:
print(Mat)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [30]:
for e in data:
    print(e[0])
    Mat[e[0]-1][e[1]-1] = e[2]
    
print(Mat)

1
307
4
[[1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [31]:
#print(Mat)

In [305]:
# item_similarity_cosine = np.zeros((items,items))
# print(item_similarity_cosine)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [43]:
#data[:1]
range(items)

In [None]:
for item in range(items):
#    print(item1)
    item1 = int(item)
#    print(item1, type(item1))
    print(data[:item1])
#data[:,item1]

In [None]:
def similarity_item(data):
    print "Hello"
#f_i_d = open("sim_item_based.txt","w")
    item_similarity_cosine = np.zeros((items,items))
    item_similarity_jaccard = np.zeros((items,items))
    item_similarity_pearson = np.zeros((items,items))
    for item1 in range(items):
        print item1
        for item2 in range(items):
            if np.count_nonzero(data[:,item1]) and np.count_nonzero(data[:,item2]):
                item_similarity_cosine[item1][item2] = 1-scipy.spatial.distance.cosine(data[:,item1],data[:,item2])
                item_similarity_jaccard[item1][item2] = 1-scipy.spatial.distance.jaccard(data[:,item1],data[:,item2])
                try:
                    if not math.isnan(scipy.stats.pearsonr(data[:,item1],data[:,item2])[0]):
                        item_similarity_pearson[item1][item2] = scipy.stats.pearsonr(data[:,item1],data[:,item2])[0]
                    else:
                        item_similarity_pearson[item1][item2] = 0
                except:
                    item_similarity_pearson[item1][item2] = 0

#f_i_d.write(str(item1) + "," + str(item2) + "," + str(item_similarity_cosine[item1][item2]) + "," + str(item_similarity_jaccard[item1][item2]) + "," + str(item_similarity_pearson[item1][item2]) + "\n")
#f_i_d.close()
    return item_similarity_cosine, item_similarity_jaccard, item_similarity_pearson


In [None]:
def crossValidation(data):
    k_fold = KFold(n=len(data), n_folds=10)

    Mat = np.zeros((users,items))
    for e in data:
        Mat[e[0]-1][e[1]-1] = e[2]

    sim_item_cosine, sim_item_jaccard, sim_item_pearson = similarity_item(Mat)
    #sim_item_cosine, sim_item_jaccard, sim_item_pearson = np.random.rand(items,items), np.random.rand(items,items), np.random.rand(items,items) 

    '''sim_item_cosine = np.zeros((items,items))
    sim_item_jaccard = np.zeros((items,items))
    sim_item_pearson = np.zeros((items,items))
    f_sim_i = open("sim_item_based.txt", "r")
    for row in f_sim_i:
        r = row.strip().split(',')
        sim_item_cosine[int(r[0])][int(r[1])] = float(r[2])
        sim_item_jaccard[int(r[0])][int(r[1])] = float(r[3])
        sim_item_pearson[int(r[0])][int(r[1])] = float(r[4])
    f_sim_i.close()'''

    rmse_cosine = []
    rmse_jaccard = []
    rmse_pearson = []

    for train_indices, test_indices in k_fold:
        train = [data[i] for i in train_indices]
        test = [data[i] for i in test_indices]

        M = np.zeros((users,items))

        for e in train:
            M[e[0]-1][e[1]-1] = e[2]

        true_rate = []
        pred_rate_cosine = []
        pred_rate_jaccard = []
        pred_rate_pearson = []

        for e in test:
            user = e[0]
            item = e[1]
            true_rate.append(e[2])

            pred_cosine = 3.0
            pred_jaccard = 3.0
            pred_pearson = 3.0

            #item-based
            if np.count_nonzero(M[:,item-1]):
                sim_cosine = sim_item_cosine[item-1]
                sim_jaccard = sim_item_jaccard[item-1]
                sim_pearson = sim_item_pearson[item-1]
                ind = (M[user-1] > 0)
                #ind[item-1] = False
                normal_cosine = np.sum(np.absolute(sim_cosine[ind]))
                normal_jaccard = np.sum(np.absolute(sim_jaccard[ind]))
                normal_pearson = np.sum(np.absolute(sim_pearson[ind]))
                if normal_cosine > 0:
                    pred_cosine = np.dot(sim_cosine,M[user-1])/normal_cosine

                if normal_jaccard > 0:
                    pred_jaccard = np.dot(sim_jaccard,M[user-1])/normal_jaccard

                if normal_pearson > 0:
                    pred_pearson = np.dot(sim_pearson,M[user-1])/normal_pearson

            if pred_cosine < 0:
                pred_cosine = 0

            if pred_cosine > 5:
                pred_cosine = 5

            if pred_jaccard < 0:
                pred_jaccard = 0

            if pred_jaccard > 5:
                pred_jaccard = 5

            if pred_pearson < 0:
                pred_pearson = 0

            if pred_pearson > 5:
                pred_pearson = 5

            print str(user) + "\t" + str(item) + "\t" + str(e[2]) + "\t" + str(pred_cosine) + "\t" + str(pred_jaccard) + "\t" + str(pred_pearson)
            pred_rate_cosine.append(pred_cosine)
            pred_rate_jaccard.append(pred_jaccard)
            pred_rate_pearson.append(pred_pearson)

        rmse_cosine.append(sqrt(mean_squared_error(true_rate, pred_rate_cosine)))
        rmse_jaccard.append(sqrt(mean_squared_error(true_rate, pred_rate_jaccard)))
        rmse_pearson.append(sqrt(mean_squared_error(true_rate, pred_rate_pearson)))

        print str(sqrt(mean_squared_error(true_rate, pred_rate_cosine))) + "\t" + str(sqrt(mean_squared_error(true_rate, pred_rate_jaccard))) + "\t" + str(sqrt(mean_squared_error(true_rate, pred_rate_pearson)))
        #raw_input()

    #print sum(rms) / float(len(rms))
    rmse_cosine = sum(rmse_cosine) / float(len(rmse_cosine))
    rmse_pearson = sum(rmse_pearson) / float(len(rmse_pearson))
    rmse_jaccard = sum(rmse_jaccard) / float(len(rmse_jaccard))

    print str(rmse_cosine) + "\t" + str(rmse_jaccard) + "\t" + str(rmse_pearson)

    f_rmse = open("rmse_item.txt","w")
    f_rmse.write(str(rmse_cosine) + "\t" + str(rmse_jaccard) + "\t" + str(rmse_pearson) + "\n")

    rmse = [rmse_cosine, rmse_jaccard, rmse_pearson]
    req_sim = rmse.index(min(rmse))

    print req_sim
    f_rmse.write(str(req_sim))
    f_rmse.close()

    if req_sim == 0:
        sim_mat_item = sim_item_cosine

    if req_sim == 1:
        sim_mat_item = sim_item_jaccard

    if req_sim == 2:
        sim_mat_item = sim_item_pearson

    #predictRating(Mat, sim_mat_item)
    return Mat, sim_mat_item