In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import math 
import random
import time as t

from pandas import DataFrame, merge
from sklearn.mixture import GaussianMixture
from sklearn.modela_selection import StratifiedKFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn import preprocessing
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, StratifiedKFold

pd.options.display.max_columns = None
import warnings; warnings.simplefilter('ignore')

In [3]:
# Getting user data
users = pd.read_csv('customer.csv')
users.head()

Unnamed: 0,userId,avg_rating,avg_rating_customer,quantity_customer,seen_popularity,seen_rating,gender,age
0,1,2.55,2.55,20,45.55,3.562529,female,39
1,2,3.486842,3.486842,76,106.578947,3.536646,male,35
2,3,3.568627,3.568627,51,116.843137,3.716531,male,36
3,4,4.348039,4.348039,204,72.480392,3.610074,female,24
4,5,3.91,3.91,100,92.28,3.567149,female,32


### Getting item table

In [4]:
# Getting item data
items = pd.read_csv('items.csv',encoding='latin1')
items.head()

Unnamed: 0,movieId,avg_rating_item,quantity_item,likability,title,year,genre,cost
0,31,3.178571,42,-0.277838,Toy Story,1995,Animation,149
1,1029,3.702381,42,0.156436,Jumanji,1995,Adventure,583
2,1061,3.545455,33,0.082347,Grumpier Old Men,1995,Romance,450
3,1129,3.3125,48,-0.160192,Waiting to Exhale,1995,Comedy,839
4,1172,4.26087,46,0.646038,Father of the Bride Part II,1995,Comedy,846


## 1. Partitioning items into Head and Tail
Break the total items I into head and tail part. We have selected breakdown point $\alpha$ = 10 where $\alpha$ means items rating frequency > $\alpha$. In this case, the $\alpha$ is 10. The item quantity 
- '>30' are labeled as 'head' and 
- <=30 are labeled as 'tail' 

In [5]:
# Mark movie popular or tail
def cal_item_cat(row,break_point):
    popularity = int(row['quantity_item'])
    if(popularity > break_point):
        return 'head'
    return 'tail'

def assign_item_cat(df,break_point):
    df['item_cat'] = df.apply(lambda row: cal_item_cat(row,break_point), axis = 1)
    return df

In [6]:
# The tail breakdown is at 30
items = assign_item_cat(items,30)
items['item_cat'].value_counts()

tail    8245
head     821
Name: item_cat, dtype: int64

## 2. Cluster tail items

After identifying head and tail item, tail items T are clusterd using Expectation Maximization (EM) Clustering. EM is more likely to K-mean cluster model, but the advantage of EM is that it maximize the likelihood of data distribution in cluster by estimating the means and standard deviations(SD) for each cluster. Clustering featurs considered are
- I_aver_rating 
- I_popularity
- I_likablility 
from the derived variable list.

Algorithm maps each tail T items into the 3-dimensional Euclidian space formed from above three variables and performs the clustering on that. 

Currently we have considered 10 number of clusters to be formed.

In [7]:
# getting deta with 3 drived variables
train_df = items[items['item_cat'] == 'tail'][['movieId','avg_rating_item','quantity_item','likability']]

In [8]:
#train_df.head()

In [9]:
gmm = GaussianMixture(n_components=10)
x = gmm.fit(train_df)

In [10]:
#pd.DataFrame({'cluster': gmm.predict(train_df)}).cluster.value_counts()

In [11]:
# Storing clusters
items.loc[items.item_cat == 'tail','cluster'] = gmm.predict(items[items.item_cat == 'tail']
                                                             [['movieId','avg_rating_item',
                                                               'quantity_item','likability']]).astype(int)

In [12]:
# Assigining head items as 11th cluster
items.loc[items.item_cat == 'head','cluster'] = 10
items['cluster'] = items['cluster'].astype(int)

In [13]:
items.head(10)

Unnamed: 0,movieId,avg_rating_item,quantity_item,likability,title,year,genre,cost,item_cat,cluster
0,31,3.178571,42,-0.277838,Toy Story,1995,Animation,149,head,10
1,1029,3.702381,42,0.156436,Jumanji,1995,Adventure,583,head,10
2,1061,3.545455,33,0.082347,Grumpier Old Men,1995,Romance,450,head,10
3,1129,3.3125,48,-0.160192,Waiting to Exhale,1995,Comedy,839,head,10
4,1172,4.26087,46,0.646038,Father of the Bride Part II,1995,Comedy,846,head,10
5,1263,3.864583,48,0.316899,Heat,1995,Action,533,head,10
6,1287,3.891304,46,0.391538,Sabrina,1995,Comedy,419,head,10
7,1293,3.978261,46,0.450087,Tom and Huck,1995,Action,1694,head,10
8,1339,3.298077,52,-0.16306,Sudden Death,1995,Action,508,head,10
9,1343,3.74359,39,0.207635,GoldenEye,1995,Adventure,1142,head,10


In [14]:
df = pd.read_csv('movie_data_final.csv',encoding='latin1')
data = pd.merge(df,items[['movieId','cluster','item_cat']], on='movieId',how='inner')
len(data)

100004

## 3. Training Data Mining model for each cluster

Next we use Random Forest Regression(RF) to predict new or unkown ratings. For each cluster this data mining models are trained and store to predict ratings of cluster items. 

We have performed grid serch to opt best parameter for the RF with 5 fold cross validation. And after that best performing parameters are chosen to train the model.

In [15]:
col_list = ['userId','age','movieId','cost','avg_rating_item','quantity_item','likability',
       'avg_rating_customer','quantity_customer','seen_popularity','seen_rating']

def train_models(df):
    model = []
    for i in range(0,11):
        features_df = df[df.cluster == i][col_list]
        target_df = df[df.cluster == i]['rating']
        features_df = pd.get_dummies(features_df)
        X_train, X_test, y_train, y_test = train_test_split(features_df,target_df,
                                                               test_size=0.3,shuffle=True)
        tuned_parameters ={'max_features': ['sqrt','log2','auto'],
                              'max_depth': [30,25,20,10,5],}
        t1 = t.time()
        gridsearch_rf = GridSearchCV(RandomForestRegressor(),tuned_parameters,cv =5, 
                                         scoring = 'neg_mean_squared_error')
        gridsearch_rf.fit(X_train,y_train)
        
        print("##############################################################################")
        print("Gridsearch is completed for RF_",i)
        print("Time taken:",round((t.time() - t1)/60,3),"m")
        print("---------------------------------------------------------------")
        print("Best parameter choosen: {}".format(gridsearch_rf.best_params_))
        print("Mean Sequard Error: {}".format(gridsearch_rf.best_score_))
        print("Fitting on the entire training dataset using the best parameter found....")
        model.append(gridsearch_rf.best_estimator_)
        model[i].fit(X_train,y_train)
        print("Fitting completed.")
        print(model[i])
        print("##############################################################################")
    
    return model    

In [16]:
models = train_models(data)

##############################################################################
Gridsearch is completed for RF_ 0
Time taken: 0.981 m
---------------------------------------------------------------
Best parameter choosen: {'max_depth': 10, 'max_features': 'log2'}
Mean Sequard Error: -0.691241438176717
Fitting on the entire training dataset using the best parameter found....
Fitting completed.
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
           max_features='log2', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
##############################################################################
##############################################################################
Gridsearch is completed for RF_ 1
Time taken: 0.075 m
------------

##############################################################################
Gridsearch is completed for RF_ 10
Time taken: 2.245 m
---------------------------------------------------------------
Best parameter choosen: {'max_depth': 10, 'max_features': 'sqrt'}
Mean Sequard Error: -0.7113278601785751
Fitting on the entire training dataset using the best parameter found....
Fitting completed.
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
           max_features='sqrt', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)
##############################################################################


## 4. COLLABORATIVE FILTERING 

Next we apply collaborative filtering. In that first we will find 10 most similar user and items they have rated. After that we will drop common items rated by user for whome top N list you want and his other similar users. We will predict ratings for remaning items and retrive top N list from that. 

### Pearson Correlation

Pearson Correlation Score:

It is more sophisticated method to similarity between user intrest then simple cosine similarity. The correlation coefficient is a measure of how well two sets of data fit on a straight line. The formula for this is more complicated that the Euclidean distance score, but it tends to give better results in situations where the data isn’t well normalized like our present data set.

$$ S_x = \dfrac{\sum x^2 - (\sum x)^2}{n}$$<br>
$$ S_y = \dfrac{\sum y^2 - (\sum y)^2}{n}$$<br>
$$ S_{xy} = \dfrac{\sum xy - (\sum x)(\sum x)}{n}$$<br>
$$score = \dfrac{S_{xy}}{\sqrt{S_{xx}S_{yy}}}$$

Implementation for the Pearson correlation score first finds the items rated by both users. It then calculates the sums and the sum of the squares of the ratings for the both users and calculates the sum of the products of their ratings. Finally, it uses these results to calculate the Pearson correlation coefficient.Unlike the distance metric, this formula is not intuitive, but it does tell you how much the variables change together divided by the product of how much they alter individually.

In [17]:
def pearson_correlation(user1,row,df):
    # Getting user
    user2 = int(row['userId'])
    
    # To get both rated items
    u1_r = df[df['userId'] == user1][['movieId','rating']]
    u2_r = df[df['userId'] == user2][['movieId','rating']]
    both_rated = u1_r.merge(u2_r,on='movieId',how='inner')
    
    number_of_ratings = len(both_rated)
    # Checking for number of ratings in common
    if number_of_ratings == 0:
        return 0
    
    # Add up all the preferences of each user
    user1_preferences_sum = both_rated['rating_x'].sum()
    user2_preferences_sum = both_rated['rating_y'].sum()
    
    # Sum up the squares of preferences of each user
    user1_square_preferences_sum = (both_rated['rating_x'] ** 2).sum()
    user2_square_preferences_sum = (both_rated['rating_y'] ** 2).sum()
    
    # Sum up the product value of both preferences for each item
    product_sum_of_both_users = (both_rated['rating_x'] * both_rated['rating_y']).sum()
    
    # Calculate the pearson score
    numerator_value = product_sum_of_both_users - (user1_preferences_sum*user2_preferences_sum/number_of_ratings)
    denominator_value = math.sqrt((user1_square_preferences_sum - pow(user1_preferences_sum,2)/number_of_ratings)
                                 * (user2_square_preferences_sum -pow(user2_preferences_sum,2)/number_of_ratings))
    
    if denominator_value == 0:
        return 0
    else:
        r = numerator_value/denominator_value
        return r

def most_similar_users(user_df,rating_df,person,number_of_users):
    
    # returns the number_of_users (similar persons) for a given specific person.
    correlation_data = user_df.copy()
    correlation_data = correlation_data[correlation_data.userId != person]
    correlation_data['relation_score'] = correlation_data.apply(lambda row: pearson_correlation(person,row,rating_df),
                                                                axis = 1)
    #print(data.head())
    
    # Sort the similar persons so that highest scores person will appear at the first
    correlation_data = correlation_data.sort_values(by='relation_score', ascending=False)
    return correlation_data[0:number_of_users].userId  #[['userId','relation_score']]

In [18]:
#most_similar_users(users,data,1,10)

In [19]:
def predict_rating(row):
    i = int(row['cluster'])
    #print(row[col_list].values.tolist())
    return models[i].predict([row[col_list].values.tolist()])[0]
    
    
def get_ratings(df):
    df['rating'] = df.apply(lambda row: predict_rating(row),axis = 1)
    return df

In [84]:
def user_recommendation(person):
    
    # Getting top 10 similar user 
    sim_users = most_similar_users(users,data,person,10)
    # Getting items that are rated by similar users 
    item_list = data.loc[data['userId'].isin(sim_users)]
    
    # Selecting only items that user haven't seen yet from item_list
    user_items = data[data.userId == person].movieId.tolist()
    recom_item_list = item_list.loc[~(item_list['movieId'].isin(user_items))].movieId.unique().tolist() 
    recom_item_data = items.loc[items['movieId'].isin(recom_item_list)] 
    
    # Predicting ratings for items
    recomm_data = recom_item_data.assign(key=1).merge(users[users['userId'] == person].assign(key=1)).drop('key', 1)
    recomm_data = get_ratings(recomm_data)
    
    # Sort data by ratings 
    recomm_data = recomm_data.sort_values(by='rating', ascending=False)
    return recomm_data#recomm_data[0:number_of_recommendation][['movieId','cost','item_cat','cluster','genre']]
    

In [92]:
#print("Recommendation for user 2")
#rec = user_recommendation(2).reset_index(drop=True)
#rec.head(10)
#len(rec)

## 5. CEAM FILTERING

1. We will be creating a database with average and median prices of items within different categories. In order to have a comparison of particular user’s spending with average or median of different categories, we will need data of previous purchases by a customer.
2. Now in order to compare it with previous purchases of a customer, we need to make calculations of Average Ratio. Average Ratio is the proportion of price of the items bought by users and the average price of the categories of items that the customer has bought.

In [55]:
items['genre'] = items['genre'].fillna('unknown')
data['genre'] = data['genre'].fillna('unknown')

In [56]:
genre_df = pd.DataFrame({'genre' : items.genre.unique()})

In [57]:
def cal_avg_price(row,cost_data):
    cat = row['genre']
    #print(cat)
    cost_data = cost_data[cost_data.genre == cat]
    return cost_data['cost'].sum()/len(cost_data)

def avg_cat_price(cat_data,cost_data):
    cat_data['avg_cost'] = cat_data.apply(lambda row: cal_avg_price(row,cost_data), axis=1)
    return cat_data

In [58]:
genre_df = avg_cat_price(genre_df,items)

In [59]:
def cal_purchase_score(row):
    category = row['genre']
    price = int(row['cost'])
    #print(genre_df[genre_df.genre == category]['avg_cost'])
    avg_cost = int(genre_df[genre_df.genre == category]['avg_cost'])
    return price/avg_cost

def purchase_score(row):
    temp = data[data.userId == int(row['userId'])]
    temp['purchase_score'] = temp.apply(lambda row: cal_purchase_score(row), axis=1)
    return temp.purchase_score.sum()/len(temp)

def avg_purchase(df):
    df['avg_purchase_score'] = df.apply(lambda row: purchase_score(row), axis=1)
    return df

In [60]:
# Calculate average spending ration for each user
users = avg_purchase(users)

In [61]:
def item_score(df):
    df['score'] = df.apply(lambda row: cal_purchase_score(row), axis=1)
    return df

In [94]:
def top_N(user,N):
    # Getting collobrativ filtered data
    rec = user_recommendation(user).reset_index(drop=True)
    
    # Getting cost ratio of items to be reommended 
    rec = item_score(rec)
    num = int(users[users.userId == 2].avg_purchase_score)
    rec = rec[rec['score'].between(num-1,num+1,inclusive=True)]
    
    # Getting top-N items
    top_N = (rec.sort_values(by='rating', ascending=False)).reset_index(drop=True).head(N)
    print(top_N['title'])
    print("------------------------------")
    print(top_N.item_cat.value_counts())

In [93]:
# making top 10 recommendation for userid 661
top_N(661,10)

0    Batman: Mask of the Phantasm
1              Send Me No Flowers
2                    Doctor Sleep
3              The Usual Suspects
4               Leaving Las Vegas
5                      Diabolique
6                         Khomreh
7                    Urban Legend
8           Come See the Paradise
9                   Anna Karenina
Name: title, dtype: object
------------------------------
tail    6
head    4
Name: item_cat, dtype: int64
