# Collaborative Filtering based on similarity

In [1]:
pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy import sparse
from matplotlib import pyplot as plt
from scipy import stats

In [3]:
pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Load the raw transaction log. The path is relative, so the CSV must sit in the
# notebook's working directory.
trans_df = pd.read_csv("transactions_train.csv")

In [3]:
trans_df.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


In [4]:
# Discard transactions that carry no customer id; a new frame is bound to trans_df.
trans_df = trans_df.dropna(subset=['customer_id'])

In [5]:
trans_df.shape

(31788324, 5)

## Dataset Filtering

In [None]:
# Every transaction row is a purchase, so flag it with a constant 1.
trans_df['Bought'] = 1

# Fixed reference date used to turn the age of a purchase into a recency weight.
REFERENCE_DATE = pd.Timestamp(2020, 12, 31)

trans_df["t_dat"] = pd.to_datetime(trans_df["t_dat"])

# pop_factor = 1 / (whole days between the purchase and REFERENCE_DATE), so more
# recent purchases get a larger weight. In this notebook pop_factor plays the role
# of the "rating" in the collaborative-filtering steps below.
# Computed on the whole column at once instead of a per-row Python lambda.
# NOTE: a purchase dated exactly on REFERENCE_DATE yields inf here (the row-wise
# version raised ZeroDivisionError instead).
days_before_reference = (REFERENCE_DATE - trans_df["t_dat"]).dt.days
trans_df['pop_factor'] = 1 / days_before_reference

In [None]:
# Only transactions on or after this date are kept for the recommender.
start_date = pd.Timestamp(2019, 7, 1)

# Make sure t_dat is a datetime column before comparing (no-op if it already is).
trans_df["t_dat"] = pd.to_datetime(trans_df["t_dat"])

# Filter transactions by date
anime_ratings_df = trans_df.loc[trans_df["t_dat"] >= start_date]

In [None]:
# Keep only articles that were bought often enough inside the date window:
# an article survives when it has MORE than 10 purchase rows (count > 10).
article_bought_count = (
    anime_ratings_df.groupby('article_id')['t_dat']
    .count()
    .reset_index(name='count')
)
is_popular_article = article_bought_count['count'] > 10
most_bought_articles = article_bought_count.loc[is_popular_article, 'article_id'].values
anime_ratings_df = anime_ratings_df[anime_ratings_df['article_id'].isin(most_bought_articles)]

In [None]:
# Keep only customers that bought enough of the remaining articles:
# a customer survives when they have MORE than 100 purchase rows (count > 100).
customer_bought_count = (
    anime_ratings_df.groupby('customer_id')['article_id']
    .count()
    .reset_index(name='count')
)
is_heavy_buyer = customer_bought_count['count'] > 100
most_bought_customers = customer_bought_count.loc[is_heavy_buyer, 'customer_id'].values
anime_ratings_df = anime_ratings_df[anime_ratings_df['customer_id'].isin(most_bought_customers)]

In [None]:
# Second article-level pass: after the customer filter above, keep only articles
# that still have MORE than 100 purchase rows. count() counts transaction rows,
# so repeat purchases by one customer are counted each time (not distinct customers).
article_purchase_count = (
    anime_ratings_df.groupby('article_id')['customer_id']
    .count()
    .reset_index(name='count')
)
is_kept_article = article_purchase_count['count'] > 100
most_bought_article_customers = article_purchase_count.loc[is_kept_article, 'article_id'].values
anime_ratings_df = anime_ratings_df[anime_ratings_df['article_id'].isin(most_bought_article_customers)]

# Collapse to one row per (customer, article) pair; the pair's pop_factor is the
# sum of the recency weights of all its purchases.
anime_ratings_df = (
    anime_ratings_df.groupby(['customer_id', 'article_id'])['pop_factor']
    .sum()
    .reset_index()
)

In [None]:
#anime_ratings = trans_df

#customer_id==>user_id
#article_id ==>anime_id
#rating==>pop_factor

In [None]:
# Drop rows whose pop_factor equals the -1 sentinel and keep the three columns the
# recommender needs. The mask is built from anime_ratings_df itself: that frame was
# re-indexed by the groupby/reset_index step, so a mask taken from trans_df would be
# aligned on unrelated row labels.
articles_ratings = (
    anime_ratings_df
    .loc[anime_ratings_df['pop_factor'] != -1, ['customer_id', 'article_id', 'pop_factor']]
    .reset_index(drop=True)
)

In [None]:
articles_ratings

In [None]:
def encode_column(column):
    """Map the distinct values of a pandas column to consecutive integer codes.

    Codes are assigned in order of first appearance, starting at 0.

    Returns a 3-tuple:
        key_to_id -- dict mapping each distinct original value to its code
        encoded   -- numpy array holding the code of every element of ``column``
        n_keys    -- number of distinct values
    """
    distinct_values = column.unique()
    key_to_id = dict(zip(distinct_values, range(len(distinct_values))))
    encoded = np.array([key_to_id[value] for value in column])
    return key_to_id, encoded, len(distinct_values)

In [None]:
#df.rename(columns = {'old_name':'new_name'}, inplace = True)

In [None]:
def encode_df(articles_df):
    """Encode customer and article ids as consecutive integer codes.

    The input frame is left untouched; encoding happens on a copy. This keeps the
    function safe to call more than once on the same frame: every call sees the
    raw ids, so the returned lookup dicts always map raw id -> code.

    Parameters
    ----------
    articles_df : DataFrame with 'customer_id' and 'article_id' columns.

    Returns
    -------
    (encoded_df, num_users, num_anime, user_ids, anime_ids) where
        encoded_df -- copy of the input with both id columns replaced by codes
        num_users  -- number of distinct customers
        num_anime  -- number of distinct articles
        user_ids   -- dict raw customer_id -> code
        anime_ids  -- dict raw article_id -> code
    """
    encoded_df = articles_df.copy()

    anime_ids, encoded_df['article_id'], num_anime = encode_column(encoded_df['article_id'])
    user_ids, encoded_df['customer_id'], num_users = encode_column(encoded_df['customer_id'])
    return encoded_df, num_users, num_anime, user_ids, anime_ids

In [None]:
# Keep the three modelling columns and renumber the rows from 0.
articles_ratings = articles_ratings[['customer_id', 'article_id', 'pop_factor']].reset_index(drop=True)

In [None]:
articles_ratings

In [None]:
# Encode ids as integer codes. user_ids / anime_ids are the lookup dicts returned by
# encode_df (original key -> code); num_customers / num_articles are the distinct counts.
articles_df, num_customers, num_articles, user_ids, anime_ids = encode_df(articles_ratings)
print("Number of customers :", num_customers)
print("Number of articles :", num_articles)
# Display the encoded frame as the cell output.
articles_df

In [None]:
articles_df['customer_id'].unique()

In [None]:
def create_sparse_matrix(df, rows, cols, column_name="pop_factor"):
    """Build a CSC utility matrix from an encoded ratings frame.

    Row indices come from df['customer_id'], column indices from df['article_id'],
    and cell values from df[column_name]. The result has shape (rows, cols).
    """
    values = df[column_name].values
    row_index = df['customer_id'].values
    col_index = df['article_id'].values
    return sparse.csc_matrix((values, (row_index, col_index)), shape=(rows, cols))

In [None]:
# articles_df, num_customers and num_articles were produced by the encode_df call in
# the earlier cell. Calling encode_df a second time is at best redundant, and when
# its input has already been encoded it replaces user_ids / anime_ids with
# code -> code mappings, losing the way back to the raw ids.
Y = create_sparse_matrix(articles_df, num_customers, num_articles)

In [None]:
Y.shape

In [None]:
# Converts the whole sparse matrix to a dense one just to display it; the dense
# form stores every (customer, article) cell, so memory grows with rows * cols.
Y.todense()

# Similarity-Based Collaborative Filtering (scikit-learn) for Article Recommendation

In [None]:
from sklearn.neighbors import NearestNeighbors

In [None]:
len(articles_df['customer_id'])

In [None]:
# Dense customer x article matrix (rows: encoded customer ids, columns: encoded
# article ids). This stores every cell, so it relies on the filtering done earlier.
X = Y.toarray()

# Encoded id of the single article whose pop_factor is predicted for every customer.
article_id = 1
N_NEIGHBORS = 10

# Rows of X (customers) that have a non-zero pop_factor for the target article.
# Both the slice and the fitted model are the same for every customer, so they are
# built once here instead of once per loop iteration.
rater_rows = np.nonzero(X[:, article_id])[0]
X_sub = X[rater_rows, :]
model = NearestNeighbors(n_neighbors=N_NEIGHBORS, metric="cosine")
model.fit(X_sub)

is_id = []      # per customer: indices (positions within X_sub) of the nearest raters
user_idss = []  # per customer: the encoded customer id
pred = []       # per customer: mean pop_factor of those neighbours on the target article

for customer_id in range(num_customers):
    # Top-N nearest raters of this customer under cosine distance.
    neigh_ind_i = model.kneighbors(
        [X[customer_id, :]], N_NEIGHBORS, return_distance=False
    ).flatten()

    # The neighbours' average pop_factor on the target article is the prediction.
    user_idss.append(customer_id)
    is_id.append(neigh_ind_i)
    pred.append(np.mean(X_sub[neigh_ind_i, article_id]))

In [None]:
len(pred)

In [None]:
#Transform matrix to dataframes
user_idss = pd.DataFrame(user_idss)
pred = pd.DataFrame(pred) 

In [None]:
# Put the customer id in front of that customer's neighbour indices (column-wise join).
# NOTE(review): is_id is overwritten with the widened array, so running this cell a
# second time would prepend the id column again.
is_id = np.concatenate((user_idss, is_id), axis = 1)

In [None]:
# Turn the joined array (customer id + neighbour indices) into a DataFrame.
is_id = pd.DataFrame(is_id)

In [None]:
is_id.shape

# Top 10 Recommended Articles For Each Customer with Id: customer_id

In [None]:
# Name the columns: the encoded customer id followed by ten slots A1..A10.
# NOTE(review): the values in A1..A10 are the indices returned by model.kneighbors,
# i.e. row positions within X_sub (the customers with a non-zero entry for the
# target article) -- they are not article ids. Confirm this is what should be shown.
is_id.columns = ['customer_id', 'A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9', 'A10']
is_id

# Average Popularity Factor of the Top 10 Recommended Articles for Each Customer with Id: customer_id

In [None]:
# Pair every encoded customer id with its predicted value (the mean pop_factor of
# the customer's nearest neighbours on the target article) and wrap it in a DataFrame.
Predict_Avg_Pop_Factor = np.concatenate((user_idss, pred), axis = 1)
Predict_Avg_Pop_Factor = pd.DataFrame(Predict_Avg_Pop_Factor)

In [None]:
# Name the two columns and display the frame as the cell output.
Predict_Avg_Pop_Factor.columns = ['customer_id', 'Pred_Avg_pop_factor']
Predict_Avg_Pop_Factor

In [None]:
Predict_Avg_Pop_Factor.max()

In [None]:
Predict_Avg_Pop_Factor.min()

In [None]:
# Predicted value per customer; labelled so the figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(Predict_Avg_Pop_Factor['customer_id'], Predict_Avg_Pop_Factor['Pred_Avg_pop_factor'])
ax.set(
    xlabel='customer_id (encoded)',
    ylabel='Pred_Avg_pop_factor',
    title='Predicted average pop_factor per customer',
)
plt.show()

In [None]:
# Distribution of the predicted values across customers.
fig, ax = plt.subplots()
ax.hist(Predict_Avg_Pop_Factor['Pred_Avg_pop_factor'])
ax.set(
    xlabel='Pred_Avg_pop_factor',
    ylabel='number of customers',
    title='Distribution of predicted average pop_factor',
)
plt.show()

In [None]:
# Predicted values sorted in ascending order, to show how they spread across customers.
fig, ax = plt.subplots()
ax.plot(np.sort(Predict_Avg_Pop_Factor['Pred_Avg_pop_factor']))
ax.set(
    xlabel='rank (ascending)',
    ylabel='Pred_Avg_pop_factor',
    title='Sorted predicted average pop_factor',
)
plt.show()

# Question?

In [None]:
pip freeze > requirements.txt