# Collaborative Based Filtering
---

In [1]:
import os
import pandas as pd
from ast import literal_eval
import numpy as np
from wordcloud import WordCloud
import matplotlib.pyplot as plt

from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error

## General Settings

In [2]:
#Define Saving Dir
baseDir = os.getcwd()+'\\data\\PreProcessorData'

## Building Model

In [3]:
review = pd.read_csv(baseDir+'\\review.csv')
review.shape

(1660, 4)

In [4]:
model = review.pivot_table(index='User', columns='ID', values='Rating').fillna(0)
model.head()

ID,464439297,1850379265,1925374721,2374069313,3619829639,5723884103,6903008775,10148981767,10212908615,10935947527,...,6907877032033,6907881390177,6966634250337,6966653583457,6966658924641,6976031522913,6977998258273,6983550599265,6983557021793,6993170825313
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aa,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Aaron K.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Abdul A.,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Abdulla J.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Abdullah A.,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
model = model.reset_index()
model.to_csv(os.getcwd()+'\\data\\RecommenderSystem\\collaborativeBased.csv', index=False)

## Testing Model

In [6]:
model = pd.read_csv(os.getcwd()+'\\data\\RecommenderSystem\\collaborativeBased.csv', index_col='User')
model.columns = model.columns.astype(np.int64)
model.head()

Unnamed: 0_level_0,464439297,1850379265,1925374721,2374069313,3619829639,5723884103,6903008775,10148981767,10212908615,10935947527,...,6907877032033,6907881390177,6966634250337,6966653583457,6966658924641,6976031522913,6977998258273,6983550599265,6983557021793,6993170825313
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Aa,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Aaron K.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Abdul A.,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Abdulla J.,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Abdullah A.,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
preference = [
    {'ProductID': 464439297, 'Rating': 4},
    {'ProductID': 1850379265, 'Rating': 4},
    {'ProductID': 2374069313, 'Rating': 5},
    {'ProductID': 6903008775, 'Rating': 5},
    {'ProductID': 11300571975, 'Rating': 5},
    {'ProductID': 452304175135, 'Rating': 5},
    {'ProductID': 766219747425, 'Rating': 4},
    {'ProductID': 1419071258721, 'Rating': 4},
    {'ProductID': 1419080040545, 'Rating': 5},
    {'ProductID': 2028338511969, 'Rating': 5},
    {'ProductID': 4765085335649, 'Rating': 5}
]

In [8]:
def CollaborativeBased(preference):
    preference = pd.DataFrame(preference).set_index('ProductID').rename(columns={'Rating':'Target'})
    result = pd.concat([preference.T, model]).fillna(0)
    similarity = result.subtract(result.mean(axis=1))
    similarity = pd.DataFrame(data=cosine_similarity(similarity.fillna(0)), index=similarity.index, columns=similarity.index)
    result = result.loc[similarity.nlargest(10, 'Target').index]
    result = result.drop(preference.index, axis=1)
    return result.mean(axis=0).sort_values(ascending=False)[:3].index.to_list()

In [9]:
CollaborativeBased(preference)

[291859234847, 5723884103, 3959064789089]

In [10]:
def parseFeedback(feedback, preference, keep):
    try:
        for x in feedback:
            if x['Interested'] == True:
                rating = 5            
                keep.append(x['ProductID'])
            elif x['Interested'] == False:
                rating = 1
            else:
                return False
            preference.append({'ProductID':x['ProductID'], 'Rating':rating})
            return True
    except Exception as e:
        return False
    
def CollaborativeBased(preference, feedback=[]):
    
    keep = []
    if feedback and not parseFeedback(feedback, preference, keep): return False

    preference = pd.DataFrame(preference).set_index('ProductID').rename(columns={'Rating':'Target'})
    result = pd.concat([preference.T, model]).fillna(0)
    
    similarity = result.subtract(result.mean(axis=1))
    similarity = pd.DataFrame(data=cosine_similarity(similarity.fillna(0)), index=similarity.index, columns=similarity.index)
    
    result = result.loc[similarity.nlargest(10, 'Target').index]
    
    if keep != []: result = result.drop([x for x in preference.index if x not in keep], axis=1)
    else: result = result.drop(preference.index, axis=1)
    
    result = result.mean(axis=0).sort_values(ascending=False)
    return result[:3].index.to_list()

In [11]:
CollaborativeBased(preference)

[291859234847, 5723884103, 3959064789089]

In [12]:
CollaborativeBased(preference, feedback=[{'ProductID':291859234847, 'Interested':False}])

[5723884103, 3959064789089, 1925374721]

## Log Process

In [13]:
product = pd.read_csv(baseDir+'\\product.csv', index_col='ID', converters={'Tags':literal_eval})
product.head()

Unnamed: 0_level_0,Tags
ID,Unnamed: 1_level_1
2374069313,"[Africa, Blend, Central America, Chocolate, Es..."
6903008775,"[Asia & Pacific, Blend, Bold, Caramel, Chocola..."
6811920695393,"[Anaerobic, Asia & Pacific, Best Coffee, Filte..."
452304175135,"[Bestseller, Blend, Bold, Caramel, Chocolate, ..."
1419084071009,"[Blend, Bold, Chocolate, Espresso, Malaysia, M..."


In [14]:
from io import BytesIO
import base64

def collaborativeBasedDescription(result):
    description = 'The recommendation is derived from a set of user with a similar preference to you. You might find something new that you would enjoy, but do note that certain product might not get recommended due to lack of review.'
    wordList = product.loc[result].explode('Tags')['Tags'].value_counts().to_dict()
    wordCloud = WordCloud(background_color='white').generate_from_frequencies(wordList).to_image()
    image = BytesIO()
    wordCloud.save(image, format='png')
    return description, image.getvalue()

In [16]:
description, image = collaborativeBasedDescription([5723884103, 3959064789089, 1925374721])