In [1]:
import pandas as pd
import numpy as np

In [2]:
def add_ext_to_str(df, col):
    if type(col) == str:
        col = [col]
    for col in col:
        df[col] = df[col].astype(str) + '_' + col
    return df

def fill_na_with_text(df, col, add_ext=False):
    if type(col) == str:
        col = [col]
    for col in col:
        if add_ext:
            df[col] = df[col].fillna('NO_'+col)
        else:
            df[col] = df[col].fillna('NO_INFO')
    return df

def merge_col(df, col, sep=','):
    return df[col].apply(lambda x: sep.join(x), axis=1)

def generate_int_id(dataframe, id_col_name):
    """
    Generate unique integer id for users, product and answers

    Parameters
    ----------
    dataframe: Dataframe
        Pandas Dataframe for Users or Q&A. 
    id_col_name : String 
        New integer id's column name.
        
    Returns
    -------
    Dataframe
        Updated dataframe containing new id column 
    """
    new_dataframe=dataframe.assign(
        int_id_col_name=np.arange(len(dataframe))
        ).reset_index(drop=True)
    return new_dataframe.rename(columns={'int_id_col_name': id_col_name})

def generate_feature_list(dataframe, features_name):
    """
    Generate features list for mapping 

    Parameters
    ----------
    dataframe: Dataframe
        Pandas Dataframe for Users or Q&A. 
    features_name : List
        List of feature columns name avaiable in dataframe. 
        
    Returns
    -------
    List of all features for mapping 
    """
    features = dataframe[features_name].apply(
        lambda x: ','.join(x.map(str)), axis=1)
    features = features.str.split(',')
    features = features.apply(pd.Series).stack().reset_index(drop=True)
    return features

def create_features(dataframe, features_name, id_col_name):
    """
    Generate features that will be ready for feeding into lightfm

    Parameters
    ----------
    dataframe: Dataframe
        Pandas Dataframe which contains features
    features_name : List
        List of feature columns name avaiable in dataframe
    id_col_name: String
        Column name which contains id of the product or
        answer that the features will map to.
        There are two possible values for this variable.
        1. product_id_num
        2. user_id_num

    Returns
    -------
    Pandas Series
        A pandas series containing process features
        that are ready for feed into lightfm.
        The format of each value
        will be (user_id, ['feature_1', 'feature_2', 'feature_3'])
        Ex. -> (1, ['military', 'army', '5'])
    """

    features = dataframe[features_name].apply(
        lambda x: ','.join(x.map(str)), axis=1)
    features = features.str.split(',')
    features = list(zip(dataframe[id_col_name], features))
    return features

In [3]:
df_review = pd.read_csv("/home/music/Desktop/measure_model/db_cosmenet/review.csv", low_memory=False)
df_review = df_review[['ID', 'EID', 'UID', 'POINT']][df_review['ACTION'] == 1]
df_review.head(1)

Unnamed: 0,ID,EID,UID,POINT
0,161562,40039,106774,5


In [4]:
df_product = pd.read_csv("/home/music/Desktop/measure_model/db_cosmenet/product_joined.csv")
df_product = df_product[['EID', 'PRODUCT_NAME', 'BID', 'SCID', 'CID']]
df_product.head(1)

Unnamed: 0,EID,PRODUCT_NAME,BID,SCID,CID
0,13001,UV Aqua Rich Watery Essence SPF50+ PA++++,1528,248,62


In [5]:
df_product.loc[df_product['SCID'] == -1, 'SCID'] = 'NO_SUBCATEGORY'
df_product.loc[df_product['SCID'] == -2, ['SCID', 'CID']] = 'NO_CATEGORY'
df_product.loc[df_product['BID'] == -4, 'BID'] = 'NO_INFO'

df_product['BID'] = 'BID' + '_' + df_product['BID'].astype(str)
df_product['SCID'] = df_product['SCID'].astype(str)
df_product['CID'] = df_product['CID'].astype(str)

df_product['product_tag'] = merge_col(df_product, ['BID', 'SCID', 'CID'])
df_product = generate_int_id(df_product, 'product_id_num')
df_product['product_tag'] = (
    df_product['product_tag'].str.split(',').apply(set).str.join(','))

df_product.head(1)

Unnamed: 0,EID,PRODUCT_NAME,BID,SCID,CID,product_tag,product_id_num
0,13001,UV Aqua Rich Watery Essence SPF50+ PA++++,BID_1528,248,62,"248,BID_1528,62",0


In [6]:
df_user = pd.read_csv("/home/music/Desktop/measure_model/db_cosmenet/user.csv")
df_user = df_user[['ID', 'AGE_TEXT', 'PERSONAL_GENDER', 'UF_SKINFACE', 'UF_BODYTYPE', 'UF_HAIRTYPE']]
df_user.rename(columns={'ID': 'UID'}, inplace=True)
df_user.head(1)

Unnamed: 0,UID,AGE_TEXT,PERSONAL_GENDER,UF_SKINFACE,UF_BODYTYPE,UF_HAIRTYPE
0,1,31 - 35,Female,,,


In [7]:
df_user = fill_na_with_text(df_user, ['UF_SKINFACE', 'UF_BODYTYPE'])
df_user = add_ext_to_str(df_user, ['UF_SKINFACE', 'UF_BODYTYPE'])
df_user = fill_na_with_text(df_user, ['AGE_TEXT', 'PERSONAL_GENDER', 'UF_HAIRTYPE'], add_ext=True)

is_female = np.isin(df_user['PERSONAL_GENDER'], 'Female')
is_male = np.isin(df_user['PERSONAL_GENDER'], 'Male')
is_nan = np.isin(df_user['PERSONAL_GENDER'], 'NO_PERSONAL_GENDER')
df_user.loc[~is_female & ~is_male & ~is_nan, 'PERSONAL_GENDER'] = 'LGBTQ+'

df_user['user_tag'] = merge_col(df_user, ['AGE_TEXT', 'PERSONAL_GENDER', 'UF_SKINFACE', 'UF_BODYTYPE', 'UF_HAIRTYPE'])
df_user = generate_int_id(df_user, 'user_id_num')

df_user.head(1)

Unnamed: 0,UID,AGE_TEXT,PERSONAL_GENDER,UF_SKINFACE,UF_BODYTYPE,UF_HAIRTYPE,user_tag,user_id_num
0,1,31 - 35,Female,NO_INFO_UF_SKINFACE,NO_INFO_UF_BODYTYPE,NO_UF_HAIRTYPE,"31 - 35,Female,NO_INFO_UF_SKINFACE,NO_INFO_UF_...",0


In [8]:
df_merge = df_review.merge(df_product[['EID', 'product_tag', 'product_id_num']], how='inner',
    left_on='EID', right_on='EID')
df_merge = df_merge.merge(df_user[['UID', 'user_tag', 'user_id_num']], how='inner',
    left_on='UID', right_on='UID')
df_merge.head(1)

Unnamed: 0,ID,EID,UID,POINT,product_tag,product_id_num,user_tag,user_id_num
0,161561,40039,104728,5,"58,216,BID_3081",1244,"25 - 30,Female,ผิวผสม_UF_SKINFACE,ผิวธรรมดา_UF...",102544


In [9]:
user_prev_product_tags = df_merge[['UID', 'product_tag']]
user_prev_product_tags = user_prev_product_tags.dropna()
user_prev_product_tags = user_prev_product_tags.groupby(
    ['UID'])['product_tag'].apply(
        ','.join).reset_index()

# drop duplicates tags from each user rows
user_prev_product_tags['product_tag'] = (
    user_prev_product_tags['product_tag'].str.split(',').apply(set).str.join(','))

# finally merge the dataframe with user dataframe 
df_user = df_user.merge(user_prev_product_tags, how='left', on='UID')

# join user tags and their answered tags 
# we replace nan values with ""
df_user['UID_all_tags'] = (
    df_user[['user_tag', 'product_tag']].apply(
        lambda x: ','.join(x.dropna()),
        axis=1))

df_user.head(1)

Unnamed: 0,UID,AGE_TEXT,PERSONAL_GENDER,UF_SKINFACE,UF_BODYTYPE,UF_HAIRTYPE,user_tag,user_id_num,product_tag,UID_all_tags
0,1,31 - 35,Female,NO_INFO_UF_SKINFACE,NO_INFO_UF_BODYTYPE,NO_UF_HAIRTYPE,"31 - 35,Female,NO_INFO_UF_SKINFACE,NO_INFO_UF_...",0,"BID_1578,1821,2636","31 - 35,Female,NO_INFO_UF_SKINFACE,NO_INFO_UF_..."


In [10]:
df_product['product_tag'] = df_product['product_tag'].str.split(',').apply(set).str.join(',')
df_user['UID_all_tags'] = df_user['UID_all_tags'].str.split(',').apply(set).str.join(',')

In [11]:
product_feature_list = generate_feature_list(
    df_product,
    ['product_tag'])

user_feature_list = generate_feature_list(
    df_user,
    ['UID_all_tags'])

In [12]:
df_product['product_features'] = create_features(
    df_product, ['product_tag'], 
    'product_id_num')

df_user['user_features'] = create_features(
    df_user,
    ['UID_all_tags'],
    'user_id_num')

In [13]:
from lightfm.data import Dataset
from lightfm import LightFM

In [14]:
dataset = Dataset()
dataset.fit(
    set(df_user['user_id_num']), 
    set(df_product['product_id_num']),
    item_features=product_feature_list, 
    user_features=user_feature_list)

In [15]:
df_merge['point_tuple'] = list(zip(
    df_merge.user_id_num, df_merge.product_id_num, df_merge.POINT))

In [16]:
interactions, weights = dataset.build_interactions(
    df_merge['point_tuple'])

In [17]:
product_features = dataset.build_item_features(
    df_product['product_features'])

user_features = dataset.build_user_features(
    df_user['user_features'])

In [18]:
model = LightFM(
    no_components=150,
    learning_rate=0.05,
    loss='warp',
    random_state=2019)

model.fit(
    interactions,
    item_features=product_features,
    user_features=user_features, sample_weight=weights,
    epochs=20, num_threads=4, verbose=True)

Epoch: 100%|██████████| 20/20 [02:52<00:00,  8.61s/it]


<lightfm.lightfm.LightFM at 0x7fbeac6eb1f0>

In [20]:
import pickle
import os
file = 'lightFM_model.pickle'
if not os.path.exists(file):
    with open(file, 'wb') as fle:
        pickle.dump(model, fle, protocol=pickle.HIGHEST_PROTOCOL)
else:
    print(f"{file} exists.")

In [21]:
import pickle
loaded_model = pickle.load(open('lightFM_model.pickle', 'rb'))

In [22]:
from lightfm.evaluation import auc_score

In [23]:
def calculate_auc_score(lightfm_model, interactions_matrix, 
                        product_features, User_features): 
    """
    Measure the ROC AUC metric for a model. 
    A perfect score is 1.0.

    Parameters
    ----------
    lightfm_model: LightFM model 
        A fitted lightfm model 
    interactions_matrix : 
        A lightfm interactions matrix 
    product_features, User_features: 
        Lightfm features 
        
    Returns
    -------
    String containing AUC score 
    """
    score = auc_score( 
        lightfm_model, interactions_matrix, 
        item_features=product_features, 
        user_features=User_features, 
        num_threads=4).mean()
    return score

In [27]:
calculate_auc_score(model, interactions, product_features, user_features)

0.99134266

In [129]:
calculate_auc_score(loaded_model, interactions, product_features, user_features)

0.99591017

In [24]:
from IPython.display import display_html
def display_side_by_side(*args):
    html_str=''
    for df in args:
        html_str+=df.to_html()
    display_html(html_str.replace('table','table style="display:inline"'),raw=True)

def previous_product(user, n_previous=3, show=True):
    # print their previous answered product title
    previous_product_id_num = df_merge.loc[df_merge['user_id_num'] == user][:n_previous]['product_id_num']
    df_previous_product = df_product.loc[df_product['product_id_num'].isin(previous_product_id_num)]
    if show:
        print('User Id (' + str(user) + "): Previous review product")
        display_side_by_side(
            df_previous_product[['EID', 'PRODUCT_NAME', 'product_features']],
            df_user.loc[df_user.user_id_num == user][['UID', 'user_id_num','user_tag']])
    return df_previous_product

def recommend_product(model, user, n_previous=3, top_n=10, show=True):
    df_previous_product = previous_product(user, n_previous, show=show)
    
    # predict
    discard_product_id = df_previous_product['product_id_num'].values.tolist()
    df_use_for_prediction = df_product.loc[~df_product['product_id_num'].isin(discard_product_id)]
    product_id_for_predict = df_use_for_prediction['product_id_num'].values.tolist()
    
    scores = model.predict(
        user,
        product_id_for_predict,
        item_features=product_features,
        user_features=user_features)
    
    df_use_for_prediction = df_use_for_prediction.assign(scores=scores)
    df_use_for_prediction = df_use_for_prediction.sort_values(by='scores', ascending=False)[:top_n]
    if show:
        print('User Id (' + str(user) + "): Recommended product: ")
        display(df_use_for_prediction[['EID', 'PRODUCT_NAME', 'product_features']])
    return df_use_for_prediction

def recommends_product(model, user_ids, n_previous=3, top_n=10):
    for user in user_ids:
        recommend_product(model, user, n_previous, top_n)
        print("=========================================================================================================")

In [25]:
from tqdm.notebook import tqdm

In [27]:
n_previous=5
count = 0
for i in tqdm(range(df_merge['user_id_num'].max())):
    previous = previous_product(i, None, show=False)
    recommend = recommend_product(loaded_model, i, n_previous=n_previous, top_n=10, show=False)
    result = recommend[recommend['EID'].isin(previous['EID'])]
    
    if len(result) > 0:
        count += 1
        # print('User Id (' + str(i) + "): Previous review product")
        # display_side_by_side(
        #     previous[['EID', 'PRODUCT_NAME']],
        #     previous_product(i, 5, show=False)[['EID', 'PRODUCT_NAME']],
        #     recommend[['EID', 'PRODUCT_NAME']],
        #     result[['EID', 'PRODUCT_NAME']])
        # print("=========================================================================================================")
print(count)
print(count/(df_merge['user_id_num'].max()+1)*100)

  0%|          | 0/103527 [00:00<?, ?it/s]

1721


In [29]:
from lightfm.evaluation import precision_at_k
from lightfm.cross_validation import random_train_test_split

In [31]:
train_interactions, test_interactions = random_train_test_split(interactions, test_percentage=0.2)

In [33]:
train_weights, test_weights = random_train_test_split(weights, test_percentage=0.2)


In [60]:
model = LightFM(
    no_components=250,
    learning_rate=0.0005,
    loss='warp',
    random_state=2019)

model.fit(
    train_interactions,
    item_features=product_features,
    user_features=user_features,
    epochs=20, num_threads=8, verbose=True)

Epoch: 100%|██████████| 20/20 [05:20<00:00, 16.04s/it]


<lightfm.lightfm.LightFM at 0x7fbea86ff9d0>

In [61]:
print("Test precision_at_k score:", precision_at_k(loaded_model, interactions, item_features=product_features, user_features=user_features, train_interactions=train_interactions,
                                   num_threads=8, check_intersections=False).mean())

Test precision_at_k score: 0.13128889


In [None]:
0.13, 150


In [43]:
print("Test AUC score:", auc_score(model, test_interactions, item_features=product_features, user_features=user_features, 
                                   num_threads=7).mean())

Test AUC score: 0.98318493


In [45]:
test_auc = auc_score(model, test_interactions, item_features=product_features, user_features=user_features,
                     train_interactions=train_interactions, num_threads=7, check_intersections=False).mean()
print('Collaborative filtering test AUC: %s' % test_auc)

Collaborative filtering test AUC: 0.98344797
