In [132]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime, date 
from lightfm.data import Dataset
from lightfm import LightFM
from lightfm import cross_validation
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score
import random
from lightfm_dataset_helper.lightfm_dataset_helper import  DatasetHelper

In [133]:
path='./places_event_decouper_csv/'
df_user=pd.read_csv(path + 'users.csv',sep=',')
df_favorite=pd.read_csv(path + 'favorite.csv',sep=';')
df_visit=pd.read_csv(path + 'visit.csv',sep=';')
df_place=pd.read_csv(path + 'place.csv',sep=';',encoding='latin-1')
review=pd.read_csv(path + 'review.csv',sep=';',encoding='latin-1')
df_place_place_type=pd.read_csv(path + 'place_place_type.csv',sep=';')
place_type=pd.read_csv(path + 'place_type.csv',sep=',')

In [134]:
def generate_int_id(dataframe, id_col_name):
    """
    Generate unique integer id for users, questions and answers

    Parameters
    ----------
    dataframe: Dataframe
        Pandas Dataframe for Users or Q&A. 
    id_col_name : String 
        New integer id's column name.
        
    Returns
    -------
    Dataframe
        Updated dataframe containing new id column 
    """
    new_dataframe=dataframe.assign(
        int_id_col_name=np.arange(len(dataframe))
        ).reset_index(drop=True)
    return new_dataframe.rename(columns={'int_id_col_name': id_col_name})

#drop columns if they have to many na 

def drop_columns_na(dataframe,pourcentna):
    for i in dataframe.columns:
        pourcent=(dataframe[i].isna().sum()/dataframe[i].isna().count())
        if(pourcentna<pourcent):
            dataframe.drop(i,axis=1,inplace=True)
    return dataframe


def create_features(dataframe, features_name, id_col_name):
    """
    Generate features that will be ready for feeding into lightfm

    Parameters
    ----------
    dataframe: Dataframe
        Pandas Dataframe which contains features
    features_name : List
        List of feature columns name avaiable in dataframe
    id_col_name: String
        Column name which contains id of the question or
        answer that the features will map to.
        There are two possible values for this variable.
        1. questions_id_num
        2. professionals_id_num

    Returns
    -------
    Pandas Series
        A pandas series containing process features
        that are ready for feed into lightfm.
        The format of each value
        will be (user_id, ['feature_1', 'feature_2', 'feature_3'])
        Ex. -> (1, ['military', 'army', '5'])
    """

    features = dataframe[features_name].apply(
        lambda x: ','.join(x.map(str)), axis=1)
    features = features.str.split(',')
    features = list(zip(dataframe[id_col_name], features))
    return features



def generate_feature_list(dataframe, features_name):
    """
    Generate features list for mapping 

    Parameters
    ----------
    dataframe: Dataframe
        Pandas Dataframe for Users or Q&A. 
    features_name : List
        List of feature columns name avaiable in dataframe. 
        
    Returns
    -------
    List of all features for mapping 
    """
    features = dataframe[features_name].apply(
        lambda x: ','.join(x.map(str)), axis=1)
    features = features.str.split(',')
    features = features.apply(pd.Series).stack().reset_index(drop=True)
    return features


def calculate_auc_score(lightfm_model, interactions_matrix, 
                        question_features, professional_features): 
    """
    Measure the ROC AUC metric for a model. 
    A perfect score is 1.0.

    Parameters
    ----------
    lightfm_model: LightFM model 
        A fitted lightfm model 
    interactions_matrix : 
        A lightfm interactions matrix 
    question_features, professional_features: 
        Lightfm features 
        
    Returns
    -------
    String containing AUC score 
    """
    score = auc_score( 
        lightfm_model, interactions_matrix, 
        item_features=question_features, 
        user_features=professional_features, 
        num_threads=4).mean()
    return score

In [135]:
# calculate_auc_score(model,interactions,item_feature,user_feature)

In [136]:
df_user.columns

Index(['id', 'created_at', 'updated_at', 'date_of_birth', 'firstname',
       'gender', 'locale', 'timezone', 'firebase_uid'],
      dtype='object')

on import nos user qui sont les persone a qui on doit recommander une place (bar)
on drop les columns sans intérais pour nous comme la date de création la dernier mise a jour ect 
et on leur génere un id speciale qui servira seulement pour notre model car on a besoin d'avoir des id unique par user qui commence a 0 et sont consecutif jusqu'au dernier (comme certains user de la db ne sont pas des user rélle mais ajouté par l'equipe de schlouk map pour diverse raisson on drop les personne sans firebase uid )

In [176]:
user=df_user
user=user.drop(['created_at','updated_at'],axis=1)
user=user.dropna(subset=['firebase_uid'])
user.rename(columns={'id':'user_id'},inplace=True)
user = generate_int_id(user,"user_id_light")

Unnamed: 0,user_id,date_of_birth,firstname,gender,locale,timezone,firebase_uid,user_id_light
0,2,,Mélisande,u,,,Pr9h7oy8OPUQjvBMyd5UbxTLo9f1,0
1,3,,Jules,m,,,WHxFfvhMfDSaAnU0Sw5DQcubcX72,1
2,4,1994-02-21 00:00:00,Martine,f,,,Vkf93Wl3lfcKT7PqLYQix7Auaoc2,2
3,6,1980-12-04 22:13:43,Kayla,f,,,kEZnF2cU6RekcHW1cyE7XT349ML2,3
4,7,2000-04-01 14:52:19,Francis,m,,,Ih3ug97x9JOYbyIdGXCfjzyK5Aq1,4
...,...,...,...,...,...,...,...,...
36780,39657,,Joana,,,,JhCuJXsBuYP60WBJgz8e2C3viqQ2,36780
36781,39658,,Baptiste,,,,Ac5IKA5fIAgEPhCvbrIzOAcGM0u2,36781
36782,39659,,Madjid,,,,mvG1oyWeZCV7goCcAjRzvVehBvq1,36782
36783,39660,,Riad,,,,oVREj1jNkKX3fVA2DlgMAbJWjG63,36783


on récupere seulement les infos qui pourrais nous etre utile pour chaque bar et comme pour les user on leur crée un lightfm id qui servira au mapping 
on modifie aussi le type pour evité certaine erreur au moment des merge 

In [138]:
place = df_place[['id','is_closed','is_published','has_offers','has_food','has_terrace','slug','city_id']]
place=place.astype(object)
place=generate_int_id(place,"place_id_light")


on cherhce a savoir qu'elle utilisateur on un favoris ou un lien avec au moins un  bar et on va créé une rellation pour les user qui n'en non aucun actuellement 
pour cela on recupere la liste des user sur favoris et on drop les doublon 
cela nous permet de recupere avec un merge qui n'a aucun favoris 

In [161]:
qui_a_un_favoris=df_favorite.drop_duplicates(subset='user_id')
qui_a_un_favoris=qui_a_un_favoris['user_id']


user_without_favori=user.merge(
    qui_a_un_favoris,how='outer',on='user_id',indicator=True
)
indexNames=user_without_favori[user_without_favori['_merge'] == 'both'].index
user_without_favori.drop(indexNames , inplace=True)

user_without_favoris=user_without_favori['user_id']
user_without_favoris2=user_without_favori[['user_id']]



In [162]:
place_id_list=place['id']

liste_user=[]
for i in  user_without_favoris:
    liste_user.append(i)

liste_place=[]
for i in range(len(liste_user)):
    liste_place.append(random.choice(place_id_list))


dicko={'user_id':liste_user,'place_id':liste_place}
favorite_for_new_user = pd.DataFrame(dicko)



In [163]:
favorite_temp=pd.concat([df_favorite,favorite_for_new_user])
favorite_temp.nunique()

id            11999
place_id       6973
user_id       36785
created_at     8635
dtype: int64

In [164]:
bar_sans_favoris=favorite_temp.drop_duplicates(subset='place_id')
bar_sans_favoris=bar_sans_favoris['place_id']

place_without_favori = place.merge(
    bar_sans_favoris,how='outer',left_on='id',right_on='place_id',indicator=True
)
indexNames=place_without_favori[place_without_favori['_merge'] == 'both'].index
place_without_favori.drop(indexNames , inplace=True)


place_without_favoris=place_without_favori[['id']]


In [165]:
user_id_list=user_without_favoris2.user_id.to_list()

place_without_favoris=place_without_favoris.id.to_list()

list_user=[]
for i in place_without_favoris:
    list_user.append(random.choice(user_id_list))

dickos={'user_id':list_user,'place_id':place_without_favoris}
favorite_for_new_user2 = pd.DataFrame(dickos)

favorite_for_new_user2.nunique()


user_id     52
place_id    52
dtype: int64

In [166]:
favorite=pd.concat([favorite_temp,favorite_for_new_user2])
favorite.nunique()

id            11999
place_id       7025
user_id       36785
created_at     8635
dtype: int64

In [167]:
# on crée des id unique 
favorite_for_merge=favorite[['user_id',"place_id"]]
favorite_for_merge = generate_int_id(favorite_for_merge,'favorite_id_light')

# on merge place_type avec place_place_type = type_for_merge
type_for_merge=place_type.merge(
    df_place_place_type,how='inner',right_on='place_type_id',left_on='id'
)

# on merge place avec type for merge pour assossier les item aux places=place_for_merge
type_for_merge=type_for_merge.drop(["id",'place_type_id'],axis=1)

place_for_merge=place.merge(
    type_for_merge,how='left',left_on='id',right_on='place_id'
)

# on merge place_for_merge avec les user
df_merge=user.merge(
    favorite_for_merge,how="inner",left_on='user_id',right_on='user_id'
)

df_merge=df_merge.merge(
    place_for_merge,how='left',left_on='place_id',right_on='id'
)




bar_feature_list

In [168]:
test=df_merge[['user_id_light','name']]

test=test.dropna()
test=test.groupby(
    ['user_id_light'])['name'].apply(
        ','.join).reset_index()
test['name'] = (
    test['name'].str.split(',').apply(set).str.join(','))

df_user_ready=user.merge(
    test,how='left',on='user_id_light'
)
df_user_ready



Unnamed: 0,user_id,date_of_birth,firstname,gender,locale,timezone,firebase_uid,user_id_light,name
0,2,,Mélisande,u,,,Pr9h7oy8OPUQjvBMyd5UbxTLo9f1,0,"Bar à spectacles,Bar de quartier,Bar Brasserie..."
1,3,,Jules,m,,,WHxFfvhMfDSaAnU0Sw5DQcubcX72,1,"Bar à spectacles,Beer Hall,Bar Brasserie,Bar à..."
2,4,1994-02-21 00:00:00,Martine,f,,,Vkf93Wl3lfcKT7PqLYQix7Auaoc2,2,
3,6,1980-12-04 22:13:43,Kayla,f,,,kEZnF2cU6RekcHW1cyE7XT349ML2,3,
4,7,2000-04-01 14:52:19,Francis,m,,,Ih3ug97x9JOYbyIdGXCfjzyK5Aq1,4,
...,...,...,...,...,...,...,...,...,...
36780,39657,,Joana,,,,JhCuJXsBuYP60WBJgz8e2C3viqQ2,36780,"Bar gaming,Bar à rhum"
36781,39658,,Baptiste,,,,Ac5IKA5fIAgEPhCvbrIzOAcGM0u2,36781,
36782,39659,,Madjid,,,,mvG1oyWeZCV7goCcAjRzvVehBvq1,36782,
36783,39660,,Riad,,,,oVREj1jNkKX3fVA2DlgMAbJWjG63,36783,


In [169]:
place_for_item= place_for_merge
place_for_item['name']=place_for_item['name'].fillna('No Tag')

In [170]:
favorite= df_favorite[['place_id','user_id']]
visit = df_visit[['place_id','user_id']]
fav_vis = pd.concat([favorite,visit])
fav_vis
df_do_not_recomend=fav_vis.drop_duplicates()
df_do_not_recomend

Unnamed: 0,place_id,user_id
0,379,3
1,200,3
2,147,3
3,3942,23
4,2803,23
...,...,...
12674,1475,13496
12679,361,26049
12680,6792,2355
12681,6600,38762


In [171]:
user_feature_list = generate_feature_list(
    df_user_ready,['name']
)
place_feature_list = generate_feature_list(
    place_for_item,['name']
)


In [150]:
df_user_ready['user_feature2']=create_features(
    df_user_ready,['name'],'user_id_light'
)

place_for_item['place_feature'] = create_features(
    place_for_item,['name'],'place_id_light'
)


In [151]:
# df_merge['total_weights']=random.randint(0,5)

In [172]:
dataset = Dataset()
dataset.fit(
    set(df_user_ready['user_id_light']),
    set(place_for_item['place_id_light']), 
    user_features=user_feature_list,
    item_features=place_feature_list)



In [173]:
df_merge['bar_user_id_tuple']=list(zip(
    df_merge.user_id_light,df_merge.place_id_light
))

interactions,weights=dataset.build_interactions(
    df_merge['bar_user_id_tuple']
)

In [174]:
user_feature = dataset.build_user_features(
    df_user_ready['user_feature2']
)
item_feature = dataset.build_item_features(
    place_for_item['place_feature']
)

KeyError: 'user_feature2'

In [155]:
model = LightFM(
    no_components=150,
    learning_rate=0.02,
    loss='warp',
    random_state=2019)

model.fit(
    interactions,
    user_features=user_feature,
    item_features=item_feature,
    sample_weight=weights,
    epochs=10, num_threads=7, verbose=True)

Epoch: 100%|██████████| 10/10 [00:21<00:00,  2.15s/it]


<lightfm.lightfm.LightFM at 0x1d10d3530d0>

In [156]:
user_id_map, user_feature_map, item_id_map, item_feature_map = dataset.mapping()


n_users, n_items = interactions.shape

In [157]:
user_x = user_id_map[0]

list_prediction=model.predict(user_x, np.arange(n_items)) # means predict for all 
list_prediction

array([-0.1827057 ,  0.11128785, -0.09932998, ..., -0.20055492,
       -0.4586488 , -0.4254532 ], dtype=float32)

In [158]:
df_prediction = pd.DataFrame(list_prediction, columns=['score'])
df_prediction=df_prediction.reset_index()
# filtre avant les bar ferme non publish ect 
df_prediction=df_prediction.merge(
    place,how='inner',left_on='index',right_on='place_id_light'
)
df_prediction = df_prediction.drop(df_prediction[df_prediction.is_closed==1].index)
df_prediction = df_prediction.drop(df_prediction[df_prediction.is_published == 0].index)
df_prediction = df_prediction.drop(df_prediction[df_prediction.city_id != 1].index)


df_prediction.sort_values(by='score',ascending=False)

Unnamed: 0,index,score,id,is_closed,is_published,has_offers,has_food,has_terrace,slug,city_id,place_id_light
32,32,0.791502,35,0,1,0,1.0,1.0,public-house,1,32
144,144,0.791226,148,0,1,0,1.0,1.0,caupona,1,144
37,37,0.788883,41,0,1,0,1.0,1.0,what-the-fox,1,37
5,5,0.533987,8,0,1,0,1.0,1.0,academie-de-la-biere-cathedrale,1,5
50,50,0.501233,54,0,1,0,1.0,1.0,abattoir-cafe,1,50
...,...,...,...,...,...,...,...,...,...,...,...
185,185,-0.591026,189,0,1,0,1.0,0.0,episode-0-dooz-escape-game,1,185
90,90,-0.624362,94,0,1,0,,,cafe-montmartre,1,90
6413,6413,-0.633787,6580,0,1,0,1.0,1.0,brasserie-du-bonheur,1,6413
6846,6846,-0.676091,7016,0,1,0,1.0,1.0,le-bonnet-dane,1,6846


In [159]:
def recommandation_for_user_by_city(id_user,city_id):
    user = user_id_map[id_user]
    list_prediction=model.predict(user, np.arange(n_items)) 
    df_prediction = pd.DataFrame(list_prediction, columns=['score'])
    df_prediction=df_prediction.reset_index()

    df_prediction=df_prediction.merge(
        place,how='inner',left_on='index',right_on='place_id_light'
    )
    df_prediction = df_prediction.drop(df_prediction[df_prediction.is_closed==1].index)
    df_prediction = df_prediction.drop(df_prediction[df_prediction.is_published == 0].index)
    df_prediction = df_prediction.drop(df_prediction[df_prediction.city_id != city_id].index)
    
    df_prediction=df_prediction.sort_values(by='score',ascending=False)
    return df_prediction
    

In [160]:
recommandation_for_user_by_city(4,1)

Unnamed: 0,index,score,id,is_closed,is_published,has_offers,has_food,has_terrace,slug,city_id,place_id_light
32,32,0.528086,35,0,1,0,1.0,1.0,public-house,1,32
144,144,0.480467,148,0,1,0,1.0,1.0,caupona,1,144
37,37,0.461904,41,0,1,0,1.0,1.0,what-the-fox,1,37
5,5,0.411098,8,0,1,0,1.0,1.0,academie-de-la-biere-cathedrale,1,5
139,139,0.358091,143,0,1,1,0.0,0.0,barberousse,1,139
...,...,...,...,...,...,...,...,...,...,...,...
147,147,-0.234498,151,0,1,0,,,le-living-room,1,147
6846,6846,-0.235441,7016,0,1,0,1.0,1.0,le-bonnet-dane,1,6846
145,145,-0.249204,149,0,1,0,1.0,1.0,starbucks-kleber,1,145
90,90,-0.271355,94,0,1,0,,,cafe-montmartre,1,90
