In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime, date 
from lightfm.data import Dataset
from lightfm import LightFM
from lightfm import cross_validation
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score



In [22]:
user=pd.read_csv('./data/users.csv',sep=',')
favorite=pd.read_csv(r'./places_event_decouper_csv/favorite.csv',sep=';')
visit=pd.read_csv(r'./places_event_decouper_csv/visit.csv',sep=';')
place=pd.read_csv(r'./places_event_decouper_csv/place.csv',sep=';',encoding='latin-1')
review=pd.read_csv(r'./places_event_decouper_csv/review.csv',sep=';',encoding='latin-1')
fos_user=pd.read_csv(r'./places_event_decouper_csv/fos_user_user.csv',sep=',')
place_place_type=pd.read_csv(r'./places_event_decouper_csv/place_place_type.csv',sep=';')
place_type=pd.read_csv(r'./places_event_decouper_csv/place_type.csv',sep=';')


  fos_user=pd.read_csv(r'./places_event_decouper_csv/fos_user_user.csv',sep=',')


In [15]:
def generate_int_id(dataframe, id_col_name):
    """
    Generate unique integer id for users, questions and answers

    Parameters
    ----------
    dataframe: Dataframe
        Pandas Dataframe for Users or Q&A. 
    id_col_name : String 
        New integer id's column name.
        
    Returns
    -------
    Dataframe
        Updated dataframe containing new id column 
    """
    new_dataframe=dataframe.assign(
        int_id_col_name=np.arange(len(dataframe))
        ).reset_index(drop=True)
    return new_dataframe.rename(columns={'int_id_col_name': id_col_name})

#drop columns if they have to many na 

def drop_columns_na(dataframe,pourcentna):
    for i in dataframe.columns:
        pourcent=(dataframe[i].isna().sum()/dataframe[i].isna().count())
        if(pourcentna<pourcent):
            dataframe.drop(i,axis=1,inplace=True)
    return dataframe


def create_features(dataframe, features_name, id_col_name):
    """
    Generate features that will be ready for feeding into lightfm

    Parameters
    ----------
    dataframe: Dataframe
        Pandas Dataframe which contains features
    features_name : List
        List of feature columns name avaiable in dataframe
    id_col_name: String
        Column name which contains id of the question or
        answer that the features will map to.
        There are two possible values for this variable.
        1. questions_id_num
        2. professionals_id_num

    Returns
    -------
    Pandas Series
        A pandas series containing process features
        that are ready for feed into lightfm.
        The format of each value
        will be (user_id, ['feature_1', 'feature_2', 'feature_3'])
        Ex. -> (1, ['military', 'army', '5'])
    """

    features = dataframe[features_name].apply(
        lambda x: ','.join(x.map(str)), axis=1)
    features = features.str.split(',')
    features = list(zip(dataframe[id_col_name], features))
    return features



def generate_feature_list(dataframe, features_name):
    """
    Generate features list for mapping 

    Parameters
    ----------
    dataframe: Dataframe
        Pandas Dataframe for Users or Q&A. 
    features_name : List
        List of feature columns name avaiable in dataframe. 
        
    Returns
    -------
    List of all features for mapping 
    """
    features = dataframe[features_name].apply(
        lambda x: ','.join(x.map(str)), axis=1)
    features = features.str.split(',')
    features = features.apply(pd.Series).stack().reset_index(drop=True)
    return features


def calculate_auc_score(lightfm_model, interactions_matrix, 
                        question_features, professional_features): 
    """
    Measure the ROC AUC metric for a model. 
    A perfect score is 1.0.

    Parameters
    ----------
    lightfm_model: LightFM model 
        A fitted lightfm model 
    interactions_matrix : 
        A lightfm interactions matrix 
    question_features, professional_features: 
        Lightfm features 
        
    Returns
    -------
    String containing AUC score 
    """
    score = auc_score( 
        lightfm_model, interactions_matrix, 
        item_features=question_features, 
        user_features=professional_features, 
        num_threads=4).mean()
    return score

In [46]:
#merge of bar if with their type in a single rows
places_type_group=place_place_type.merge(
    place_type,how='inner',left_on='place_type_id',right_on='id')
# places_type_group=places_type_group.drop(['id','place_type_id'],axis=1)
places_type_group=places_type_group.groupby(['place_id'])['name'].apply(','.join).reset_index()
places_type_group.rename(columns={'name':'type'})


Unnamed: 0,place_id,type
0,3,Bar à bière
1,4,"Bar à bière,Bar à cocktail,Bar sportif"
2,10,Péniche
3,11,Péniche
4,13,"Bar alternatif,Bar de nuit"
...,...,...
1282,7195,"Bar Brasserie,Café"
1283,7196,"Bar à bière,Bar à cocktail,Bar Brasserie"
1284,7197,"Bar à bière,Irish Pub"
1285,7198,"Bar à bière,Bar à vin"


In [23]:
# user table with to many na drop and user without firebase_uid 

user=drop_columns_na(user,0.58)
user=user.drop(['created_at','updated_at'],axis=1)
user=user.dropna(subset=['firebase_uid'])


In [83]:
# j'essaye de groupé le style prefere d'un utilisateur en fonction des type de ces favoris 

favorites=favorite[['place_id','user_id']]
favorites=favorites.merge(
    places_type_group,on='place_id',how='left'
)
favorites=favorites.drop(columns=['place_id'])
favorites=favorites.groupby(['user_id']).name.agg('sum').reset_index()
# user_favorite_score=
favorites

# création des feature 
favorites_feature_list = generate_feature_list(favorites,['name'])
favorites_feature_list

0                      Bar insoliteBar sportif
1            Bar de nuitBar à bièreBar à bière
2                   Bar de quartierBar à bière
3       Bar à spectaclesBar à bièreBar à tapas
4        Bar asiatiqueBar à bièreBar Brasserie
                         ...                  
7433                                         0
7434                                         0
7435                                         0
7436                      Bar à rhumBar gaming
7437                                         0
Length: 7438, dtype: object