In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import ticker
import matplotlib.dates as mdates
from datetime import datetime, time ,date ,timedelta
from dateutil.relativedelta import relativedelta
import seaborn as sns
pd.options.mode.chained_assignment = None

In [2]:
import warnings
warnings.filterwarnings("ignore")

<h1> Functions </h1>

<h3> Fonctions générales </h3>

In [3]:
def floatise (df, list_columns_to_floatise):
    """Convert decimal-comma text columns of ``df`` to floats, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; each listed column is overwritten.
    list_columns_to_floatise : iterable
        Labels of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    ValueError
        If a string cell is not a valid number once ',' is replaced by '.'.
    """
    def _to_float(value):
        # Only strings can carry the decimal comma.
        if isinstance(value, str):
            return float(value.replace(',', '.'))
        # Missing cells (None / NaN / NA) have no .replace(); map them to NaN
        # instead of crashing.
        if pd.isna(value):
            return float('nan')
        # Already numeric (e.g. the column was partly parsed by read_csv).
        return float(value)

    for column in list_columns_to_floatise:
        df[column] = df[column].apply(_to_float)
    return df

In [4]:
def intise (df, list_columns_to_floatise):
    """Cast the listed columns of ``df`` to ``int`` in place and return ``df``.

    Despite its name, ``list_columns_to_floatise`` holds the labels of the
    columns to convert to integers.
    """
    for column_label in list_columns_to_floatise:
        as_integers = df[column_label].astype(int)
        df[column_label] = as_integers
    return df

In [5]:
def flatten (table):
    """Turn an aggregated table into a plain DataFrame with ordinary columns.

    Parameters
    ----------
    table : pandas.DataFrame or pandas.Series
        Typically the result of a ``groupby(...).agg(...)``.

    Returns
    -------
    pandas.DataFrame
        * If ``table`` has MultiIndex columns: one column per original column,
          named ``"<level0>_<level1>"`` (only the first two levels are used),
          preceded by a column holding the index values and named after the
          index. Rows are renumbered from 0.
        * Otherwise: ``table`` as a DataFrame with index level 0 moved into
          the columns (other index levels, if any, stay in the index).

    The input object is not modified.
    """
    # A Series has no ``columns`` attribute; treat it like a flat-column table.
    columns = getattr(table, 'columns', None)

    if isinstance(columns, pd.MultiIndex):
        flat_names = [str(t[0])+'_'+str(t[1]) for t in columns]

        # Vectorised: rename the columns of a copy, then prepend the index
        # values. This replaces a row-by-row ``df.loc[i] = row`` build, which
        # was quadratic and broke on duplicated index labels.
        df = table.copy()
        df.columns = flat_names
        df.insert(0, table.index.name, table.index.to_numpy(), allow_duplicates=True)
        return df.reset_index(drop=True)

    table = pd.DataFrame(table)
    return table.reset_index(level=0)

In [6]:
def flatten_soft(dataframe):
    """Flatten (possibly multi-level) column labels into single strings.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table whose index should become a regular column, e.g. the result of
        ``groupby(...).agg(...).unstack(...)``.

    Returns
    -------
    pandas.DataFrame
        A new frame, rows numbered from 0, whose first column holds the index
        values (named after ``dataframe.index.name``) and whose other columns
        are named ``"_<level0>_<level1>..."`` (every level prefixed with '_').
        A flat, non-tuple label ``x`` is treated as a single level and becomes
        ``"_x"``.
    """
    flat = {dataframe.index.name: dataframe.index}
    for position, col in enumerate(dataframe.columns):
        # Flat labels are not tuples: wrap them so that a string label is not
        # split into its characters (and an int label does not raise).
        levels = col if isinstance(col, tuple) else (col,)
        name_tmp = ''.join('_'+str(level) for level in levels)
        # Positional access so that the lookup never depends on label types.
        flat[name_tmp] = dataframe.iloc[:, position].values
    # Build the frame once rather than inserting columns one at a time.
    return pd.DataFrame(flat)

<h3> Fonctions particulières </h3>

In [7]:
def cohort_attribution (x):
    """Return the cohort for a first-order year: 2016 for any year below 2017, else ``x``."""
    return 2016 if x < 2017 else x

In [8]:
def customer_category_attribution(x):
    """Label a customer from an order count.

    Returns 'One-timer' when ``x == 1``, 'Two-timer' when ``x == 2`` and
    'Recurring' for every other value.
    """
    for order_count, label in ((1, 'One-timer'), (2, 'Two-timer')):
        if x == order_count:
            return label
    return 'Recurring'

In [9]:
def describe_discount(x):
    """Return the discount bucket label for a discount percentage ``x``.

    Buckets are '<5%', then half-open ranges [5, 10), [10, 20), ... [60, 70),
    then '>70%' for 70 and above. A value that satisfies none of the
    comparisons (NaN) falls through and yields ``None``.
    """
    if x < 5:
        return '<5%'

    # (lower bound inclusive, upper bound exclusive, label)
    bounded_buckets = (
        (5, 10, '5-10%'),
        (10, 20, '10-20%'),
        (20, 30, '20-30%'),
        (30, 40, '30-40%'),
        (40, 50, '40-50%'),
        (50, 60, '50-60%'),
        (60, 70, '60-70%'),
    )
    for lower, upper, label in bounded_buckets:
        if lower <= x < upper:
            return label

    if x >= 70:
        return '>70%'

<h1> Dataset creation </h1>

<h3> Import de la data </h3>

In [10]:
# Record and show the kernel's current working directory.
# src_root is not referenced again in the cells visible here.
src_root = os.getcwd()
print(src_root)

C:\Users\UgoMANTEL\Work\Github\Vertbaudet_2021\src\notebooks


- Préparation des colonnes:

In [11]:
# Columns kept from each raw order file: this list is passed as `usecols`
# to pd.read_csv in the dataset-creation cell.
col_names = ['AUFTRNR','KDNR','ARTNRERF','WTR','ARTGRERF','DATERF','PREISERF','PREIS','EKP','PREISNET','RABATT','RABATTSATZ',
             'MENGE','MENGERET','DEPARTEMENT','CAUFTRAGSPOS','MWST','MWSTRABATT','ANZKINDER','ANLAGEDAT',
             'DATAUFTRAG1','STRASSE','PLZ','ORT','BUNDESLAND','CLAND','CWERBESP','DATWERBESP','CLIEFERSP','DATLIEFERSP',
             'CADRESSSP','DATADRESSSP','UMSATZ1','DATLETZTAUFTR','UMSLETZTAUFTR','ANZKATLETZTAUFTR','WKOSTLETZTAUFTR',
             'DBKDNR','WKOSTENKUM','UMSATZKUM','UMSATZKUMTEL','UMSATZKUMFAX','UMSATZKUMWEB','UMSATZLIEFKUM','RUECKSTAUFTR',
             'ANZAUFTR','REFPRES','REFSTK','CODDOC','PLANEKP','RAYON','FAMILLE','DEPART','CODMARQ','ARTGROESSE','DATAUFTRAG',
             'DATERFASSUNG','DATVERARB','GUTSCHEINWERT']

In [12]:
# One-letter DEPARTEMENT code -> department label.
# Applied with Series.map in the dataset-creation cell, so any code that is
# not a key of this dict becomes NaN there.
dict_department = {'B':'BEBE','N':'PAP FILLE','U':'PAP GARCON','D':'CHAUSSURE','F':'FEMME',
'T':'TEXTILE HOME','P':'PUERICULTURE','W':'NON PAP ENFANT','K':'DECO','M':'CHAMBRE ET LITERIE','R':'JOUETS'}

<h3> Constitution du dataset </h3>

- Définition des répertoires :

In [13]:
# NOTE(review): absolute, machine-specific paths; the notebook only runs
# as-is on a machine with this exact folder layout.
# Folder of raw order files: every entry returned by os.listdir is read as a CSV.
repertoire = "C:/Users/UgoMANTEL/Work/Github/Vertbaudet_2021/data/1. Germany"
# Destination folder of the CSV exports produced by the analyses.
output_rep = "C:/Users/UgoMANTEL/eleven/Engagements - Vertbaudet/5. Analyses/3. Outputs python"
# Folder holding Raw_Data_Germany.csv, reloaded when import_data is True.
backup_rep = "C:/Users/UgoMANTEL/Work/Github/Vertbaudet_2021/data/4. Back-up"

- Choix des options

In [1]:
# False: rebuild the dataset from the raw files and export it.
# True : skip the rebuild and reload the previously exported CSV instead.
import_data = False

<h4> Création de la donnée: </h4>

In [15]:
# Selects the measure used to pick each order's main department (DEPARTEMENT_CMD):
# True -> line with the highest NET_DEMAND, False -> line with the highest TOT_SALES.
net_demand=True

In [17]:
if not import_data:
    # One cleaned frame per raw file. They are concatenated once after the
    # loop: concatenating onto a growing frame inside the loop re-copies all
    # previously loaded rows at every iteration.
    frames = []

    # sorted(): os.listdir returns entries in arbitrary order; a fixed order
    # makes the row order of the dataset reproducible between runs.
    for file in sorted(os.listdir(repertoire)):
        file_name_tmp = os.path.join(repertoire,file)
        df_tmp = pd.read_csv(file_name_tmp,sep=",",encoding= "unicode_escape", usecols=col_names)

        # Drop lines whose CAUFTRAGSPOS is the '$null$' sentinel, then keep codes <= 7.
        # NOTE(review): the meaning of the threshold 7 is not visible here - confirm.
        df_tmp = df_tmp.loc[df_tmp.CAUFTRAGSPOS!='$null$']
        df_tmp.CAUFTRAGSPOS = df_tmp.CAUFTRAGSPOS.astype(int)
        df_tmp = df_tmp.loc[df_tmp.CAUFTRAGSPOS <= 7]

        ################### FIELD CONVERSION ###################
        df_tmp['DATERF'] = pd.to_datetime(df_tmp.DATERF)
        df_tmp['DATAUFTRAG1'] = pd.to_datetime(df_tmp.DATAUFTRAG1)
        df_tmp.DEPARTEMENT = df_tmp.DEPARTEMENT.map(dict_department)

        ################### FIELD CREATION ###################
        df_tmp['TOT_SALES'] = (df_tmp.PREIS - df_tmp.RABATT)*(df_tmp.MENGE - df_tmp.MENGERET)
        df_tmp['NET_DEMAND'] = (df_tmp.PREIS - df_tmp.RABATT) * df_tmp.MENGE
        df_tmp['GROSS_DEMAND'] = df_tmp.PREIS * df_tmp.MENGE
        df_tmp['YEAR_FIRST_ORDER'] = df_tmp.DATAUFTRAG1.dt.year
        df_tmp['YEAR_ORDER'] = df_tmp.DATERF.dt.year
        df_tmp['COHORT'] = df_tmp.YEAR_FIRST_ORDER.apply(cohort_attribution)
        df_tmp['MONTH_RECRUITMENT'] = df_tmp.DATAUFTRAG1.dt.month
        df_tmp['ID_ORDER'] = df_tmp.AUFTRNR.astype(str) + '_'+ df_tmp.KDNR.astype(str)

        # Main department of each order = department of its highest-value line
        # (the first line after a descending sort on the chosen measure).
        ranking_column = 'NET_DEMAND' if net_demand else 'TOT_SALES'
        cat_order = df_tmp.sort_values(by=ranking_column,ascending=False).drop_duplicates(subset='ID_ORDER', keep='first')
        cat_order = cat_order [['ID_ORDER','DEPARTEMENT']]
        cat_order.columns = ['ID_ORDER','DEPARTEMENT_CMD']
        # suffixes=(False,False): fail loudly rather than silently create _x/_y columns.
        df_tmp = pd.merge(df_tmp,cat_order,on='ID_ORDER',how='left',suffixes=(False,False))

        frames.append(df_tmp[['DATERF','DATAUFTRAG1','DEPARTEMENT','TOT_SALES','NET_DEMAND','GROSS_DEMAND',
                              'YEAR_FIRST_ORDER','YEAR_ORDER','COHORT','MONTH_RECRUITMENT','ID_ORDER','KDNR','ARTNRERF',
                              'DEPARTEMENT_CMD','RABATT','RABATTSATZ','PREIS','ANZKINDER','ARTGRERF','ARTGROESSE','CODMARQ']])
        del(df_tmp)

    if not frames:
        raise FileNotFoundError('No input file found in ' + repertoire)
    df = pd.concat(frames)
    del(frames)

    ################### FIELD CREATION ###################
    ################### RECRUITMENT UNIVERSE
    # Department of the customer's earliest line (by DATERF) = recruitment department.
    cat_rec = df.sort_values(by='DATERF',ascending=True).drop_duplicates(subset='KDNR', keep='first')[['KDNR','DEPARTEMENT_CMD']]
    cat_rec.columns = ['KDNR','DEPARTEMENT_FIRST']
    df = pd.merge(df,cat_rec,on='KDNR',how='left',suffixes=(False,False))

    ################### DATES FOR 12/24-MONTH LTV
    # NOTE(review): fixed 365/730-day windows, not calendar months - confirm
    # this approximation is intended.
    df['DATAUFTRAG1_12MONTH'] = df['DATAUFTRAG1']+ timedelta(days=365)
    df['DATAUFTRAG1_24MONTH'] = df['DATAUFTRAG1']+ timedelta(days=730)

    # Keep the 2017-2020 cohorts only.
    df = df.loc[(df.COHORT>=2017) & (df.COHORT<2021)]

- Ajout du nombre de commande:

In [19]:
if not import_data:
    # Number of distinct orders per customer. The built-in 'nunique'
    # aggregation replaces a Python-level callable that was invoked once per
    # customer group; the resulting table is the same.
    nb_commande = df.groupby('KDNR')['ID_ORDER'].nunique().reset_index()
    nb_commande.columns = ['KDNR','NB_ORDERS']
    # suffixes=(False,False): raises if NB_ORDERS already exists (cell re-run).
    df = pd.merge(df,nb_commande,on='KDNR',how='left',suffixes=(False,False))

    ########## Customer categories derived from the order count
    df['CLIENT_CATEGORY'] = df.NB_ORDERS.apply(customer_category_attribution)

- Export du dataset :

In [20]:
if not import_data:
    # Write the backup to the same location the reload cell reads from
    # (os.path.join(backup_rep, 'Raw_Data_Germany.csv')), so the two cells
    # cannot drift apart if backup_rep changes.
    # The index is written on purpose: the reload cell drops 'Unnamed: 0'.
    df.to_csv(os.path.join(backup_rep,'Raw_Data_Germany.csv'),sep=";")

<h4> Import de la donnée: </h4>

In [15]:
# Reload the dataset exported by a previous run instead of rebuilding it.
if import_data:
    df = pd.read_csv(os.path.join(backup_rep,'Raw_Data_Germany.csv'),sep=";")
    # 'Unnamed: 0' is the index column written by the export cell's to_csv.
    df.drop(columns=['Unnamed: 0'],inplace=True)
    
    ########## Field conversion: dates come back from the CSV as text
    df['DATERF'] = pd.to_datetime(df.DATERF)
    df['DATAUFTRAG1'] = pd.to_datetime(df.DATAUFTRAG1)
    df['DATAUFTRAG1_12MONTH'] = pd.to_datetime(df.DATAUFTRAG1_12MONTH)
    df['DATAUFTRAG1_24MONTH'] = pd.to_datetime(df.DATAUFTRAG1_24MONTH)

In [16]:
# Display the columns of the working dataset.
df.columns

Index(['DATERF', 'DATAUFTRAG1', 'DEPARTEMENT', 'TOT_SALES', 'NET_DEMAND',
       'GROSS_DEMAND', 'YEAR_FIRST_ORDER', 'YEAR_ORDER', 'COHORT',
       'MONTH_RECRUITMENT', 'ID_ORDER', 'KDNR', 'ARTNRERF', 'DEPARTEMENT_CMD',
       'RABATT', 'RABATTSATZ', 'PREIS', 'ANZKINDER', 'ARTGRERF', 'ARTGROESSE',
       'CODMARQ', 'DEPARTEMENT_FIRST', 'DATAUFTRAG1_12MONTH',
       'DATAUFTRAG1_24MONTH', 'NB_ORDERS', 'CLIENT_CATEGORY'],
      dtype='object')

<h4> Autres tâches: </h4>

- Export carte Pierrick :

In [37]:
# One row per customer (KDNR) with the last CLIENT_CATEGORY value seen for
# that customer, exported as a ';'-separated CSV.
export_carte = flatten(df.groupby(['KDNR']).agg({'CLIENT_CATEGORY':'last'}))
export_carte.to_csv(os.path.join(output_rep,'ClientsCategory_Germany.csv'),sep=";")

- Import du discount:

In [21]:
# Reference price table; later cells use its ARTNR and VKP columns.
# NOTE(review): absolute, machine-specific path.
df_base_price = pd.read_csv('C:/Users/UgoMANTEL/Work/Github/Vertbaudet_2021/data/3. Other/20210928_DiscountGER.csv')

In [25]:
# Keep a single row per article (the last one in file order) so that the
# later merges on ARTNR cannot duplicate order lines.
df_base_price = df_base_price.drop_duplicates(subset=['ARTNR'], keep='last')

- Ajout du numcmde var 38:

In [28]:
# Attach BELEGSTAPELNR (the key of the channel table used later) to every order line.
# NOTE(review): absolute, machine-specific path.
df_numcde = pd.read_csv("C:/Users/UgoMANTEL/Work/Github/Vertbaudet_2021/data/3. Other/20211001_NUM_CMD_CANAL_GER.csv",sep=",",encoding= "unicode_escape")
df_numcde['ID_ORDER'] = df_numcde.AUFTNR.astype(str) + '_'+ df_numcde.KDNR.astype(str)
# drop_duplicates(): a left merge on a non-unique right key repeats each order
# line once per matching row. Removing identical (ID_ORDER, BELEGSTAPELNR)
# pairs first prevents that fan-out, which would inflate every later sum.
df = pd.merge(df,df_numcde[['ID_ORDER','BELEGSTAPELNR']].drop_duplicates(),
              on='ID_ORDER',how='left',suffixes=(False,False))
del(df_numcde)

- Check de cohérence:

In [69]:
# Share of order lines whose BELEGSTAPELNR is the '$null$' sentinel.
# NOTE(review): lines left as NaN by the left merge (order absent from the
# lookup file) are not counted by this test - confirm that is intended.
print('% Lines with no BELEGSTAPELNR :' +str(round(100*len(df.loc[df.BELEGSTAPELNR == '$null$'])/len(df),1))+'%')

% Lines with no BELEGSTAPELNR :0.4%


In [70]:
# Same check at order level: distinct ID_ORDER values with the '$null$'
# sentinel over all distinct ID_ORDER values (NaN values are not counted).
print('% Orders with no BELEGSTAPELNR :' +str(round(100*len(df.loc[df.BELEGSTAPELNR == '$null$'].ID_ORDER.unique())/len(df.ID_ORDER.unique()),1))+'%')

% Orders with no BELEGSTAPELNR :0.4%


<h1> Analyses </h1>

<h3> Check : </h3> 

In [20]:
# Sanity check of the categories: distinct customers and distinct orders per
# CLIENT_CATEGORY, and their ratio (expected 1.0 for One-timer, 2.0 for Two-timer).
check_coherence = flatten(df.groupby(['CLIENT_CATEGORY']).agg({'KDNR':pd.Series.nunique,'ID_ORDER':pd.Series.nunique}))
check_coherence['NB_PURCHASE_PER_CLI'] = check_coherence.ID_ORDER / check_coherence.KDNR
check_coherence

Unnamed: 0,CLIENT_CATEGORY,KDNR,ID_ORDER,NB_PURCHASE_PER_CLI
0,One-timer,409292,409292,1.0
1,Recurring,176840,892092,5.044628
2,Two-timer,143457,286914,2.0


<h3> Etude par DN : </h3> 

In [21]:
# Net demand per customer category (rows) and order year (columns).
# 'sum' is passed by name: handing the builtin `sum` to agg is deprecated
# since pandas 2.1 and will stop being translated to the groupby sum.
DN_cat = df.groupby(['CLIENT_CATEGORY','YEAR_ORDER']).agg({'NET_DEMAND': 'sum'})
DN_cat = flatten_soft(DN_cat.unstack('YEAR_ORDER',fill_value=0))
DN_cat

Unnamed: 0,CLIENT_CATEGORY,_NET_DEMAND_2017,_NET_DEMAND_2018,_NET_DEMAND_2019,_NET_DEMAND_2020,_NET_DEMAND_2021
0,One-timer,5671230.0,6792149.0,7865800.0,10946240.0,38486.43
1,Recurring,9839625.0,14718290.0,15908140.0,21635360.0,17419980.0
2,Two-timer,3522414.0,4598461.0,5124417.0,7222721.0,3439103.0


In [22]:
# Export the net-demand-per-category table as a ';'-separated CSV.
DN_cat.to_csv(os.path.join(output_rep,'rep_DN_cat_GER.csv'),sep=";")

<h3> Etude par mois de recrutement : </h3> 

- Etude par année ?

In [22]:
# True: one recruitment-month export per cohort year; False: a single export
# with all cohorts pooled.
per_year = True

In [23]:
# Distinct customers per recruitment month (rows) and customer category
# (columns), exported once per cohort year or once for all cohorts pooled.
# Both cases share one pipeline; None stands for "all cohorts pooled".
cohorts_to_export = list(df.COHORT.unique()) if per_year else [None]

for year in cohorts_to_export:
    if year is None:
        cohort_lines = df
        name = 'month_recrut_client_category_GER.csv'
    else:
        cohort_lines = df.loc[df.COHORT == year]
        name = str('month_recrut_client_category_GER_'+str(year)+'.csv')

    month_distrib = cohort_lines.groupby(['CLIENT_CATEGORY','MONTH_RECRUITMENT']).agg({'KDNR': 'nunique'})
    month_distrib = flatten_soft(month_distrib.unstack('CLIENT_CATEGORY',fill_value=0))
    month_distrib.to_csv(os.path.join(output_rep,name),sep=";")

# Release the last subset; month_distrib keeps the last table computed.
del(cohort_lines)

<h3> Etude des univers d'achat pour le 1er achat : </h3> 

In [42]:
# Lines whose DATERF equals DATAUFTRAG1. The dataset-creation cell derives
# YEAR_ORDER from DATERF and YEAR_FIRST_ORDER from DATAUFTRAG1, so these are
# the lines placed on the customer's first-order date.
df_1_purchase = df.loc[df.DATERF == df.DATAUFTRAG1]

In [25]:
# Net demand of first-purchase lines per customer category (rows) and
# department (columns), exported as CSV.
# 'sum' is passed by name: the builtin `sum` in agg is deprecated since pandas 2.1.
res_univ_1 = df_1_purchase.groupby(['CLIENT_CATEGORY','DEPARTEMENT']).agg({'NET_DEMAND': 'sum'})
res_univ_1 = flatten_soft(res_univ_1.unstack('DEPARTEMENT',fill_value=0))
res_univ_1.to_csv(os.path.join(output_rep,'1stPurchase_PerCAt_GER.csv'),sep=";")

<h3> Etude des univers d'achat (net demand) : </h3> 

In [26]:
# Net demand of all lines per customer category (rows) and department (columns).
# 'sum' is passed by name: the builtin `sum` in agg is deprecated since pandas 2.1.
res_univ = df.groupby(['CLIENT_CATEGORY','DEPARTEMENT']).agg({'NET_DEMAND': 'sum'})
res_univ = flatten_soft(res_univ.unstack('DEPARTEMENT',fill_value=0))

In [27]:
# Export the table currently held in res_univ (net demand per department).
res_univ.to_csv(os.path.join(output_rep,'Purchase_perUnivers_perCategory_GER.csv'),sep=";")

<h3> Etude des univers d'achat (% client) : </h3> 

In [21]:
# Distinct customers per customer category (rows) and recruitment department
# DEPARTEMENT_FIRST (columns).
# NOTE: this reuses the name res_univ and overwrites the net-demand table
# computed in the previous section.
res_univ = df.groupby(['CLIENT_CATEGORY','DEPARTEMENT_FIRST']).agg({'KDNR': pd.Series.nunique})
res_univ = flatten_soft(res_univ.unstack('DEPARTEMENT_FIRST',fill_value=0))

In [22]:
# Export the customers-per-recruitment-department table held in res_univ.
res_univ.to_csv(os.path.join(output_rep,'Purchase_perUnivers_perCategory_percentage_GER.csv'),sep=";")

<h3> Etudes des discount :</h3>

<h4> Premier achat :</h4>

- Merge du prix de base:

In [72]:
# Attach the reference price (VKP) of each article to the first-purchase lines.
# suffixes=(False,False): re-running the cell raises instead of silently
# adding ARTNR/VKP a second time.
df_1_purchase = pd.merge(df_1_purchase,df_base_price[['ARTNR','VKP']],
                         left_on='ARTNRERF',right_on='ARTNR',how='left',suffixes=(False,False))

# Lines without a matching article have no reference price, hence no discount.
# Series.sum() counts the flags natively; the builtin sum() iterated over
# every row in Python.
KPI_to_print = df_1_purchase.ARTNR.isnull().sum()/len(df_1_purchase)*100
print('% Lines where discount computation is impossible :' +str(round(KPI_to_print,1))+'%')

% Lines where discount computation is impossible :6.6%


- Calcul du discount:

In [60]:
# Discount in percent of the reference price: 100 * (1 - PREIS / VKP).
df_1_purchase['DISCOUNT'] = 100 * (1 - df_1_purchase['PREIS'] / df_1_purchase['VKP'])
# Bucket label of each discount value.
df_1_purchase['CAT_DISCOUNT'] = df_1_purchase['DISCOUNT'].apply(describe_discount)

- Vérification:

In [76]:
# Share of first-purchase lines sold above the reference price (negative discount).
KPI_to_print = len(df_1_purchase.loc[df_1_purchase.DISCOUNT<0])/len(df_1_purchase)*100
print('% Lines with negative discount :' +str(round(KPI_to_print,1))+'%')

# Mean discount over the lines where it could be computed (NaN is skipped by mean()).
KPI_to_print = df_1_purchase.DISCOUNT.mean()
print('% Average discount :' +str(round(KPI_to_print,1))+'%')

% Lines with negative discount :1.9%
% Average discount :14.4%


- Etude des discount:

In [62]:
# Number of first-purchase lines per discount bucket (rows) and customer
# category (columns), exported as CSV and displayed.
# Lines without a bucket (CAT_DISCOUNT missing) are left out by groupby.
res_discount_1st = df_1_purchase.groupby(['CAT_DISCOUNT','CLIENT_CATEGORY']).agg({'ARTNRERF':'count'})
res_discount_1st = flatten_soft(res_discount_1st.unstack('CLIENT_CATEGORY',fill_value=0))
res_discount_1st.to_csv(os.path.join(output_rep,'Discount_perCategory_GER_1st_purchase.csv'),sep=";")
res_discount_1st

<h4> Toute la base :</h4>

- Merge du prix de base:

In [77]:
# Attach the reference price (VKP) of each article to every order line.
# suffixes=(False,False): re-running the cell raises instead of silently
# adding ARTNR/VKP a second time.
df = pd.merge(df,df_base_price[['ARTNR','VKP']],
              left_on='ARTNRERF',right_on='ARTNR',how='left',suffixes=(False,False))

# Lines without a matching article have no reference price, hence no discount.
# Series.sum() counts the flags natively; the builtin sum() iterated over
# every row in Python.
KPI_to_print = df.ARTNR.isnull().sum()/len(df)*100
print('% Lines where discount computation is impossible :' +str(round(KPI_to_print,1))+'%')

% Lines where discount computation is impossible :10.1%


- Calcul du discount:

In [78]:
# Discount in percent of the reference price: 100 * (1 - PREIS / VKP).
df['DISCOUNT'] = 100 * (1 - df['PREIS'] / df['VKP'])
# Bucket label of each discount value.
df['CAT_DISCOUNT'] = df['DISCOUNT'].apply(describe_discount)

- Vérification:

In [79]:
# Share of all lines sold above the reference price (negative discount).
KPI_to_print = len(df.loc[df.DISCOUNT<0])/len(df)*100
print('% Lines with negative discount :' +str(round(KPI_to_print,1))+'%')

# Mean discount over the lines where it could be computed (NaN is skipped by mean()).
KPI_to_print = df.DISCOUNT.mean()
print('% Average discount :' +str(round(KPI_to_print,1))+'%')

% Lines with negative discount :1.7%
% Average discount :15.0%


- Etude des discount:

In [80]:
# Number of lines per discount bucket (rows) and customer category (columns),
# exported as CSV and displayed.
# Lines without a bucket (CAT_DISCOUNT missing) are left out by groupby.
res_discount = df.groupby(['CAT_DISCOUNT','CLIENT_CATEGORY']).agg({'ARTNRERF':'count'})
res_discount = flatten_soft(res_discount.unstack('CLIENT_CATEGORY',fill_value=0))
res_discount.to_csv(os.path.join(output_rep,'Discount_perCategory_GER.csv'),sep=";")
res_discount

Unnamed: 0,CAT_DISCOUNT,_ARTNRERF_One-timer,_ARTNRERF_Recurring,_ARTNRERF_Two-timer
0,10-20%,154818,449367,124366
1,20-30%,144603,412683,113752
2,30-40%,97181,316156,82009
3,40-50%,67041,227585,56301
4,5-10%,15809,34789,9965
5,50-60%,39637,138280,33870
6,60-70%,32819,98733,25178
7,<5%,598090,1730918,479562
8,>70%,28371,73451,20931


<h3> Etudes des marques :</h3>

- Pour le 1er achat:

In [31]:
# df_1_purchase is sliced from df before MARQUE_NATIO is added to df, so on a
# top-to-bottom run the column does not exist on this frame yet: derive it
# here with the same rule as for df ('VB' -> Vertbaudet, anything else ->
# National Brand).
if 'MARQUE_NATIO' not in df_1_purchase.columns:
    df_1_purchase['MARQUE_NATIO'] = df_1_purchase.CODMARQ.apply(lambda x: 'Vertbaudet' if x=='VB'
                                                                 else 'National Brand')

# Net demand of first-purchase lines per brand type (rows) and customer
# category (columns).
# 'sum' is passed by name: the builtin `sum` in agg is deprecated since pandas 2.1.
res_brand_1_purch = df_1_purchase.groupby(['MARQUE_NATIO','CLIENT_CATEGORY']).agg({'NET_DEMAND':'sum'})
res_brand_1_purch = flatten_soft(res_brand_1_purch.unstack('CLIENT_CATEGORY',fill_value=0))
res_brand_1_purch

Unnamed: 0,MARQUE_NATIO,_NET_DEMAND_One-timer,_NET_DEMAND_Recurring,_NET_DEMAND_Two-timer
0,National Brand,1774911.0,1317232.0,802692.7
1,Vertbaudet,29096500.0,16165110.0,11990610.0


In [32]:
# Export the first-purchase brand split as a ';'-separated CSV.
res_brand_1_purch.to_csv(os.path.join(output_rep,'Brand_distrib_1stpurch__GER.csv'),sep=";")

- En général : 

In [23]:
# Brand type of each line: 'Vertbaudet' when CODMARQ is exactly 'VB',
# 'National Brand' for every other value (missing codes included).
df['MARQUE_NATIO'] = np.where(df['CODMARQ'] == 'VB', 'Vertbaudet', 'National Brand')

In [26]:
# Net demand of all lines per brand type (rows) and customer category (columns).
# 'sum' is passed by name: the builtin `sum` in agg is deprecated since pandas 2.1.
res_brand = df.groupby(['MARQUE_NATIO','CLIENT_CATEGORY']).agg({'NET_DEMAND':'sum'})
res_brand = flatten_soft(res_brand.unstack('CLIENT_CATEGORY',fill_value=0))
res_brand

Unnamed: 0,MARQUE_NATIO,_NET_DEMAND_One-timer,_NET_DEMAND_Recurring,_NET_DEMAND_Two-timer
0,National Brand,1795251.0,3989086.0,1307805.0
1,Vertbaudet,29518660.0,75532300.0,22599310.0


In [27]:
# Export the brand split over all lines as a ';'-separated CSV.
res_brand.to_csv(os.path.join(output_rep,'Brand_distrib_GER.csv'),sep=";")

<h3> Etudes des canaux :</h3>

In [51]:
# Acquisition channel table (tab-separated). Its three columns are renamed
# to BELEGSTAPELNR (join key), LEVIER and CAMPAGNE.
# NOTE(review): absolute, machine-specific path.
df_canal = pd.read_csv("C:/Users/UgoMANTEL/Work/Github/Vertbaudet_2021/data/3. Other/20210928_Order_chanel_GER.tab",sep="\t",encoding= "unicode_escape")
df_canal.columns =['BELEGSTAPELNR','LEVIER','CAMPAGNE']
df_canal

Unnamed: 0,BELEGSTAPELNR,LEVIER,CAMPAGNE
0,201848977,SEA,SEA Marque
1,201848979,SEA,SEA Marque
2,201848991,SEA,FS 10
3,201848994,SEA,SEA Marque
4,201849000,SEA,SEA Marque
...,...,...,...
2394121,201784986,SEA,SEA Marque
2394122,201784987,SEA,SEA Marque
2394123,201784988,SEA,SEA Marque
2394124,201784989,SEA,SEA Hors Marque


In [59]:
# Keep first-purchase lines with a usable BELEGSTAPELNR, then cast it to int
# so it matches the key type of df_canal.
# Missing values (NaN left by the left merge) are dropped together with the
# '$null$' sentinel: astype(int) cannot convert NaN and would raise.
has_batch_number = df_1_purchase.BELEGSTAPELNR.notnull() & (df_1_purchase.BELEGSTAPELNR!='$null$')
df_1_purchase = df_1_purchase.loc[has_batch_number]
df_1_purchase.BELEGSTAPELNR = df_1_purchase.BELEGSTAPELNR.astype(int)
del(has_batch_number)

In [60]:
# Attach the acquisition channel (LEVIER) to each first-purchase line.
# NOTE(review): if df_canal holds several rows for one BELEGSTAPELNR, this
# left merge repeats the matching lines - confirm the key is unique.
df_1_purchase = pd.merge(df_1_purchase,df_canal[['BELEGSTAPELNR','LEVIER']],on='BELEGSTAPELNR',how='left',suffixes=(False,False))

In [63]:
# Distinct customers per acquisition channel (rows) and customer category
# (columns), based on first-purchase lines.
res_canal = df_1_purchase.groupby(['CLIENT_CATEGORY','LEVIER']).agg({'KDNR': 'nunique'})
res_canal = flatten_soft(res_canal.unstack('CLIENT_CATEGORY',fill_value=0))
res_canal.to_csv(os.path.join(output_rep,'CanalAcquisition_GER.csv'),sep=";")
# Last expression of the cell so that the table is actually displayed
# (a bare expression in the middle of a cell shows nothing).
res_canal

<h3> Etudes des délais de livraison :</h3>

- Import des dates de livraison:

In [21]:
# Delivery table; the next cell uses its KDNR, AUFTNR and LIEFERDAT columns.
# NOTE(review): absolute, machine-specific path.
df_delivery = pd.read_csv("C:/Users/UgoMANTEL/Work/Github/Vertbaudet_2021/data/3. Other/20211004_Delivery_Germany.csv",sep=",")

In [22]:
# Same order key as in the main dataset: "<order number>_<customer number>".
df_delivery['ID_ORDER'] = df_delivery.AUFTNR.astype(str) + '_'+ df_delivery.KDNR.astype(str)
df_delivery['LIEFERDAT'] = pd.to_datetime(df_delivery.LIEFERDAT)
df_delivery

Unnamed: 0,KDNR,AUFTNR,LIEFERDAT,ID_ORDER
0,6478,4,2017-01-02,4_6478
1,127,87,2017-01-03,87_127
2,59032,113,2017-01-03,113_59032
3,59032,113,2017-01-03,113_59032
4,59032,113,2017-01-03,113_59032
...,...,...,...,...
18386465,33703408,2,2021-09-22,2_33703408
18386466,33703408,2,2021-09-22,2_33703408
18386467,33703408,2,2021-09-22,2_33703408
18386468,33703408,2,2021-09-27,2_33703408


- Ajout dans le dataset : 

In [23]:
# Attach the delivery date(s) of each order to its lines.
# df_delivery repeats the same (ID_ORDER, LIEFERDAT) pair many times; merging
# it as-is would repeat every order line once per delivery row. Identical
# pairs are removed first. An order delivered on several distinct dates still
# yields one copy of its lines per date, as before.
df = pd.merge(df,df_delivery[['ID_ORDER','LIEFERDAT']].drop_duplicates(),
              on='ID_ORDER',how='left',suffixes=(False,False))
del(df_delivery)

- Cleaning :

In [37]:
# Keep only the lines with a known delivery date.
# NOTE: df itself is overwritten, so every later cell works on this subset.
df = df.loc[df['LIEFERDAT'].notnull()]
# Delivery delay as a timedelta, then as a whole number of days.
df['DELAY'] = df['LIEFERDAT'] - df['DATERF']
df['DELAY_DAYS'] = df['DELAY'].dt.days

- Obtention data :

In [39]:
# Distinct customers per customer category (rows) and delivery delay in days
# (columns), exported as a ';'-separated CSV.
res_delay = df.groupby(['CLIENT_CATEGORY','DELAY_DAYS']).agg({'KDNR': pd.Series.nunique})
res_delay = flatten_soft(res_delay.unstack('DELAY_DAYS',fill_value=0))
res_delay.to_csv(os.path.join(output_rep,'delays_GERM.csv'),sep=";")

<h3> Sankey par catégorie :</h3>

- Ajouter le nombre de commandes :

In [40]:
# Rank of each order within its customer's history (1 = earliest order).

# Drop a previous ORDER_NUMBER so that re-running the cell does not create
# ORDER_NUMBER_x / ORDER_NUMBER_y columns.
df = df.drop(columns=['ORDER_NUMBER'], errors='ignore')

# One row per order with its customer and its earliest line date.
nb_order = df.groupby('ID_ORDER', sort=False).agg({'KDNR':'last','DATERF':'min'}).reset_index()
# Sort explicitly AFTER grouping: groupby orders its result by the group key,
# so sorting df by DATERF before grouping by ID_ORDER left the orders in
# lexicographic ID_ORDER order ('10_...' before '2_...'), not in date order.
# ID_ORDER is only a deterministic tie-breaker for same-day orders.
nb_order = nb_order.sort_values(by=['KDNR','DATERF','ID_ORDER']).reset_index(drop=True)
nb_order['ORDER_NUMBER'] = nb_order.groupby(['KDNR']).cumcount()+1
nb_order = nb_order[['ID_ORDER','KDNR','ORDER_NUMBER']]
df = pd.merge(df,nb_order[['ID_ORDER','ORDER_NUMBER']],on='ID_ORDER',how='left')

- One timers :

In [46]:
# NOTE: disabled export for one-timers, kept for reference only (nothing in
# this cell is executed). It would count distinct customers per
# DEPARTEMENT_CMD over the first order of 'One-timer' customers.
# df_to_graph = df.loc[(df.CLIENT_CATEGORY == 'One-timer') & (df.ORDER_NUMBER<=1)]
# res_sankey = df_to_graph.groupby(['DEPARTEMENT_CMD']).agg({'KDNR':pd.Series.nunique})
# res_sankey.to_csv(os.path.join(output_rep,str('sankey_GER_one_timers.csv')),sep=";")

- Two timers :

In [44]:
# Sankey input for two-timers: number of customers moving from the main
# department of order 1 to the main department of order 2.

# One row per customer, one column per order rank holding that order's main
# department; 'CHURN' marks a missing order.
df_to_graph = df.loc[(df.CLIENT_CATEGORY == 'Two-timer') & (df.ORDER_NUMBER<=2)]
res_sankey = df_to_graph.groupby(['KDNR','ORDER_NUMBER']).agg({'DEPARTEMENT_CMD':'last'})
res_sankey = flatten_soft(res_sankey.unstack('ORDER_NUMBER',fill_value='CHURN').dropna(how='any'))
# Keep the customers whose second order is present in the data.
res_sankey = res_sankey.loc[res_sankey['_DEPARTEMENT_CMD_2']!='CHURN']

# One flow table per transition (order i -> order i+1), concatenated once at
# the end instead of growing a frame inside the loop.
sankey_steps = []
for i in range(1, 2):
    dep_n = '_DEPARTEMENT_CMD_'+str(i)
    dep_n_1 = '_DEPARTEMENT_CMD_'+str(i+1)
    # Distinct customers per (department of order i, department of order i+1).
    res_sankey_tmp = res_sankey.groupby([dep_n,dep_n_1]).agg({'KDNR':'nunique'}).reset_index()
    res_sankey_tmp.columns = ['OrderInitial','OrderTarget','Count']

    res_sankey_tmp = res_sankey_tmp.loc[(res_sankey_tmp['OrderInitial']!='CHURN')]
    # Node labels carry the order rank, e.g. 'BEBE_1' -> 'DECO_2'; the count
    # is written between brackets.
    res_sankey_tmp['OrderInitial'] = res_sankey_tmp['OrderInitial'] + '_' + str(i)
    res_sankey_tmp['OrderTarget'] = res_sankey_tmp['OrderTarget'] + '_' + str(i+1)
    res_sankey_tmp['Count'] = '[' + res_sankey_tmp['Count'].astype(str) + ']'
    sankey_steps.append(res_sankey_tmp[['OrderInitial','Count','OrderTarget']])

res_final = pd.concat(sankey_steps)
# Written once, after all transitions have been computed.
res_final.to_csv(os.path.join(output_rep,str('sankey_GER_two_timers.csv')),sep=";")

- Recurrings :

In [45]:
# Sankey input for recurring customers: number of customers moving between
# the main departments of consecutive orders, for orders 1 to 5.

# One row per customer, one column per order rank holding that order's main
# department; 'CHURN' marks a missing order.
df_to_graph = df.loc[(df.CLIENT_CATEGORY == 'Recurring') & (df.ORDER_NUMBER<=5)]
res_sankey = df_to_graph.groupby(['KDNR','ORDER_NUMBER']).agg({'DEPARTEMENT_CMD':'last'})
res_sankey = flatten_soft(res_sankey.unstack('ORDER_NUMBER',fill_value='CHURN').dropna(how='any'))
# Keep the customers whose second and third orders are present in the data.
res_sankey = res_sankey.loc[res_sankey['_DEPARTEMENT_CMD_2']!='CHURN']
res_sankey = res_sankey.loc[res_sankey['_DEPARTEMENT_CMD_3']!='CHURN']

# One flow table per transition (order i -> order i+1), concatenated once at
# the end instead of growing a frame inside the loop.
sankey_steps = []
for i in range(1, 5):
    dep_n = '_DEPARTEMENT_CMD_'+str(i)
    dep_n_1 = '_DEPARTEMENT_CMD_'+str(i+1)
    # Distinct customers per (department of order i, department of order i+1).
    res_sankey_tmp = res_sankey.groupby([dep_n,dep_n_1]).agg({'KDNR':'nunique'}).reset_index()
    res_sankey_tmp.columns = ['OrderInitial','OrderTarget','Count']

    # A customer who already churned at order i has no outgoing flow.
    res_sankey_tmp = res_sankey_tmp.loc[(res_sankey_tmp['OrderInitial']!='CHURN')]
    # Node labels carry the order rank, e.g. 'BEBE_1' -> 'DECO_2'; the count
    # is written between brackets.
    res_sankey_tmp['OrderInitial'] = res_sankey_tmp['OrderInitial'] + '_' + str(i)
    res_sankey_tmp['OrderTarget'] = res_sankey_tmp['OrderTarget'] + '_' + str(i+1)
    res_sankey_tmp['Count'] = '[' + res_sankey_tmp['Count'].astype(str) + ']'
    sankey_steps.append(res_sankey_tmp[['OrderInitial','Count','OrderTarget']])

res_final = pd.concat(sankey_steps)
# Written once, after all transitions have been computed.
# The file name (including its spelling) is kept unchanged for downstream users.
res_final.to_csv(os.path.join(output_rep,str('sankey_GER_recurrins.csv')),sep=";")