In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import ticker
import matplotlib.dates as mdates
from datetime import datetime, time ,date ,timedelta
from dateutil.relativedelta import relativedelta
import seaborn as sns
# Turn off pandas' chained-assignment (SettingWithCopy) check for the whole notebook.
pd.options.mode.chained_assignment = None

In [2]:
import warnings
# Silence every warning for the rest of the session (this also hides deprecation notices).
warnings.filterwarnings("ignore")

<h1> Functions </h1>

<h3> Fonctions générales </h3>

In [3]:
def floatise(df, list_columns_to_floatise):
    """Convert decimal-comma columns of ``df`` to floats, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; the listed columns are overwritten.
    list_columns_to_floatise : iterable
        Labels of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.

    Raises
    ------
    ValueError
        If a value cannot be parsed as a float once ``,`` is replaced by ``.``.
    """
    def _to_float(value):
        # Only text needs the decimal comma swapped. Values that are already
        # numeric (or NaN) have no ``.replace`` method, so they go straight
        # through ``float`` instead of raising AttributeError.
        if isinstance(value, str):
            value = value.replace(',', '.')
        return float(value)

    for column in list_columns_to_floatise:
        df[column] = df[column].apply(_to_float)
    return df

In [4]:
def intise(df, list_columns_to_floatise):
    """Cast the listed columns of ``df`` to ``int`` in place and return ``df``.

    ``list_columns_to_floatise`` holds the labels of the columns to cast
    (the parameter name is shared with ``floatise``).
    """
    for column_name in list_columns_to_floatise:
        as_integers = df[column_name].astype(int)
        df[column_name] = as_integers
    return df

In [5]:
def flatten(table):
    """Return ``table`` as a flat frame whose index has become a regular column.

    Parameters
    ----------
    table : pandas.DataFrame or pandas.Series
        Typically the result of a ``groupby(...).agg(...)``.

    Returns
    -------
    pandas.DataFrame
        * MultiIndex columns: every column tuple is joined with ``_``
          (``('v', 'sum')`` -> ``'v_sum'``), the index is inserted as the
          first column under ``table.index.name`` and rows are renumbered
          from 0.
        * Otherwise: ``table`` converted to a frame with index level 0 moved
          into the columns.

    ``table`` itself is never modified.
    """
    if isinstance(table.columns, pd.MultiIndex):
        # Join every level; for the usual two-level case this gives exactly
        # "<level0>_<level1>".
        flat_names = ['_'.join(str(level) for level in column)
                      for column in table.columns]
        df = table.copy()
        df.columns = flat_names
        # Insert while the original index is still attached so the lengths
        # match; to_flat_index keeps one value (a tuple for a multi-level
        # index) per row. Doing this on whole columns keeps their dtypes,
        # unlike filling an empty frame row by row.
        df.insert(0, table.index.name, table.index.to_flat_index())
        return df.reset_index(drop=True)

    # Non-inplace reset_index so the caller's object is left untouched.
    return pd.DataFrame(table).reset_index(level=0)

In [6]:
def flatten_soft(dataframe):
    """Flatten the columns of ``dataframe`` and expose its index as a column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with MultiIndex or flat columns.

    Returns
    -------
    pandas.DataFrame
        The first column holds the index values (named
        ``dataframe.index.name``). Every other column is named by prefixing
        each level of the original label with ``_``
        (``('TOT_SALES', 'sum')`` -> ``'_TOT_SALES_sum'``; flat label
        ``'x'`` -> ``'_x'``).
    """
    res = pd.DataFrame()
    res[dataframe.index.name] = dataframe.index
    for col in dataframe.columns:
        # A flat label is treated as a one-level tuple. Measuring it with
        # len() would iterate over the characters of a string label.
        levels = col if isinstance(col, tuple) else (col,)
        name_tmp = ''.join('_' + str(level) for level in levels)
        res[name_tmp] = dataframe[col].values
    return res

<h3> Fonctions particulières </h3>

In [7]:
def cohort_attribution(x):
    """Return the cohort label for first-order year ``x``.

    Years up to and including 2017 are grouped under 2017; any other value
    is returned unchanged.
    """
    return 2017 if x <= 2017 else x

In [8]:
def cohort_attribution_bis(x):
    """Return the cohort label for first-order year ``x``.

    Years strictly before 2017 are grouped under 2016; any other value
    (2017 included) is returned unchanged.
    """
    return 2016 if x < 2017 else x

<h1> Dataset creation </h1>

<h3> Import de la data </h3>

In [9]:
# Record and display the kernel's current working directory.
src_root = os.getcwd()
print(src_root)

C:\Users\UgoMANTEL\Work\Github\Vertbaudet_2021\src\notebooks


- Préparation des colonnes:

In [10]:
# Column names of the raw order-line exports; passed to pd.read_csv as
# `usecols` / `names` in the cells below.
col_names = ['AUFTRNR','KDNR','ARTNRERF','WTR','ARTGRERF','DATERF','PREISERF','PREIS','EKP','PREISNET','RABATT','RABATTSATZ',
             'MENGE','MENGERET','DEPARTEMENT','CAUFTRAGSPOS','MWST','MWSTRABATT','ANZKINDER','ANLAGEDAT',
             'DATAUFTRAG1','STRASSE','PLZ','ORT','BUNDESLAND','CLAND','CWERBESP','DATWERBESP','CLIEFERSP','DATLIEFERSP',
             'CADRESSSP','DATADRESSSP','UMSATZ1','DATLETZTAUFTR','UMSLETZTAUFTR','ANZKATLETZTAUFTR','WKOSTLETZTAUFTR',
             'DBKDNR','WKOSTENKUM','UMSATZKUM','UMSATZKUMTEL','UMSATZKUMFAX','UMSATZKUMWEB','UMSATZLIEFKUM','RUECKSTAUFTR',
             'ANZAUFTR','REFPRES','REFSTK','CODDOC','PLANEKP','RAYON','FAMILLE','DEPART','CODMARQ','ARTGROESSE','DATAUFTRAG',
             'DATERFASSUNG','DATVERARB','GUTSCHEINWERT']

# Earlier version of the list, kept for reference: it has no 'CLAND',
# 'RUECKSTAUFTR' or 'ANZAUFTR' entries.
# col_names = ['AUFTRNR','KDNR','ARTNRERF','WTR','ARTGRERF','DATERF','PREISERF','PREIS','EKP','PREISNET',
#      'RABATT','RABATTSATZ','MENGE','MENGERET','DEPARTEMENT','CAUFTRAGSPOS','MWST','MWSTRABATT',
#      'ANZKINDER','ANLAGEDAT','DATAUFTRAG1','STRASSE','PLZ','ORT','BUNDESLAND','CWERBESP','DATWERBESP','CLIEFERSP'
#      ,'DATLIEFERSP','CADRESSSP','DATADRESSSP','UMSATZ1','DATLETZTAUFTR','UMSLETZTAUFTR','ANZKATLETZTAUFTR','WKOSTLETZTAUFTR',
#      'DBKDNR','WKOSTENKUM','UMSATZKUM','UMSATZKUMTEL','UMSATZKUMFAX','UMSATZKUMWEB','UMSATZLIEFKUM','REFPRES','REFSTK','CODDOC',
#      'PLANEKP','RAYON','FAMILLE','DEPART','CODMARQ','ARTGROESSE','DATAUFTRAG','DATERFASSUNG','DATVERARB','GUTSCHEINWERT']

In [11]:
# Quick check of how many columns the list declares.
len(col_names)

59

In [12]:
# Maps the one-letter DEPART code of an order line to its department label.
dict_department = dict([
    ('B', 'BEBE'),
    ('N', 'PAP FILLE'),
    ('U', 'PAP GARCON'),
    ('D', 'CHAUSSURE'),
    ('F', 'FEMME'),
    ('T', 'TEXTILE HOME'),
    ('P', 'PUERICULTURE'),
    ('W', 'NON PAP ENFANT'),
    ('K', 'DECO'),
    ('M', 'CHAMBRE ET LITERIE'),
    ('R', 'JOUETS'),
])

- Import:

In [13]:
# df = pd.read_csv("C:/Users/UgoMANTEL/Work/Github/Vertbaudet_2021/data/1. Germany/20210915_Export_Germany_17_18.csv",sep=",",encoding= "unicode_escape", names=col_names, skiprows=
#     1)
# df = pd.read_csv("C:/Users/UgoMANTEL/Work/Github/Vertbaudet_2021/data/1. Germany/20210922_Export_Germany_19_20.tab",sep="\t",encoding='Latin1')

<h3> Dataset pour cohortes </h3>

- Définition des répertoires :

In [14]:
# Folder whose files are read one by one to build the cohort dataset.
# Defaults to the original workstation path; set the VERTBAUDET_DATA_DIR
# environment variable to run the notebook on another machine.
repertoire = os.environ.get(
    "VERTBAUDET_DATA_DIR",
    "C:/Users/UgoMANTEL/Work/Github/Vertbaudet_2021/data/1. Germany",
)

In [15]:
# Folder receiving every CSV exported by this notebook.
# Defaults to the original workstation path; set the VERTBAUDET_OUTPUT_DIR
# environment variable to write somewhere else.
output_rep = os.environ.get(
    "VERTBAUDET_OUTPUT_DIR",
    "C:/Users/UgoMANTEL/eleven/Engagements - Vertbaudet/5. Analyses/3. Outputs python",
)

- Création de la donnée:

In [16]:
# Build one row per customer (KDNR) holding, for every order year found in the
# export files: net demand (SALES_<year>), number of distinct orders
# (NB_COMMANDE_<year>) and an activity flag (ACTIVE_<year>), plus the
# customer's COHORT label.
df = pd.DataFrame(columns=['KDNR','COHORT'])
# sorted(): os.listdir returns entries in arbitrary order; a fixed order keeps
# the resulting column order reproducible from run to run.
for file in sorted(os.listdir(repertoire)):
    file_name_tmp = os.path.join(repertoire,file)
    if not os.path.isfile(file_name_tmp):
        # Sub-directories cannot be read as CSV.
        continue
    df_tmp = pd.read_csv(file_name_tmp,sep=",",encoding= "unicode_escape", usecols=col_names)
#     df_tmp = pd.read_csv(file_name_tmp,sep="\t",encoding= "Latin1")

    # Keep only lines whose CAUFTRAGSPOS is known and at most 7.
    # NOTE(review): the business meaning of the threshold 7 is not stated in this notebook.
    df_tmp = df_tmp.loc[df_tmp.CAUFTRAGSPOS!='$null$']
    df_tmp['CAUFTRAGSPOS'] = df_tmp['CAUFTRAGSPOS'].astype(int)
    df_tmp = df_tmp.loc[df_tmp.CAUFTRAGSPOS <= 7]

    ################### FIELD CONVERSION ###################
    df_tmp['DATERF'] = pd.to_datetime(df_tmp.DATERF)
    df_tmp['DATAUFTRAG1'] = pd.to_datetime(df_tmp.DATAUFTRAG1)

    ################### FIELD CREATION ###################
    df_tmp['TOT_SALES'] = (df_tmp.PREIS - df_tmp.RABATT)*(df_tmp.MENGE - df_tmp.MENGERET)
    df_tmp['GROSS_DEMAND'] = df_tmp.PREIS * df_tmp.MENGE
    df_tmp['NET_DEMAND'] = (df_tmp.PREIS - df_tmp.RABATT) * df_tmp.MENGE
    df_tmp['YEAR_FIRST_ORDER'] = df_tmp.DATAUFTRAG1.dt.year
    df_tmp['YEAR_ORDER'] = df_tmp.DATERF.dt.year
    df_tmp['COHORT'] = df_tmp.YEAR_FIRST_ORDER.apply(cohort_attribution_bis)

    # dropna + int(): a missing DATERF turns the year column into floats, which
    # would otherwise yield column names such as 'SALES_2017.0' / 'SALES_nan'.
    for year in df_tmp.YEAR_ORDER.dropna().unique():
        year = int(year)
        df_tmp_peryear = df_tmp.loc[df_tmp.YEAR_ORDER == year]

        ################### GROUP BY CUSTOMER ###################
        # String aggregation names: passing the builtin `sum` or
        # `pd.Series.nunique` as callables is deprecated in recent pandas.
        CA_tmp = flatten(df_tmp_peryear.groupby(['KDNR']).agg({'NET_DEMAND': 'sum', 'COHORT':'last','AUFTRNR':'nunique'}))
        CA_tmp.columns = ['KDNR',str('SALES_'+str(year)),str('COHORT_'+str(year)),str('NB_COMMANDE_'+str(year))]
        CA_tmp[str('ACTIVE_'+str(year))] = 1
        # suffixes=(False, False): pandas raises if the same year's columns
        # already exist, i.e. if one order year is spread over several files.
        df = pd.merge(df,CA_tmp, on='KDNR', how='outer', suffixes=(False,False))
        # The first cohort label seen for a customer wins.
        df['COHORT'] = df['COHORT'].combine_first(df[str('COHORT_'+str(year))])
        df.drop(columns=[str('COHORT_'+str(year))], inplace=True)
        del(df_tmp_peryear)
        del(CA_tmp)

    del(df_tmp)

# Customers absent from a given year get 0 sales / orders / activity
# (a missing COHORT label also becomes 0).
df = df.fillna(0)

In [17]:
# Save the per-customer table as a ';'-separated CSV in the output folder.
file_name= 'raw_data_cohort_germany.csv'
df.to_csv(os.path.join(output_rep,file_name),sep=";")

In [28]:
# NOTE(review): the saved output of this cell lists the ACTIVE_<year>_CUM columns,
# which are only created in a later cell — it was captured out of order, so a
# fresh top-to-bottom run will show fewer columns here.
df.columns

Index(['COHORT', 'KDNR', 'SALES_2017', 'NB_COMMANDE_2017', 'ACTIVE_2017',
       'SALES_2018', 'NB_COMMANDE_2018', 'ACTIVE_2018', 'SALES_2019',
       'NB_COMMANDE_2019', 'ACTIVE_2019', 'SALES_2020', 'NB_COMMANDE_2020',
       'ACTIVE_2020', 'SALES_2021', 'NB_COMMANDE_2021', 'ACTIVE_2021',
       'ACTIVE_2017_CUM', 'ACTIVE_2018_CUM', 'ACTIVE_2019_CUM',
       'ACTIVE_2020_CUM', 'ACTIVE_2021_CUM'],
      dtype='object')

<h1> Analyses cohortes </h1>

<h3> Cohortes en volume & valeur </h3>

- Groupement par cohortes:

In [17]:
# Total sales and number of active customers per cohort, for each order year.
# The column list is built once and reused for the aggregation and the final
# header; 'sum' is passed by name because the builtin `sum` callable is
# deprecated in recent pandas `agg`.
cohort_metric_columns = (['SALES_' + str(y) for y in range(2017, 2022)]
                         + ['ACTIVE_' + str(y) for y in range(2017, 2022)])
df_by_cohort = flatten(df.groupby(['COHORT']).agg({column: 'sum' for column in cohort_metric_columns}))
df_by_cohort.columns = ['COHORT'] + cohort_metric_columns

- Export:

In [19]:
# Save the per-cohort totals as a ';'-separated CSV in the output folder.
file_name_bis = 'data_cohort_germany_DN.csv'
df_by_cohort.to_csv(os.path.join(output_rep,file_name_bis),sep=";")

<h3> Cohortes des acheteurs passifs en volume </h3>

- Construct the data:

In [20]:
# ACTIVE_<year>_CUM is the row-wise maximum of ACTIVE_<year> ... ACTIVE_2021,
# i.e. non-zero when the customer was active in that year or any later one.
active_years = [2017, 2018, 2019, 2020, 2021]
for position, first_year in enumerate(active_years):
    window = ['ACTIVE_' + str(y) for y in active_years[position:]]
    df['ACTIVE_' + str(first_year) + '_CUM'] = df[window].max(axis=1)

- Clean the data:

In [21]:
# A customer whose cohort label is later than <year> gets ACTIVE_<year>_CUM = 0.
for cohort_limit in (2017, 2018, 2019, 2020):
    df.loc[df['COHORT'] > cohort_limit, 'ACTIVE_' + str(cohort_limit) + '_CUM'] = 0

In [22]:
# Per cohort, sum the ACTIVE_<year>_CUM flags and export the result.
# 'sum' is passed by name because the builtin `sum` callable is deprecated in
# recent pandas `agg`.
cum_columns = ['ACTIVE_' + str(y) + '_CUM' for y in range(2017, 2022)]
df_by_cohort_cum = flatten(df.groupby(['COHORT']).agg({column: 'sum' for column in cum_columns}))
# The exported header drops the _CUM suffix.
df_by_cohort_cum.columns = ['COHORT','ACTIVE_2017', 'ACTIVE_2018', 'ACTIVE_2019','ACTIVE_2020', 'ACTIVE_2021']
df_by_cohort_cum.to_csv(os.path.join(output_rep,'data_cohort_germany_cum_DN.csv'),sep=";")

<h3> Etude des one-timers</h3>

In [24]:
# One-timer study: for customers with a cohort label of 2017 or later, flag
# per year those whose cumulative number of orders up to that year equals 1,
# then count those flags and the active customers per cohort.
# .copy(): columns are added below, so work on an independent frame.
df_small = df.loc[df.COHORT>=2017].copy()

# Loop over the fixed list of years rather than over the cohort labels present
# in the data: the aggregation below needs all five ONE_TIMER_<year> columns
# and would raise KeyError if one cohort happened to be empty.
one_timer_years = [2017, 2018, 2019, 2020, 2021]
orders_so_far = 0
for year in one_timer_years:
    orders_so_far = orders_so_far + df_small['NB_COMMANDE_' + str(year)]
    df_small['NB_COMMANDE_CUM_' + str(year)] = orders_so_far
    df_small['ONE_TIMER_' + str(year)] = orders_so_far == 1

# 'sum' is passed by name: the builtin `sum` callable is deprecated in recent pandas `agg`.
one_timer_metrics = (['ONE_TIMER_' + str(y) for y in one_timer_years]
                     + ['ACTIVE_' + str(y) for y in one_timer_years])
one_timer = flatten(df_small.groupby(['COHORT']).agg({column: 'sum' for column in one_timer_metrics}))

In [26]:
# Save the one-timer counts per cohort as a ';'-separated CSV in the output folder.
file_name_onetimers = 'one_timer_per_cohort_v2.csv'
one_timer.to_csv(os.path.join(output_rep,file_name_onetimers),sep=";")

<h3> Représentation waterfall </h3>

- Avec la fonction cohort_attribution originale!

In [28]:
 # Years iterated by the waterfall loop below: from one past the smallest cohort label up to (excluding) the largest.
 range(int(min(df.COHORT.unique()))+1,int(max(df.COHORT.unique())))

range(2017, 2021)

In [36]:
# Waterfall decomposition of the sales change between `year` and `year + 1`,
# computed per customer:
#   NEW_BUSINESS_<year+1>: sales of customers inactive in `year`, active in
#                          `year + 1`, whose cohort label is `year + 1`
#   LFL_<year>           : sales change of customers active in both years
#   CHURN_<year+1>       : minus the `year` sales of customers active in
#                          `year` and inactive in `year + 1`
#   REACTIVATED_<year+1> : sales of customers inactive in `year`, active in
#                          `year + 1`, whose cohort label is before `year + 1`
# Boolean masks with np.where replace the row-by-row `df.apply(..., axis=1)`
# lambdas: same values, without one Python call per customer and per column.
for year in range(int(min(df.COHORT.unique()))+1,int(max(df.COHORT.unique()))):
    label_prev = str(year)
    label_next = str(year+1)

    sales_prev = df['SALES_'+label_prev]
    sales_next = df['SALES_'+label_next]
    active_prev = df['ACTIVE_'+label_prev] == 1
    inactive_prev = df['ACTIVE_'+label_prev] == 0
    active_next = df['ACTIVE_'+label_next] == 1
    inactive_next = df['ACTIVE_'+label_next] == 0
    gained = inactive_prev & active_next

    df['NEW_BUSINESS_'+label_next] = np.where(gained & (df['COHORT'] == year+1), sales_next, 0)
    df['LFL_'+label_prev] = np.where(active_prev & active_next, sales_next - sales_prev, 0)
    df['CHURN_'+label_next] = np.where(active_prev & inactive_next, -sales_prev, 0)
    df['REACTIVATED_'+label_next] = np.where(gained & (df['COHORT'] < year+1), sales_next, 0)

In [37]:
# Export the column totals of the waterfall, ordered year by year:
# SALES_<y>, NEW_BUSINESS_<y+1>, LFL_<y>, CHURN_<y+1>, REACTIVATED_<y+1>, ..., SALES_2021.
file_name_waterfall = 'waterfallbisDN.csv'
waterfall_columns = []
for start_year in range(2017, 2021):
    following = str(start_year + 1)
    waterfall_columns.extend([
        'SALES_' + str(start_year),
        'NEW_BUSINESS_' + following,
        'LFL_' + str(start_year),
        'CHURN_' + following,
        'REACTIVATED_' + following,
    ])
waterfall_columns.append('SALES_2021')
waterfall_totals = df[waterfall_columns].sum(axis=0)
waterfall_totals.to_csv(os.path.join(output_rep,file_name_waterfall),sep=";")

<h3> Zoom sur le comportement des cohortes</h3>

- Cohort Prior 2017 en 2020:

In [None]:
# Load the 2020 export; the file's own header row is skipped and replaced by col_names.
df_2020 = pd.read_csv(
    "C:/Users/UgoMANTEL/Work/Github/Vertbaudet_2021/data/20210910_Export_Germany_2020.csv",
    sep=",",
    encoding="unicode_escape",
    names=col_names,
    skiprows=1,
)

In [125]:
# Discounted unit price, applied to the quantity net of MENGERET (TOT_SALES)
# and to the full quantity (NET_DEMAND).
discounted_price_2020 = df_2020['PREIS'] - df_2020['RABATT']
df_2020['TOT_SALES'] = discounted_price_2020 * (df_2020['MENGE'] - df_2020['MENGERET'])
df_2020['NET_DEMAND'] = discounted_price_2020 * df_2020['MENGE']

In [126]:
# 2020 order lines of the customers whose COHORT label is 2017.
# NOTE(review): the section title says "Prior 2017". With cohort_attribution_bis
# (used when building df) pre-2017 customers are labelled 2016, so this filter
# only matches the "prior" group if COHORT was built with cohort_attribution — confirm.
# .copy(): a column is assigned on this frame in the next cell, so make it an
# explicit independent copy instead of relying on the chained-assignment check
# being switched off.
zoom_2017 = df_2020.loc[df_2020.KDNR.isin(df.loc[df.COHORT==2017].KDNR.unique())].copy()

In [127]:
# Overwrite the imported DEPARTEMENT column with the label matching the DEPART code.
zoom_2017['DEPARTEMENT'] = zoom_2017['DEPART'].map(dict_department)

In [131]:
# TOT_SALES summed per department, displayed from largest to smallest.
sales_per_department_2020 = zoom_2017.groupby(['DEPARTEMENT']).agg({'TOT_SALES': ['sum']})
rev_by_dep = flatten_soft(sales_per_department_2020)
rev_by_dep.columns = ['DEPARTEMENT','TOT_SALES']
rev_by_dep.sort_values(by='TOT_SALES',ascending=False)

Unnamed: 0,DEPARTEMENT,TOT_SALES
7,PAP FILLE,6864285.0
8,PAP GARCON,4369744.0
1,CHAMBRE ET LITERIE,2827234.0
6,NON PAP ENFANT,2411793.0
3,DECO,1813648.0
10,TEXTILE HOME,1213997.0
0,BEBE,1015957.0
5,JOUETS,877754.0
2,CHAUSSURE,838185.3
4,FEMME,577594.6


- Cohort Prior 2017 en 2017:

In [132]:
# Load the 2017 export; the file's own header row is skipped and replaced by col_names.
df_2017 = pd.read_csv(
    "C:/Users/UgoMANTEL/Work/Github/Vertbaudet_2021/data/20210910_Export_Germany_2017.csv",
    sep=",",
    encoding="unicode_escape",
    names=col_names,
    skiprows=1,
)

In [133]:
# Discounted unit price, applied to the quantity net of MENGERET (TOT_SALES)
# and to the full quantity (NET_DEMAND).
discounted_price_2017 = df_2017['PREIS'] - df_2017['RABATT']
df_2017['TOT_SALES'] = discounted_price_2017 * (df_2017['MENGE'] - df_2017['MENGERET'])
df_2017['NET_DEMAND'] = discounted_price_2017 * df_2017['MENGE']

In [134]:
# 2017 order lines of the customers whose COHORT label is 2017.
# NOTE(review): the section title says "Prior 2017". With cohort_attribution_bis
# (used when building df) pre-2017 customers are labelled 2016, so this filter
# only matches the "prior" group if COHORT was built with cohort_attribution — confirm.
# .copy(): a column is assigned on this frame in the next cell, so make it an
# explicit independent copy instead of relying on the chained-assignment check
# being switched off.
zoom_2017_2017 = df_2017.loc[df_2017.KDNR.isin(df.loc[df.COHORT==2017].KDNR.unique())].copy()

In [135]:
# Overwrite the imported DEPARTEMENT column with the label matching the DEPART code.
zoom_2017_2017['DEPARTEMENT'] = zoom_2017_2017['DEPART'].map(dict_department)

In [136]:
# TOT_SALES summed per department, displayed from largest to smallest.
sales_per_department_2017 = zoom_2017_2017.groupby(['DEPARTEMENT']).agg({'TOT_SALES': ['sum']})
rev_by_dep_2017 = flatten_soft(sales_per_department_2017)
rev_by_dep_2017.columns = ['DEPARTEMENT','TOT_SALES']
rev_by_dep_2017.sort_values(by='TOT_SALES',ascending=False)

Unnamed: 0,DEPARTEMENT,TOT_SALES
7,PAP FILLE,14437790.0
8,PAP GARCON,8154574.0
1,CHAMBRE ET LITERIE,5952854.0
3,DECO,4876841.0
0,BEBE,4777328.0
6,NON PAP ENFANT,4319716.0
2,CHAUSSURE,3850059.0
4,FEMME,3616409.0
10,TEXTILE HOME,3188721.0
9,PUERICULTURE,661634.5
