In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import ticker
import matplotlib.dates as mdates
from datetime import datetime, time ,date ,timedelta
from dateutil.relativedelta import relativedelta
import seaborn as sns
pd.options.mode.chained_assignment = None

In [2]:
import warnings
# Silence every Python warning for the rest of the session
# (this also hides pandas deprecation / future warnings).
warnings.filterwarnings("ignore")

<h1> Functions </h1>

<h3> Fonctions générales </h3>

In [3]:
def _decimal_comma_to_float(value):
    """Convert one cell to ``float``, accepting a decimal comma in strings.

    Missing values (``None``, ``NaN``, ``pd.NA``) become ``NaN``; values that
    are already numeric are simply cast with ``float``.
    """
    if isinstance(value, str):
        return float(value.replace(',', '.'))
    if pd.isna(value):
        return float('nan')
    return float(value)


def floatise (df, list_columns_to_floatise):
    """Convert the listed columns of ``df`` to floats, in place.

    Strings written with a decimal comma (``'1,5'``) are parsed as ``1.5``.
    Unlike a plain ``x.replace(',', '.')`` on every cell, missing or already
    numeric cells do not raise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify (the same object is returned).
    list_columns_to_floatise : iterable
        Labels of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with the listed columns converted.

    Raises
    ------
    ValueError
        If a string cell cannot be parsed as a number.
    """
    for column in list_columns_to_floatise:
        df[column] = df[column].apply(_decimal_comma_to_float)
    return df

In [4]:
def intise (df, list_columns_to_floatise):
    """Cast the listed columns of ``df`` to ``int``, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify (the same object is returned).
    list_columns_to_floatise : iterable
        Labels of the columns to cast (the name is kept for compatibility
        with existing keyword callers).

    Returns
    -------
    pandas.DataFrame
        ``df`` itself.
    """
    for column_label in list_columns_to_floatise:
        as_integers = df[column_label].astype(int)
        df[column_label] = as_integers
    return df

In [5]:
def flatten (table):
    if type(table.columns)==pd.MultiIndex:
        columns_to_look = [name_tmp for name_tmp in table.columns]

        columns_df = [ str(t[0])+'_'+str(t[1]) for t in columns_to_look]
        columns_df.insert(0,table.index.name)

        df = pd.DataFrame(columns = columns_df)

        index = 0
        for i in table.index:
            row = [table[r][i] for r in columns_to_look]
            row.insert(0,i)
            df.loc[index] = row
            index = index + 1
        return(df)
    else :
        table = pd.DataFrame(table)
        table.reset_index(level=0, inplace=True)
        return table

In [6]:
def flatten_soft(dataframe):
    """Flatten a (typically unstacked) frame into plain columns.

    The index becomes the first column (named after ``dataframe.index.name``)
    and every data column is renamed by joining its levels, each prefixed with
    an underscore: ``('KDNR', True)`` becomes ``'_KDNR_True'``.

    Single-level column labels are treated as one level (``'KDNR'`` becomes
    ``'_KDNR'``) instead of being indexed character by character.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are tuples (MultiIndex) or plain labels.

    Returns
    -------
    pandas.DataFrame
        A new frame with a default integer index.
    """
    res = pd.DataFrame()
    res[dataframe.index.name] = dataframe.index
    for col in dataframe.columns:
        # A MultiIndex yields tuples; wrap plain labels so both cases join alike.
        levels = col if isinstance(col, tuple) else (col,)
        name_tmp = ''.join('_' + str(level) for level in levels)
        res[name_tmp] = dataframe[col].values
    return res

<h3> Fonctions particulières </h3>

In [7]:
def cohort_attribution (x):
    """Return the cohort for a first-order year.

    Every year strictly before 2017 is grouped into the 2016 cohort; any
    other value is returned unchanged.
    """
    return 2016 if x < 2017 else x

In [8]:
def categorize_distribution_freq(x):
    """Map a number of orders to its frequency bucket label.

    Returns ``None`` when ``x`` matches no bucket (for example 0, a value
    between two integers below 3, or NaN).
    """
    if x == 1:
        return '1 order'
    if x == 2:
        return '2 orders'
    if 3 <= x <= 5:
        return '3-5 orders'
    if 6 <= x <= 10:
        return '6-10 orders'
    if x > 10:
        return '10+ orders'
    return None

In [9]:
def categorize_distribution_order_price(x):
    """Map an order amount to its price bucket label.

    Buckets are closed on their upper bound (25 falls in ``'0-25€'``, 50 in
    ``'26-50€'``, ...). Returns ``None`` for a value that compares false to
    every bound, such as NaN.
    """
    upper_bounds = (
        (25, '0-25€'),
        (50, '26-50€'),
        (100, '51-100€'),
        (200, '101-200€'),
        (500, '201-500€'),
    )
    # Bounds are ascending, so the first one that x does not exceed is its bucket.
    for bound, label in upper_bounds:
        if x <= bound:
            return label
    if x > 500:
        return '>500€'
    return None

<h1> Dataset creation </h1>

<h3> Import de la data </h3>

In [10]:
# Record and show the kernel's current working directory.
src_root = os.getcwd()
print(src_root)

C:\Users\UgoMANTEL\Work\Github\Vertbaudet_2021\src\notebooks


- Préparation des colonnes:

In [11]:
# Columns read from each raw export (passed as `usecols` to pd.read_csv when the files are loaded).
col_names = ['AUFTRNR','KDNR','ARTNRERF','WTR','ARTGRERF','DATERF','PREISERF','PREIS','EKP','PREISNET','RABATT','RABATTSATZ',
             'MENGE','MENGERET','DEPARTEMENT','CAUFTRAGSPOS','MWST','MWSTRABATT','ANZKINDER','ANLAGEDAT',
             'DATAUFTRAG1','STRASSE','PLZ','ORT','BUNDESLAND','CLAND','CWERBESP','DATWERBESP','CLIEFERSP','DATLIEFERSP',
             'CADRESSSP','DATADRESSSP','UMSATZ1','DATLETZTAUFTR','UMSLETZTAUFTR','ANZKATLETZTAUFTR','WKOSTLETZTAUFTR',
             'DBKDNR','WKOSTENKUM','UMSATZKUM','UMSATZKUMTEL','UMSATZKUMFAX','UMSATZKUMWEB','UMSATZLIEFKUM','RUECKSTAUFTR',
             'ANZAUFTR','REFPRES','REFSTK','CODDOC','PLANEKP','RAYON','FAMILLE','DEPART','CODMARQ','ARTGROESSE','DATAUFTRAG',
             'DATERFASSUNG','DATVERARB','GUTSCHEINWERT']

In [12]:
# One-letter DEPARTEMENT code -> department label (applied with Series.map while loading the files).
dict_department = {'B':'BEBE','N':'PAP FILLE','U':'PAP GARCON','D':'CHAUSSURE','F':'FEMME',
'T':'TEXTILE HOME','P':'PUERICULTURE','W':'NON PAP ENFANT','K':'DECO','M':'CHAMBRE ET LITERIE','R':'JOUETS'}

<h3> Dataset </h3>

- Définition des répertoires :

In [13]:
# Input directory: every file it contains is read to build the dataset.
# NOTE(review): absolute, user-specific path — the notebook only runs on a machine with this layout.
repertoire = "C:/Users/UgoMANTEL/Work/Github/Vertbaudet_2021/data/1. Germany"

In [14]:
# Output directory: the result CSV files of the analyses are written here.
# NOTE(review): absolute, user-specific path — the notebook only runs on a machine with this layout.
output_rep = "C:/Users/UgoMANTEL/eleven/Engagements - Vertbaudet/5. Analyses/3. Outputs python"

- Création de la donnée:

In [15]:
# Build the order-line dataset: read every file of `repertoire`, derive the sales
# fields, keep the columns used by the analyses and stack the per-file frames into `df`.
df = pd.DataFrame()
# NOTE(review): every directory entry is handed to read_csv — assumes the folder holds only the CSV exports.
for file in os.listdir(repertoire):
    file_name_tmp = os.path.join(repertoire,file)
    df_tmp = pd.read_csv(file_name_tmp,sep=",",encoding= "unicode_escape", usecols=col_names)
    
    # Drop rows whose CAUFTRAGSPOS is the '$null$' placeholder, cast the code to int, keep codes <= 7.
    # NOTE(review): the business meaning of the `<= 7` cut-off is not visible here — confirm.
    df_tmp = df_tmp.loc[df_tmp.CAUFTRAGSPOS!='$null$']
    df_tmp.CAUFTRAGSPOS = df_tmp.CAUFTRAGSPOS.astype(int)
    df_tmp = df_tmp.loc[df_tmp.CAUFTRAGSPOS <= 7]
    
    ################### FIELD CONVERSION ###################
    df_tmp['DATERF'] = pd.to_datetime(df_tmp.DATERF)
    df_tmp['DATAUFTRAG1'] = pd.to_datetime(df_tmp.DATAUFTRAG1)
    # Codes missing from dict_department become NaN (Series.map with a dict).
    df_tmp.DEPARTEMENT = df_tmp.DEPARTEMENT.map(dict_department) 
    
    ################### FIELD CREATION ###################
    # Discounted unit price (PREIS - RABATT) times the quantity kept / ordered / returned.
    df_tmp['TOT_SALES'] = (df_tmp.PREIS - df_tmp.RABATT)*(df_tmp.MENGE - df_tmp.MENGERET)
    df_tmp['NET_DEMAND'] = (df_tmp.PREIS - df_tmp.RABATT) * df_tmp.MENGE
    df_tmp['TOT_RETURN'] = (df_tmp.PREIS - df_tmp.RABATT) * df_tmp.MENGERET
    # Undiscounted price times ordered quantity.
    df_tmp['GROSS_DEMAND'] = df_tmp.PREIS * df_tmp.MENGE
    df_tmp['YEAR_FIRST_ORDER'] = df_tmp.DATAUFTRAG1.dt.year
    df_tmp['YEAR_ORDER'] = df_tmp.DATERF.dt.year
    # First-order years before 2017 are grouped into the 2016 cohort (see cohort_attribution).
    df_tmp['COHORT'] = df_tmp.YEAR_FIRST_ORDER.apply(lambda x: cohort_attribution(x))
    df_tmp['MONTH_RECRUITMENT'] = df_tmp.DATAUFTRAG1.dt.month
    # Order identifier: "<AUFTRNR>_<KDNR>".
    df_tmp['ID_ORDER'] = df_tmp.AUFTRNR.astype(str) + '_'+ df_tmp.KDNR.astype(str)
    
    # Attach one department per order (DEPARTEMENT_CMD): NET_DEMAND is summed per
    # (order, department) and a single row per order is kept.
    # NOTE(review): the statement below is cut off after `by='NET_DEMAND',` in this copy of
    # the notebook; with keep='first' on the next line it presumably sorts by descending
    # NET_DEMAND so each order keeps its highest-demand department — confirm against the original cell.
    cat_order =  df_tmp.groupby(['ID_ORDER','DEPARTEMENT']).agg({'NET_DEMAND':sum}).reset_index().sort_values(by='NET_DEMAND',
    cat_order = cat_order.drop_duplicates(subset=['ID_ORDER'],keep='first')
    cat_order.columns = ['ID_ORDER','DEPARTEMENT_CMD','NET_DEMAND']
    # suffixes=(False,False): the merge raises instead of renaming if a column name overlaps.
    df_tmp = pd.merge(df_tmp,cat_order[['ID_ORDER','DEPARTEMENT_CMD']], on='ID_ORDER', how='left', suffixes=(False,False))
    
    # Keep only the columns used downstream before stacking this file onto df.
    df_tmp = df_tmp[['DATERF','DATAUFTRAG1','DEPARTEMENT','TOT_SALES','NET_DEMAND','GROSS_DEMAND',
                    'YEAR_FIRST_ORDER','YEAR_ORDER','COHORT','MONTH_RECRUITMENT','ID_ORDER','KDNR','ARTNRERF',
                     'DEPARTEMENT_CMD','RABATT','RABATTSATZ','PREIS','ANZKINDER','MENGE','MENGERET','TOT_RETURN']]
    df = pd.concat([df,df_tmp])
    # Release the per-file frame before reading the next file.
    del(df_tmp)

################### FIELD CREATION ###################
################### RECRUITMENT UNIVERSE
# The earliest row of each customer (by DATERF) gives the department of the first order.
cat_rec = (
    df.sort_values(by='DATERF', ascending=True)
      .drop_duplicates(subset='KDNR', keep='first')
      .loc[:, ['KDNR', 'DEPARTEMENT_CMD']]
      .rename(columns={'DEPARTEMENT_CMD': 'DEPARTEMENT_FIRST'})
)
# suffixes=(False, False): raise rather than rename if a column name overlaps.
df = df.merge(cat_rec, on='KDNR', how='left', suffixes=(False, False))

################### DATES FOR THE 12/24-MONTH LTV
# Fixed offsets of 365 and 730 days after the first-order date.
df['DATAUFTRAG1_12MONTH'] = df['DATAUFTRAG1'] + pd.Timedelta(days=365)
df['DATAUFTRAG1_24MONTH'] = df['DATAUFTRAG1'] + pd.Timedelta(days=730)

- Ajout de la catégorisation retourneur / non-retourneur (booléen `RETURNER`) :

In [16]:
# Flag each customer (KDNR) as a returner when the TOT_RETURN summed over all
# of their rows is positive, then attach the flag to every row of df.
#
# Any RETURNER column left by a previous execution is dropped first: the merge
# uses suffixes=(False, False), which raises on overlapping column names, so
# without this guard the cell could not be re-run.
df = df.drop(columns=['RETURNER'], errors='ignore')
# 'sum' (string) selects pandas' own aggregation; passing the builtin `sum` is deprecated.
res_returner = flatten(df.groupby(['KDNR']).agg({'TOT_RETURN': 'sum'}))
res_returner['RETURNER'] = res_returner.TOT_RETURN>0
df = pd.merge(df,res_returner[['KDNR','RETURNER']],on='KDNR',how='left',suffixes=(False,False))

In [17]:
# RETURNER has to stay on df: the purchase-frequency breakdown further down
# groups by ['RETURNER', 'FREQ_CATEGORY'], so dropping the column here made a
# top-to-bottom run fail with a KeyError. Check the flag is present instead.
assert 'RETURNER' in df.columns, "RETURNER flag missing: run the returner-flag cell first"

- Ajout de l'information de retour article:

In [18]:
# A row counts as a returned article when its returned quantity (MENGERET) is positive.
df['RETURNED_ARTICLE'] = df['MENGERET'].gt(0)

- Ajout du numcmde var 38:

In [51]:
# Attach BELEGSTAPELNR to every row of df, matched on the order identifier.
df_numcde = pd.read_csv("C:/Users/UgoMANTEL/Work/Github/Vertbaudet_2021/data/3. Other/20211001_NUM_CMD_CANAL_GER.csv",sep=",",encoding= "unicode_escape")
# Same "<order number>_<customer number>" key as df.ID_ORDER.
# NOTE(review): this file names the order number AUFTNR (AUFTRNR in the main exports) — confirm.
df_numcde['ID_ORDER'] = df_numcde.AUFTNR.astype(str) + '_'+ df_numcde.KDNR.astype(str)
# Drop a BELEGSTAPELNR column left by a previous execution: with
# suffixes=(False, False) the merge raises on overlapping column names, so the
# cell could not be re-run otherwise.
df = df.drop(columns=['BELEGSTAPELNR'], errors='ignore')
# NOTE(review): assumes ID_ORDER is unique in df_numcde; duplicates would multiply rows of df — confirm.
df = pd.merge(df,df_numcde[['ID_ORDER','BELEGSTAPELNR']],on='ID_ORDER',how='left',suffixes=(False,False))
del(df_numcde)

- Ajout du nombre de commandes :

In [62]:
# Number of distinct orders per customer (KDNR), attached to every row of df as NB_ORDERS.
#
# A NB_ORDERS column left by a previous execution is dropped first: with
# suffixes=(False, False) the merge raises on overlapping column names, so the
# cell could not be re-run otherwise.
df = df.drop(columns=['NB_ORDERS'], errors='ignore')
# 'nunique' selects pandas' built-in distinct count (same result as pd.Series.nunique).
nb_commande = flatten(df.groupby(['KDNR']).agg({'ID_ORDER': 'nunique'}))
nb_commande.columns = ['KDNR','NB_ORDERS']
df = pd.merge(df,nb_commande,on='KDNR',how='left',suffixes=(False,False))

In [64]:
# Bucket each customer's order count into its frequency label.
df['FREQ_CATEGORY'] = df['NB_ORDERS'].map(categorize_distribution_freq)

<h1> Étude des retours </h1>

<h3> Répartition en effectif:</h3> 

In [37]:
# For each order year, count the distinct customers who returned something that
# year (RETURNERS == True) and those who did not (RETURNERS == False).
# The per-year tables are collected in a list and concatenated once at the end
# instead of growing `res` with a concat at every iteration.
yearly_counts = []
for year in df.YEAR_ORDER.unique():
    df_tmp = df.loc[df.YEAR_ORDER == year]
    # One row per customer with the amount returned during the year.
    res_returner = flatten(df_tmp.groupby(['KDNR']).agg({'TOT_RETURN': 'sum'}))
    res_returner['RETURNERS'] = res_returner.TOT_RETURN>0
    # res_returner already holds exactly one row per customer, so the customers
    # are counted on it directly; merging the flag back onto every order line
    # of the year first gives the same counts at a much higher cost.
    rep_returner = flatten(res_returner.groupby(['RETURNERS']).agg({'KDNR': 'nunique'}))
    rep_returner['YEAR'] = year
    yearly_counts.append(rep_returner)

# pd.concat raises on an empty list, so fall back to an empty frame.
res = pd.concat(yearly_counts, axis=0) if yearly_counts else pd.DataFrame()

In [45]:
# Write the yearly returner / non-returner counts to output_rep (semicolon-separated;
# the DataFrame index is written as the first column, which is to_csv's default).
res.to_csv(os.path.join(output_rep,'rep_returners_GER.csv'),sep=";")

<h3> Répartition selon le nombre d'achats : </h3>

In [66]:
# Distinct customers per purchase-frequency bucket, one column per RETURNER value.
distrib_freq = flatten_soft(
    df.groupby(['RETURNER', 'FREQ_CATEGORY'])
      .agg({'KDNR': pd.Series.nunique})
      .unstack('RETURNER', fill_value=0)
)
distrib_freq

Unnamed: 0,FREQ_CATEGORY,_KDNR_False,_KDNR_True
0,1 order,539060,259837
1,10+ orders,2201,39782
2,2 orders,118027,168580
3,3-5 orders,67860,217146
4,6-10 orders,11016,95088


In [67]:
# Write the frequency-bucket breakdown to output_rep (semicolon-separated; index written as first column).
distrib_freq.to_csv(os.path.join(output_rep,'freq_returners_GER.csv'),sep=";")

<h3> Répartition des commandes retournées par univers:</h3> 

In [49]:
# Flag every order whose summed TOT_RETURN is positive, then count the distinct
# returned orders per order-level department (DEPARTEMENT_CMD) and order year.
#
# A RETURNED_ORDER column left by a previous execution is dropped first: with
# suffixes=(False, False) the merge raises on overlapping column names, so the
# cell could not be re-run otherwise.
df = df.drop(columns=['RETURNED_ORDER'], errors='ignore')
# 'sum' (string) selects pandas' own aggregation; passing the builtin `sum` is deprecated.
returned_order = flatten(df.groupby(['ID_ORDER']).agg({'TOT_RETURN': 'sum'}))
returned_order['RETURNED_ORDER'] = returned_order.TOT_RETURN>0
df = pd.merge(df,returned_order[['ID_ORDER','RETURNED_ORDER']],on='ID_ORDER',how='left',suffixes=(False,False))
rep_returner = df.loc[df.RETURNED_ORDER == True].groupby(['YEAR_ORDER','DEPARTEMENT_CMD']).agg({'ID_ORDER': pd.Series.nunique})
# One row per department, one column per order year.
rep_returner = flatten_soft(rep_returner.unstack('YEAR_ORDER',fill_value=0))
rep_returner

Unnamed: 0,DEPARTEMENT_CMD,_ID_ORDER_2017,_ID_ORDER_2018,_ID_ORDER_2019,_ID_ORDER_2020,_ID_ORDER_2021
0,BEBE,40202,43863,36462,33508,30335
1,CHAMBRE ET LITERIE,10838,10019,8575,10421,7800
2,CHAUSSURE,67007,58509,32437,22444,25737
3,DECO,23957,21184,18596,20164,15828
4,FEMME,52851,51936,42853,31814,33655
5,JOUETS,1646,2033,4707,9924,7235
6,NON PAP ENFANT,17509,16107,13536,15116,12669
7,PAP FILLE,127965,114007,85583,82240,77299
8,PAP GARCON,62392,52955,46600,48865,48337
9,PUERICULTURE,4498,3949,3954,4207,4333


In [50]:
# Write the returned-orders-by-department table to output_rep (semicolon-separated; index written as first column).
rep_returner.to_csv(os.path.join(output_rep,'rep_univ_returners_GER.csv'),sep=";")

<h3> Répartition des articles retournés par univers:</h3> 

In [20]:
# Returned quantity (MENGERET) summed per article department, one column per order year.
returned_article = flatten_soft(
    df.groupby(['YEAR_ORDER', 'DEPARTEMENT'])
      .agg({'MENGERET': sum})
      .unstack('YEAR_ORDER', fill_value=0)
)
returned_article

Unnamed: 0,DEPARTEMENT,_MENGERET_2017,_MENGERET_2018,_MENGERET_2019,_MENGERET_2020,_MENGERET_2021
0,BEBE,165028,177991,147742,137750,131325
1,CHAMBRE ET LITERIE,7872,8357,7528,8352,6162
2,CHAUSSURE,163822,146571,77966,49005,54024
3,DECO,48814,43617,36671,35445,25764
4,FEMME,192515,196032,161016,128316,135739
5,JOUETS,3375,3968,9576,16093,11323
6,NON PAP ENFANT,118459,103776,81889,77070,62160
7,PAP FILLE,548711,512184,361648,354277,337807
8,PAP GARCON,251801,223946,180121,195323,197451
9,PUERICULTURE,6618,5788,6043,7446,7867


In [21]:
# Write the returned-articles-by-department table to output_rep (semicolon-separated; index written as first column).
returned_article.to_csv(os.path.join(output_rep,'rep_article_returned_GER.csv'),sep=";")

<h3> Répartition des commandes retournées par canal:</h3> 

In [52]:
# Load the order-channel mapping (tab-separated file).
df_canal = pd.read_csv("C:/Users/UgoMANTEL/Work/Github/Vertbaudet_2021/data/3. Other/20210928_Order_chanel_GER.tab",sep="\t",encoding= "unicode_escape")
# Columns are renamed by position — assumes the file has exactly these three columns in this order (TODO confirm).
df_canal.columns =['BELEGSTAPELNR','LEVIER','CAMPAGNE']
df_canal

Unnamed: 0,BELEGSTAPELNR,LEVIER,CAMPAGNE
0,201848977,SEA,SEA Marque
1,201848979,SEA,SEA Marque
2,201848991,SEA,FS 10
3,201848994,SEA,SEA Marque
4,201849000,SEA,SEA Marque
...,...,...,...
2394121,201784986,SEA,SEA Marque
2394122,201784987,SEA,SEA Marque
2394123,201784988,SEA,SEA Marque
2394124,201784989,SEA,SEA Hors Marque


In [57]:
# Keep the rows that carry a usable BELEGSTAPELNR (neither missing nor the
# '$null$' placeholder) and cast it to int so it can be matched with df_canal.
df_with_channel = df.loc[df['BELEGSTAPELNR'].notnull() & (df['BELEGSTAPELNR'] != '$null$')]
df_with_channel['BELEGSTAPELNR'] = df_with_channel['BELEGSTAPELNR'].astype(int)

In [58]:
# Attach the acquisition lever (LEVIER) of each BELEGSTAPELNR; suffixes=(False, False)
# makes the merge raise rather than rename if a column name overlaps.
df_with_channel = df_with_channel.merge(
    df_canal[['BELEGSTAPELNR', 'LEVIER']],
    on='BELEGSTAPELNR',
    how='left',
    suffixes=(False, False),
)

In [59]:
# Distinct returned orders per acquisition lever, one column per order year.
rep_returner_canal = flatten_soft(
    df_with_channel.loc[df_with_channel['RETURNED_ORDER'] == True]
      .groupby(['YEAR_ORDER', 'LEVIER'])
      .agg({'ID_ORDER': pd.Series.nunique})
      .unstack('YEAR_ORDER', fill_value=0)
)
rep_returner_canal

Unnamed: 0,LEVIER,_ID_ORDER_2018,_ID_ORDER_2019,_ID_ORDER_2020,_ID_ORDER_2021
0,Affiliation,7775,18296,22555,24421
1,Comparateurs,1734,4490,2675,2758
2,Display,2829,7889,7166,5411
3,Emails Fideli,10630,30537,27776,25099
4,Emails Services,0,154,92,95
5,Medias,1,1585,2448,2739
6,Partenariats,153,496,868,341
7,Retargeting,2192,8497,9878,6898
8,Réseaux sociaux,918,5889,4413,4478
9,SEA,57926,141065,135748,138984


In [60]:
# Write the returned-orders-by-lever table to output_rep (semicolon-separated; index written as first column).
rep_returner_canal.to_csv(os.path.join(output_rep,'rep_channel_returners_GER.csv'),sep=";")