In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import ticker
import matplotlib.dates as mdates
from datetime import datetime, time ,date ,timedelta
from dateutil.relativedelta import relativedelta
import seaborn as sns
import gc
pd.options.mode.chained_assignment = None

from datetime import date
today = date.today().strftime("%Y%m%d")

import warnings
warnings.filterwarnings("ignore")

<h1> Functions </h1>

<h3> Fonctions générales </h3>

In [3]:
def floatise (df, list_columns_to_floatise):
    for i in list_columns_to_floatise:
        df[i] = df[i].apply(lambda x: float(x.replace(',','.')))
    return (df)

In [4]:
def intise (df, list_columns_to_floatise):
    for i in list_columns_to_floatise:
        df[i] = df[i].astype(int)
    return (df)

In [5]:
def flatten (table):
    if type(table.columns)==pd.MultiIndex:
        columns_to_look = [name_tmp for name_tmp in table.columns]

        columns_df = [ str(t[0])+'_'+str(t[1]) for t in columns_to_look]
        columns_df.insert(0,table.index.name)

        df = pd.DataFrame(columns = columns_df)

        index = 0
        for i in table.index:
            row = [table[r][i] for r in columns_to_look]
            row.insert(0,i)
            df.loc[index] = row
            index = index + 1
        return(df)
    else :
        table = pd.DataFrame(table)
        table.reset_index(level=0, inplace=True)
        return table

In [6]:
def flatten_soft(dataframe):
    res = pd.DataFrame()
    res[dataframe.index.name] = dataframe.index
    for col in dataframe.columns:
        name_tmp=""
        for i in range(len(dataframe.columns[0])):
            name_tmp = name_tmp +'_'+str(col[i])
        res[str(name_tmp)] = dataframe[col].values
    return res

<h3> Fonctions particulières </h3>

In [7]:
def cohort_attribution(x):
    if x <= 2016:
        return 2016
    else :
        return x

In [8]:
def customer_category_attribution(x):
    if x==1:
        return 'One-timer'
    if x==2:
        return 'Two-timer'
    else:
        return 'Recurring'

In [9]:
def describe_discount(x):
    if x<5:
        return '<5%'
    if (x>=5) and (x<10):
        return( '5-10%')
    if (x>=10) and (x<20):
        return( '10-20%')
    if (x>=20) and (x<30):
        return( '20-30%')
    if (x>=30) and (x<40):
        return( '30-40%')
    if (x>=40) and (x<50):
        return( '40-50%')
    if (x>=50) and (x<60):
        return( '50-60%')
    if (x>=60) and (x<70):
        return( '60-70%')
    if (x>=70):
        return( '>70%')  

In [10]:
dict_department = {'B':'BEBE','N':'PAP FILLE','U':'PAP GARCON','D':'CHAUSSURE','F':'FEMME',
'T':'TEXTILE HOME','P':'PUERICULTURE','W':'NON PAP ENFANT','K':'DECO','M':'CHAMBRE ET LITERIE','R':'JOUETS'}

In [11]:
repertoire = "C:/Users/pierrick/Documents/Vertbaudet/Germany/Source_file"

In [12]:
output_rep = "C:/Users/pierrick/Documents/Vertbaudet/Germany/Output_python"

<h1> Dataset creation </h1>

<h3> Import de la data </h3>

In [15]:
#columns = ['AUFTRNR', 'KDNR', 'ARTNRERF', 'WTR', 'ARTGRERF', 'DATERF', 'PREISERF', 'PREIS', 'EKP', 'PREISNET',
#           'RABATT', 'RABATTSATZ', 'MENGE', 'MENGERET', 'DEPARTEMENT', 'CAUFTRAGSPOS', 'MWST', 'MWSTRABATT', 
#           'ANZKINDER', 'ANLAGEDAT', 'DATAUFTRAG1', 'STRASSE', 'PLZ', 'ORT', 'BUNDESLAND', 'CLAND', 'CWERBESP', 
#           'DATWERBESP', 'CLIEFERSP', 'DATLIEFERSP', 'CADRESSSP', 'DATADRESSSP', 'UMSATZ1', 'DATLETZTAUFTR', 
#           'UMSLETZTAUFTR', 'ANZKATLETZTAUFTR', 'WKOSTLETZTAUFTR', 'DBKDNR', 'WKOSTENKUM', 'UMSATZKUM', 'UMSATZKUMTEL', 
#           'UMSATZKUMFAX', 'UMSATZKUMWEB', 'UMSATZLIEFKUM', 'RUECKSTAUFTR', 'ANZAUFTR', 'REFPRES', 'REFSTK', 'CODDOC', 
#           'PLANEKP', 'RAYON', 'FAMILLE', 'DEPART', 'CODMARQ', 'ARTGROESSE', 'DATAUFTRAG', 'DATERFASSUNG', 'DATVERARB', 
#           'GUTSCHEINWERT']

In [13]:
columns = ['AUFTRNR', 'KDNR', 'DATERF', 'PREIS', 'PREISNET', 'RABATT', 'RABATTSATZ', 'MENGE', 'MENGERET', 'DEPARTEMENT', 'CAUFTRAGSPOS', 'DATAUFTRAG1', 'PLZ']

# Import

In [14]:
net_demand = True

In [15]:
df = pd.DataFrame()
for file in os.listdir(repertoire):
    file_name_tmp = os.path.join(repertoire,file)
    df_tmp = pd.read_csv(file_name_tmp,sep=",",encoding= "unicode_escape", usecols=columns)
    
    df_tmp = df_tmp.loc[df_tmp.CAUFTRAGSPOS!='$null$']
    df_tmp.CAUFTRAGSPOS = df_tmp.CAUFTRAGSPOS.astype(int)
    df_tmp = df_tmp.loc[df_tmp.CAUFTRAGSPOS <= 7]
    
    ################### CONVERSION DES CHAMPS ###################
    df_tmp['DATERF'] = pd.to_datetime(df_tmp.DATERF)
    df_tmp['DATAUFTRAG1'] = pd.to_datetime(df_tmp.DATAUFTRAG1)
    df_tmp.DEPARTEMENT = df_tmp.DEPARTEMENT.map(dict_department) 
    df_tmp['PLZ'] = df_tmp['PLZ'].astype(str)
    
    ################### CREATION DES CHAMPS ###################
    df_tmp['TOT_SALES'] = (df_tmp.PREIS - df_tmp.RABATT)*(df_tmp.MENGE - df_tmp.MENGERET)
    df_tmp['NET_DEMAND'] = (df_tmp.PREIS - df_tmp.RABATT) * df_tmp.MENGE
    df_tmp['GROSS_DEMAND'] = df_tmp.PREIS * df_tmp.MENGE
    df_tmp['YEAR_FIRST_ORDER'] = df_tmp.DATAUFTRAG1.dt.year
    df_tmp['YEAR_ORDER'] = df_tmp.DATERF.dt.year
    df_tmp['COHORT'] = df_tmp.YEAR_FIRST_ORDER.apply(lambda x: cohort_attribution(x))
    df_tmp['MONTH_RECRUITMENT'] = df_tmp.DATAUFTRAG1.dt.month
    df_tmp['ID_ORDER'] = df_tmp.AUFTRNR.astype(str) + '_'+ df_tmp.KDNR.astype(str)
    
    if net_demand:
        cat_order = df_tmp.sort_values(by='NET_DEMAND',ascending=False).drop_duplicates(subset='ID_ORDER', keep='first',
                                                                                        inplace=False)
    else:
        cat_order = df_tmp.sort_values(by='TOT_SALES',ascending=False).drop_duplicates(subset='ID_ORDER', keep='first',
                                                                                        inplace=False)
    cat_order = cat_order [['ID_ORDER','DEPARTEMENT']]
    cat_order.columns = ['ID_ORDER','DEPARTEMENT_CMD']
    df_tmp = pd.merge(df_tmp,cat_order,on='ID_ORDER',how='left',suffixes=(False,False))
    
    df_tmp = df_tmp[['DATERF','DATAUFTRAG1','DEPARTEMENT','TOT_SALES','NET_DEMAND','GROSS_DEMAND',
                    'YEAR_FIRST_ORDER','YEAR_ORDER','COHORT','MONTH_RECRUITMENT','ID_ORDER','KDNR',
                     'DEPARTEMENT_CMD','RABATT','RABATTSATZ','PREIS', 'PLZ', ]]
    df = pd.concat([df,df_tmp])
    del(df_tmp)

################### CREATION DE CHAMPS ###################
################### UNIVERS RECRUTEMENT
cat_rec = df.sort_values(by='DATERF',ascending=True).drop_duplicates(subset='KDNR', keep='first', inplace=False)[['KDNR','DEPARTEMENT_CMD']] 
cat_rec.columns = ['KDNR','DEPARTEMENT_FIRST']
df = pd.merge(df,cat_rec,on='KDNR',how='left',suffixes=(False,False))

In [16]:
### Add number of cmd and CLIENT_CATEGORY
nb_commande = flatten(df.groupby(['KDNR']).agg({'ID_ORDER':pd.Series.nunique}))
nb_commande.columns = ['KDNR','NB_ORDERS']
df = pd.merge(df,nb_commande,on='KDNR',how='left',suffixes=(False,False))
df['CLIENT_CATEGORY'] = df.NB_ORDERS.apply(lambda x: customer_category_attribution(x))

In [17]:
### SELECT COHORTS
df = df[df['COHORT'].isin(range(2017,2021))]

In [18]:
gc.collect()

109

- Rajout du compte de commande

In [25]:
nb_order =  flatten(df.groupby(['ID_ORDER']).agg({'KDNR':'last', 'DATERF': 'last'})).sort_values(by='DATERF',ascending=True)
nb_order['ORDER_NUMBER'] = nb_order.groupby(['KDNR']).cumcount()+1
df = pd.merge(df,nb_order[['ID_ORDER','ORDER_NUMBER']],on='ID_ORDER',how='left')

- Segmentation selon fréquence

In [19]:
def segmentation_freq(x):
    if x.days < 30:
        return '< 1 month'
    elif x.days < 90:
        return '1-3 months'
    elif x.days < 180:
        return '3-6 months'
    elif x.days < 270:
        return '6-9 months'
    elif x.days < 365 :
        return '9-12 months'
    elif x.days > 365 :
        return '> 12 months'

In [20]:
def segmentation_small_freq(x):
    if x.days < 7:
        return '< 1 week'
    elif x.days < 14:
        return '1-2 weeks'
    elif x.days < 21:
        return '2-3 weeks'
    elif x.days < 30:
        return '3-4 weeks'
    elif x.days < 90:
        return '1-3 months'
    elif x.days < 180:
        return '3-6 months'
    elif x.days < 270:
        return '6-9 months'
    elif x.days < 365 :
        return '9-12 months'
    elif x.days > 365 :
        return '> 12 months'

In [21]:
df_first = df[df['ORDER_NUMBER'] == 1].drop_duplicates('KDNR')[['KDNR', 'DATERF']]

df_second = df[df['ORDER_NUMBER'] == 2].drop_duplicates('KDNR')[['KDNR', 'DATERF']].rename(columns = {'DATERF': 'DATERF_2'})

df_first = df_first.merge(df_second, how = 'left', on = 'KDNR')

df_first['GAP'] = (df_first['DATERF_2'] - df_first['DATERF'])

df_first['FREQ_CAT'] = df_first['GAP'].apply(segmentation_freq)
df_first['FREQ_CAT_SMALL'] = df_first['GAP'].apply(segmentation_small_freq)

df = df.merge(df_first[['KDNR', 'FREQ_CAT', 'FREQ_CAT_SMALL']], how = 'left', on = 'KDNR')

del df_first
del df_second
gc.collect()

0

In [26]:
### Sanity Check

for year in range(2017,2021):
    print('Net Demand in '+str(year)+':')
    print(df[df['YEAR_ORDER'] == year]['NET_DEMAND'].sum())
    print('Nb of client recruited in '+str(year)+':')
    print(df[df['COHORT']==year]['KDNR'].nunique())

Net Demand in 2017:
19033269.100000005
Nb of client recruited in 2017:
146110
Net Demand in 2018:
26108901.320000004
Nb of client recruited in 2018:
167324
Net Demand in 2019:
28898354.28999999
Nb of client recruited in 2019:
184797
Net Demand in 2020:
39804321.38000002
Nb of client recruited in 2020:
231358


### DN First Order

In [22]:
df_cde_1 = df[df.ORDER_NUMBER == 1].groupby('ID_ORDER').agg({'NET_DEMAND': sum, 'KDNR': 'first'}).rename(columns = {'NET_DEMAND': 'DN_1'})
df = df.merge(df_cde_1, how = 'left', on = 'KDNR')

In [23]:
df.drop_duplicates('KDNR').groupby('FREQ_CAT_SMALL').agg({'DN_1': 'mean'}).to_excel('./Vertbaudet/Germany/'+today+'_dn_1_by_reorder_freq_small.xlsx')

### Nombre d'univers

In [27]:
def map_nb_univ(x):
    if x == 1:
        return '1'
    if x == 2:
        return '2'
    if x == 3:
        return '3'
    if x >= 4:
        return '4+'

In [28]:
### Create nb_univ field
df_nb_univ = df.groupby('KDNR').agg({'DEPARTEMENT': pd.Series.nunique}).rename(columns = {'DEPARTEMENT': 'NB_UNIV'})
df = df.merge(df_nb_univ, how = 'left', on = 'KDNR')

del df_nb_univ
gc.collect()

df['NB_UNIV_CAT'] = df['NB_UNIV'].apply(map_nb_univ)

In [None]:
### Create output df
df_nb_univ_by_freq_cat = df.groupby(['FREQ_CAT', 'NB_UNIV_CAT']).agg({'KDNR': pd.Series.nunique}).unstack()

In [None]:
df_nb_univ_by_freq_cat.to_excel('./Vertbaudet/Germany/'+today+'_nb_univ_by_reorfer_freq.xlsx')

In [29]:
df_nb_univ_by_freq_cat_small = df.groupby(['FREQ_CAT_SMALL', 'NB_UNIV_CAT']).agg({'KDNR': pd.Series.nunique}).unstack()
df_nb_univ_by_freq_cat_small.to_excel('./Vertbaudet/Germany/'+today+'_nb_univ_by_reorfer_freq_small.xlsx')

### Univers d'entrée en fonction de la catégorie de fréquence

- Add DEPARTEMENT_FIRST

In [20]:
df_entry_dept = df[df['DATERF']==df['DATAUFTRAG1']].drop_duplicates(subset = ['KDNR', 'ID_ORDER']).groupby('FREQ_CAT').agg({'DEPARTEMENT_CMD': 'value_counts'}).unstack()

KeyError: 'FREQ_CAT'

In [31]:
df_entry_dept.to_excel('./Vertbaudet/Germany/'+today+'_entry_dept_by_reorfer_freq.xlsx')

In [32]:
del df_entry_dept
gc.collect()

45

In [30]:
df_entry_dept_small = df[df['DATERF']==df['DATAUFTRAG1']].drop_duplicates(subset = ['KDNR', 'ID_ORDER']).groupby('FREQ_CAT_SMALL').agg({'DEPARTEMENT_CMD': 'value_counts'}).unstack()
df_entry_dept_small.to_excel('./Vertbaudet/Germany/'+today+'_entry_dept_by_reorfer_freq_small.xlsx')

### Univers majoritaire en fonction de la catégorie de fréquence

In [31]:
univ_maj = df.groupby(['KDNR','DEPARTEMENT']).agg({'NET_DEMAND' : 'sum'})
univ_maj.reset_index(inplace=True)


univ_maj = univ_maj.sort_values(by='NET_DEMAND',ascending=False).drop_duplicates(subset='KDNR', keep='first',inplace=False)
univ_maj.columns =['KDNR','DEPARTEMENT_MAJORITAIRE','NET_DEMAND']


df = pd.merge(df,univ_maj[['KDNR','DEPARTEMENT_MAJORITAIRE']],on='KDNR',how='left')

del univ_maj

In [34]:
df_univ_maj = df.groupby(['FREQ_CAT', 'DEPARTEMENT_MAJORITAIRE']).agg({'KDNR': pd.Series.nunique}).unstack()
df_univ_maj.to_excel('./Vertbaudet/Germany/'+today+'_maj_dept_by_reorfer_freq.xlsx')

In [32]:
df_univ_maj_small = df.groupby(['FREQ_CAT_SMALL', 'DEPARTEMENT_MAJORITAIRE']).agg({'KDNR': pd.Series.nunique}).unstack()
df_univ_maj_small.to_excel('./Vertbaudet/Germany/'+today+'_maj_dept_by_reorfer_freq_small.xlsx')

### Entry month by frequency category

In [None]:
df_entry_month = df[df['DATERF']==df['DATAUFTRAG1']].drop_duplicates(subset = ['KDNR', 'ID_ORDER']).groupby('FREQ_CAT').agg({'MONTH_RECRUITMENT': 'value_counts'}).unstack()
df_entry_month.to_excel('./Vertbaudet/Germany/'+today+'_entry_month_by_reorder_freq.xlsx')

In [33]:
df_entry_month_small = df[df['DATERF']==df['DATAUFTRAG1']].drop_duplicates(subset = ['KDNR', 'ID_ORDER']).groupby('FREQ_CAT_SMALL').agg({'MONTH_RECRUITMENT': 'value_counts'}).unstack()
df_entry_month_small.to_excel('./Vertbaudet/Germany/'+today+'_entry_month_by_reorder_freq_small.xlsx')

In [39]:
df_entry_month_17_19= df[df['COHORT']<2020].groupby(['FREQ_CAT', 'MONTH_RECRUITMENT']).agg({'KDNR': pd.Series.nunique}).unstack()

In [40]:
df_entry_month_17_19.to_excel('./Vertbaudet/Germany/'+today+'_entry_month_17_19_by_reorder_freq.xlsx')

In [41]:
del df_entry_month
gc.collect()

30

### Nbr command by frequency category

In [42]:
nb_cmd_by_freq_cat = df[df['COHORT'].isin(range(2017,2021))].drop_duplicates(subset = ['KDNR', 'ID_ORDER']).groupby(['FREQ_CAT', 'KDNR']).agg({'ID_ORDER': 'count'}).groupby('FREQ_CAT').agg({'ID_ORDER': 'mean'})
nb_cmd_by_freq_cat.to_excel('./Vertbaudet/Germany/'+today+'_nb_cmd_by_reorder_freq.xlsx')

In [34]:
nb_cmd_by_freq_cat_small = df[df['COHORT'].isin(range(2017,2021))].drop_duplicates(subset = ['KDNR', 'ID_ORDER']).groupby(['FREQ_CAT_SMALL', 'KDNR']).agg({'ID_ORDER': 'count'}).groupby('FREQ_CAT_SMALL').agg({'ID_ORDER': 'mean'})
nb_cmd_by_freq_cat_small.to_excel('./Vertbaudet/Germany/'+today+'_nb_cmd_by_reorder_freq_small.xlsx')

### DN by frequency catgory

In [44]:
df_avg_basket = df.groupby('FREQ_CAT').agg({'NET_DEMAND': sum, 'KDNR': pd.Series.nunique})
df_avg_basket['AVERAGE_BASKET'] = df_avg_basket['NET_DEMAND']/df_avg_basket['KDNR']
df_avg_basket.to_excel('./Vertbaudet/Germany/'+today+'_avg_basket_by_reorder_freq.xlsx')

In [37]:
df_avg_basket_small = df.groupby('FREQ_CAT_SMALL').agg({'NET_DEMAND': sum, 'KDNR': pd.Series.nunique})
df_avg_basket_small['AVERAGE_BASKET'] = df_avg_basket_small['NET_DEMAND']/df_avg_basket_small['KDNR']
df_avg_basket_small.to_excel('./Vertbaudet/Germany/'+today+'_avg_basket_by_reorder_freq_small.xlsx')

### OT/TT/Reg by frequency category

In [47]:
df_cat_by_freq_cat = df.groupby(['FREQ_CAT', 'CLIENT_CATEGORY']).agg({'KDNR': pd.Series.nunique}).unstack()
df_cat_by_freq_cat.to_excel('./Vertbaudet/Germany/'+today+'_cat_client_by_reorder_freq.xlsx')

In [38]:
df_cat_by_freq_cat_small = df.groupby(['FREQ_CAT_SMALL', 'CLIENT_CATEGORY']).agg({'KDNR': pd.Series.nunique}).unstack()
df_cat_by_freq_cat_small.to_excel('./Vertbaudet/Germany/'+today+'_cat_client_by_reorder_freq_small.xlsx')

### Acquisition chanel by frequency category

In [19]:
df_numcde = pd.read_csv("./Vertbaudet/Germany/20211001_NUM_CMD_CANAL_GER.csv",sep=",",encoding= "unicode_escape")
df_numcde['ID_ORDER'] = df_numcde.AUFTNR.astype(str) + '_'+ df_numcde.KDNR.astype(str)
df = pd.merge(df,df_numcde[['ID_ORDER','BELEGSTAPELNR']],on='ID_ORDER',how='left',suffixes=(False,False))

In [20]:
df_canal = pd.read_csv("./Vertbaudet/Germany/20210928_Order_chanel_GER.tab", sep = '\t', encoding= "unicode_escape")
df_canal.columns =['BELEGSTAPELNR','LEVIER','CAMPAGNE']

In [21]:
gc.collect()

30

In [22]:
df['BELEGSTAPELNR'] = df['BELEGSTAPELNR'].replace(np.nan,0)

df['BELEGSTAPELNR'] = df['BELEGSTAPELNR'].replace('$null$',0)

df['BELEGSTAPELNR'] = df['BELEGSTAPELNR'].astype(int)

df = df.merge(df_canal, how = 'left', on = 'BELEGSTAPELNR')

In [53]:
df_acqu_chanel = df[df['DATERF']==df['DATAUFTRAG1']].drop_duplicates(subset = ['KDNR', 'ID_ORDER']).groupby('FREQ_CAT').agg({'LEVIER': 'value_counts'}).unstack()
df_acqu_chanel.to_excel('./Vertbaudet/Germany/'+today+'_acqu_channel_by_freq_cat.xlsx')

In [43]:
df_acqu_chanel_small = df[df['DATERF']==df['DATAUFTRAG1']].drop_duplicates(subset = ['KDNR', 'ID_ORDER']).groupby('FREQ_CAT_SMALL').agg({'LEVIER': 'value_counts'}).unstack()
df_acqu_chanel_small.to_excel('./Vertbaudet/Germany/'+today+'_acqu_channel_by_freq_cat_small.xlsx')

### LTV by frequency category

In [23]:
import datetime
from datetime import datetime, time ,date ,timedelta

# Add DAT_12 and DAT_14 column
df['DAT_12'] = df['DATAUFTRAG1'] + timedelta(days=365)
df['DAT_24'] = df['DATAUFTRAG1'] + timedelta(days=730)

### Add LTV_12 column
df_ltv_12 = df[df['DATERF']<=df['DAT_12']][['KDNR', 'NET_DEMAND']]
df_ltv_12 = df_ltv_12.groupby('KDNR').agg({'NET_DEMAND': sum})
df_ltv_12 = df_ltv_12.rename(columns = {'NET_DEMAND': 'LTV_12'})
print(df_ltv_12['LTV_12'].mean())
df = df.merge(df_ltv_12, how = 'left', on = 'KDNR')

del df_ltv_12

### Add LTV_24 column
df_ltv_24 = df[df['DATERF']<=df['DAT_24']][['KDNR', 'NET_DEMAND']]
df_ltv_24 = df_ltv_24.groupby('KDNR').agg({'NET_DEMAND': sum})
df_ltv_24 = df_ltv_24.rename(columns = {'NET_DEMAND': 'LTV_24'})
print(df_ltv_24['LTV_24'].mean())
df = df.merge(df_ltv_24, how = 'left', on = 'KDNR')

del df_ltv_24

gc.collect()

135.1175223303425
159.17589908409315


0

In [45]:
df.drop_duplicates('KDNR').groupby('FREQ_CAT').agg({'LTV_12': 'mean'}).to_excel('./Vertbaudet/Germany/'+today+'_ltv_12_by_freq_cat.xlsx')
df.drop_duplicates('KDNR').groupby('FREQ_CAT').agg({'LTV_24': 'mean'}).to_excel('./Vertbaudet/Germany/'+today+'_ltv_24_by_freq_cat.xlsx')

In [46]:
df.drop_duplicates('KDNR').groupby('FREQ_CAT_SMALL').agg({'LTV_12': 'mean'}).to_excel('./Vertbaudet/Germany/'+today+'_ltv_12_by_freq_cat_small.xlsx')
df.drop_duplicates('KDNR').groupby('FREQ_CAT_SMALL').agg({'LTV_24': 'mean'}).to_excel('./Vertbaudet/Germany/'+today+'_ltv_24_by_freq_cat_small.xlsx')

### LTV by Cohort and entry month

In [25]:
df.drop_duplicates('KDNR').groupby('COHORT').agg({'LTV_12': 'mean'}).to_excel('./Vertbaudet/Germany/'+today+'_ltv_12_by_cohort.xlsx')

In [27]:
df.drop_duplicates('KDNR').groupby(['COHORT', 'MONTH_RECRUITMENT']).agg({'LTV_12': 'mean'}).unstack().to_excel('./Vertbaudet/Germany/'+today+'_ltv_12_by_cohort_and_month.xlsx')

### Churn delay by frequency category

In [26]:
### Add churn_delay field
df_churn = df.sort_values(by = 'DATERF', ascending = True).groupby('KDNR').agg({'DATERF': 'last', 'DATAUFTRAG1': 'last', 'KDNR':'last'})
df_second = df[df['ORDER_NUMBER'] == 2].drop_duplicates('KDNR')[['KDNR', 'DATERF']].rename(columns = {'DATERF': 'DATERF_2'})
df_churn = df_churn.merge(df_second, how = 'left', left_index = True,  right_on = 'KDNR')
df_churn = df_churn.reset_index()
df_churn['CHURN_DELAY'] = (df_churn['DATERF'] + timedelta(days=365)) - df_churn['DATAUFTRAG1']
df_churn['CHURN_DELAY_2'] = (df_churn['DATERF'] + timedelta(days=365)) - df_churn['DATERF_2']
df_churn = df_churn.drop(columns = 'index')

df = df.merge(df_churn[['KDNR', 'CHURN_DELAY', 'CHURN_DELAY_2']], how = 'left', left_on = 'KDNR', right_on  = 'KDNR')

del df_churn
del df_second

df['CHURN_DELAY'] = df['CHURN_DELAY'].apply(lambda x : x.days)
df['CHURN_DELAY_2'] = df['CHURN_DELAY_2'].apply(lambda x : x.days)

gc.collect()

0

In [48]:
df.drop_duplicates('KDNR').groupby('FREQ_CAT').agg({'CHURN_DELAY_2': 'mean'}).to_excel('./Vertbaudet/Germany/'+today+'_churn_delay_by_freq_cat.xlsx')
df.drop_duplicates('KDNR').groupby('FREQ_CAT_SMALL').agg({'CHURN_DELAY_2': 'mean'}).to_excel('./Vertbaudet/Germany/'+today+'_churn_delay_by_freq_cat_small.xlsx')

### Nbr new dept in the second command by frequency category

In [49]:
### Create NEW_DEP field
def dif(x,y):
    return len(y.difference(x))

df_list_dept_by_cmd = df[df['ORDER_NUMBER'].isin(range(1,3))].groupby(['KDNR', 'ORDER_NUMBER']).agg({'ORDER_NUMBER': 'first', 'DEPARTEMENT': lambda x: list(pd.Series.unique(x))})
df_cmd_1_2 = df_list_dept_by_cmd[df_list_dept_by_cmd['ORDER_NUMBER']==1].merge(df_list_dept_by_cmd[df_list_dept_by_cmd['ORDER_NUMBER']==2], how = 'left', on = 'KDNR').rename(columns = {'DEPARTEMENT_x': 'DEPART_1', 'DEPARTEMENT_y': 'DEPART_2'})
df_cmd_1_2 = df_cmd_1_2[(df_cmd_1_2.DEPART_1.notna()) & (df_cmd_1_2.DEPART_2.notna())]
df_cmd_1_2['NEW_DEP'] = df_cmd_1_2.apply(lambda x: dif(set(x.DEPART_1), set(x.DEPART_2)), axis = 1)
df = df.merge(df_cmd_1_2['NEW_DEP'], how = 'left', left_on = 'KDNR', right_index = True)

In [50]:
df.groupby(['FREQ_CAT', 'NEW_DEP']).agg({'KDNR': pd.Series.nunique}).unstack().replace(np.nan, 0).to_excel('./Vertbaudet/Germany/'+today+'_new_dep_by_freq_cat.xlsx')
df.groupby(['FREQ_CAT_SMALL', 'NEW_DEP']).agg({'KDNR': pd.Series.nunique}).unstack().replace(np.nan, 0).to_excel('./Vertbaudet/Germany/'+today+'_new_dep_by_freq_cat_small.xlsx')

### Cat discount by frequency catgory - not relevant for Germany

In [54]:
### add cat_discount
df['CAT_DISCOUNT'] = df.RABATTSATZ.apply(lambda x :describe_discount(x) )

df_cat_discount = df[df['DATERF']==df['DATAUFTRAG1']].drop_duplicates(subset = ['KDNR', 'ID_ORDER']).groupby('FREQ_CAT').agg({'CAT_DISCOUNT': 'value_counts'}).unstack()

In [55]:
### convert to percentage
for cat in ['<30', '<365', '>365']:
    div = df_cat_discount.loc[cat].sum()
    for disc in  ['10-20%', '20-30%', '30-40%', '40-50%', '5-10%', '50-60%', '<5%', '>70%']:
        df_cat_discount['CAT_DISCOUNT', disc][cat] =  df_cat_discount['CAT_DISCOUNT', disc][cat] / div

KeyError: '<30'

In [None]:
df_cat_discount.to_excel('./Vertbaudet/Germany/'+today+'_discount_cat_by_freq_cat.xlsx')

# Misc

### LTV 12-24 by entry department

In [23]:
df.drop_duplicates('KDNR').groupby('DEPARTEMENT_FIRST').agg({'LTV_12': 'mean'}).to_excel('./Vertbaudet/Germany/'+today+'_ltv_12_by_entr_dept.xlsx')
df.drop_duplicates('KDNR').groupby('DEPARTEMENT_FIRST').agg({'LTV_24': 'mean'}).to_excel('./Vertbaudet/Germany/'+today+'_ltv_24_by_entr_dept.xlsx')

### Lifetime by entry dept

In [24]:
### Seniority abs
df_last_cmd = df[['KDNR', 'DATERF', 'DATAUFTRAG1']].sort_values(by = 'DATERF', ascending = False).groupby('KDNR').agg({'DATERF': 'first', 'DATAUFTRAG1': 'first'})
df_last_cmd['SENIORITY_ABS'] = (df_last_cmd['DATERF'] - df_last_cmd['DATAUFTRAG1']).dt.days + 365
df = df.merge(df_last_cmd['SENIORITY_ABS'], how = 'left', on = 'KDNR')

del df_last_cmd
gc.collect()

15

In [25]:
df.drop_duplicates('KDNR').groupby('DEPARTEMENT_FIRST').agg({'SENIORITY_ABS': 'mean'}).to_excel('./Vertbaudet/Germany/'+today+'_lifetime_by_entr_dept.xlsx')

### Churn delay by acquisition channel

In [30]:
df[df.ORDER_NUMBER == 1].drop_duplicates('KDNR').groupby('LEVIER').agg({'CHURN_DELAY': 'mean', 'LTV_12': 'mean', 'LTV_24': 'mean'}).to_excel('./Vertbaudet/Germany/'+today+'_churn_ltv_bad_acq_channel.xlsx')