In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import ticker
import matplotlib.dates as mdates
from datetime import datetime, time ,date ,timedelta
from dateutil.relativedelta import relativedelta
import seaborn as sns
from random import randint
pd.options.mode.chained_assignment = None

In [2]:
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation notices — consider narrowing the filter.
import warnings
warnings.filterwarnings("ignore")

<h1> Functions </h1>

<h3> Fonctions générales </h3>

In [3]:
def floatise(df, list_columns_to_floatise):
    """Convert decimal-comma columns of ``df`` to floats, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; each listed column is overwritten with its float version.
    list_columns_to_floatise : iterable
        Labels of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for chaining.

    Raises
    ------
    ValueError
        If a cell cannot be parsed as a number once commas are replaced by dots.
    """
    def _to_float(value):
        # Only strings can carry a decimal comma; cells that are already numeric
        # (including NaN) are passed straight to float() instead of crashing on .replace.
        if value is None:
            return float('nan')
        if isinstance(value, str):
            value = value.replace(',', '.')
        return float(value)

    for column in list_columns_to_floatise:
        df[column] = df[column].apply(_to_float)
    return df

In [4]:
def intise(df, list_columns_to_floatise):
    """Cast the listed columns of ``df`` to ``int`` in place and return ``df``.

    Despite its name, ``list_columns_to_floatise`` holds the labels of the
    columns that are cast to integers.
    """
    for column_label in list_columns_to_floatise:
        as_integers = df[column_label].astype(int)
        df[column_label] = as_integers
    return df

In [5]:
def flatten(table):
    """Return ``table`` as a flat DataFrame with its index exposed as the first column.

    Parameters
    ----------
    table : pandas.DataFrame or pandas.Series
        Typically the result of a ``groupby(...).agg(...)``.

    Returns
    -------
    pandas.DataFrame
        * MultiIndex columns: a new frame with a 0..n-1 index whose first column is
          named after ``table.index.name`` and holds the index values; every other
          column is named ``"<level0>_<level1>"`` (only the first two levels of each
          column key are used).
        * Otherwise: ``table`` (wrapped in a DataFrame if needed) with index level 0
          moved into a column. The input object is not modified.
    """
    # A Series has no ``columns`` attribute; treat it like the single-level case.
    columns = getattr(table, 'columns', None)
    if isinstance(columns, pd.MultiIndex):
        flat_names = [str(key[0]) + '_' + str(key[1]) for key in columns]
        # Build every column in one pass (positional access copes with duplicate
        # index labels) instead of appending one row at a time.
        flat = pd.DataFrame(
            {position: table.iloc[:, position].to_numpy()
             for position in range(len(flat_names))}
        )
        flat.columns = flat_names
        flat.insert(0, table.index.name, table.index.to_numpy())
        return flat
    return pd.DataFrame(table).reset_index(level=0)

In [6]:
def flatten_soft(dataframe):
    """Flatten a frame's column labels into single strings and expose its index as a column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are either tuples (MultiIndex) or plain labels.

    Returns
    -------
    pandas.DataFrame
        New frame whose first column is named after ``dataframe.index.name`` and
        holds the index values. Each original column is renamed by joining its
        levels, every level prefixed with ``'_'`` (``('NET', 'A')`` -> ``'_NET_A'``,
        ``'NET'`` -> ``'_NET'``).
    """
    res = pd.DataFrame()
    res[dataframe.index.name] = dataframe.index
    for col in dataframe.columns:
        # A plain (non-tuple) label is a single level; iterating over it would
        # split a string label into its characters.
        levels = col if isinstance(col, tuple) else (col,)
        flat_name = ''.join('_' + str(level) for level in levels)
        res[flat_name] = dataframe[col].values
    return res

<h3> Fonctions particulières </h3>

In [7]:
def cohort_attribution(x):
    """Return 2016 for any value below 2017, otherwise return ``x`` unchanged."""
    return 2016 if x < 2017 else x

In [8]:
def customer_category_attribution(x):
    """Label a value: 1 -> 'One-timer', 2 -> 'Two-timer', anything else -> 'Recurring'."""
    for expected, label in ((1, 'One-timer'), (2, 'Two-timer')):
        if x == expected:
            return label
    return 'Recurring'

In [9]:
def describe_discount(x):
    """Map a discount percentage to its band label.

    Bands are lower-inclusive / upper-exclusive: everything below 5 (negative
    values included) is '<5%', then '5-10%', '10-20%', ... '60-70%', and 70 or
    more is '>70%'. A value that satisfies none of the comparisons (e.g. NaN)
    yields ``None``.
    """
    upper_bounds = (
        (5, '<5%'),
        (10, '5-10%'),
        (20, '10-20%'),
        (30, '20-30%'),
        (40, '30-40%'),
        (50, '40-50%'),
        (60, '50-60%'),
        (70, '60-70%'),
    )
    for upper, label in upper_bounds:
        if x < upper:
            return label
    if x >= 70:
        return '>70%'
    return None

In [10]:
def attribute_recruitment_customer_type(x):
    """Classify a date-like value by its ``month`` and ``year`` attributes.

    Returns 'Nov 2020' for November 2020, 'Other November' for November of any
    other year, and 'Other' for every other month.
    """
    in_november = x.month == 11
    in_2020 = x.year == 2020
    if not in_november:
        return 'Other'
    return 'Nov 2020' if in_2020 else 'Other November'

<h1> Dataset creation </h1>

<h3> Import de la data </h3>

In [11]:
# Record and show the directory the notebook is running from.
src_root = os.getcwd()
print(src_root)

C:\Users\UgoMANTEL\Work\Github\Vertbaudet_2021\src\notebooks


- Préparation des colonnes:

In [12]:
# Department code -> department label.
dict_department = dict(
    B='BEBE',
    N='PAP FILLE',
    U='PAP GARCON',
    D='CHAUSSURE',
    F='FEMME',
    T='TEXTILE HOME',
    P='PUERICULTURE',
    W='NON PAP ENFANT',
    K='DECO',
    M='CHAMBRE ET LITERIE',
    R='JOUETS',
)

<h3> Création du data </h3>

- Définition des répertoires :

In [13]:
# Folder the raw data export is read from. The default is the original,
# machine-specific location; set the VERTBAUDET_BACKUP_DIR environment variable
# to run the notebook elsewhere without editing this cell.
backup_rep = os.environ.get(
    "VERTBAUDET_BACKUP_DIR",
    "C:/Users/UgoMANTEL/Work/Github/Vertbaudet_2021/data/4. Back-up",
)

In [14]:
# Folder the result CSV files are written to. The default is the original,
# machine-specific location; set the VERTBAUDET_OUTPUT_DIR environment variable
# to run the notebook elsewhere without editing this cell.
output_rep = os.environ.get(
    "VERTBAUDET_OUTPUT_DIR",
    "C:/Users/UgoMANTEL/eleven/Engagements - Vertbaudet/5. Analyses/3. Outputs python",
)

- Création de la donnée:

In [15]:
# Load the raw German export (semicolon-separated) and discard the unnamed
# column left over from the index of a previous CSV export.
raw_data_path = os.path.join(backup_rep, 'Raw_Data_Germany.csv')
df = pd.read_csv(raw_data_path, sep=";").drop(columns=['Unnamed: 0'])

- Conversion des champs:

In [16]:
# Parse the four date columns into datetimes.
# NOTE(review): no explicit format is given, so parsing relies on pandas inference — confirm the source date format.
for date_column in ('DATERF', 'DATAUFTRAG1', 'DATAUFTRAG1_12MONTH', 'DATAUFTRAG1_24MONTH'):
    df[date_column] = pd.to_datetime(df[date_column])

In [17]:
# Display the column labels of the loaded frame.
df.columns

Index(['DATERF', 'DATAUFTRAG1', 'DEPARTEMENT', 'TOT_SALES', 'NET_DEMAND',
       'GROSS_DEMAND', 'YEAR_FIRST_ORDER', 'YEAR_ORDER', 'COHORT',
       'MONTH_RECRUITMENT', 'ID_ORDER', 'KDNR', 'ARTNRERF', 'DEPARTEMENT_CMD',
       'RABATT', 'RABATTSATZ', 'PREIS', 'ANZKINDER', 'ARTGRERF', 'ARTGROESSE',
       'CODMARQ', 'DEPARTEMENT_FIRST', 'DATAUFTRAG1_12MONTH',
       'DATAUFTRAG1_24MONTH', 'NB_ORDERS', 'CLIENT_CATEGORY'],
      dtype='object')

- Ajout du rang de la commande par client (ORDER_NUMBER) :

In [18]:
# Rank each customer's orders chronologically (ORDER_NUMBER = 1 for the first order).
# One row per order, carrying the last customer id and timestamp found on its lines.
nb_order = (
    df.groupby(['ID_ORDER'])
    .agg({'KDNR': 'last', 'DATERF': 'last'})
    .reset_index()
    # ID_ORDER breaks ties between orders sharing the same DATERF, so the ranking
    # is reproducible from one run to the next.
    .sort_values(by=['DATERF', 'ID_ORDER'], ascending=True)
)
nb_order['ORDER_NUMBER'] = nb_order.groupby(['KDNR']).cumcount()+1
# Drop any ORDER_NUMBER left by a previous run so that re-executing this cell
# does not produce ORDER_NUMBER_x / ORDER_NUMBER_y columns.
df = pd.merge(df.drop(columns=['ORDER_NUMBER'], errors='ignore'),
              nb_order[['ID_ORDER','ORDER_NUMBER']],on='ID_ORDER',how='left')

- Import du discount:

In [19]:
# Load the discount reference file and keep a single row per article number
# (the last occurrence wins).
df_base_price = (
    pd.read_csv('C:/Users/UgoMANTEL/Work/Github/Vertbaudet_2021/data/3. Other/20210928_DiscountGER.csv')
    .drop_duplicates(subset=['ARTNR'], keep='last')
)

- Catégorisation des clients:

In [20]:
# Tag every line with the recruitment category derived from DATAUFTRAG1.
df['CLIENT_RECRUITMENT_CATEGORY'] = df['DATAUFTRAG1'].apply(attribute_recruitment_customer_type)

<h1> Analyses </h1>

<h3> Distribution marques nationales : </h3> 

In [21]:
# Brand code 'VB' is Vertbaudet's own brand; every other code (missing values
# included) is counted as a national brand.
df['MARQUE_NATIO'] = np.where(df['CODMARQ'] == 'VB', 'Vertbaudet', 'National Brand')

In [23]:
# Net demand per brand type (rows) and recruitment category (columns).
# The aggregation is named by string: passing the built-in ``sum`` is deprecated
# in recent pandas and is announced to stop mapping to the NaN-skipping group sum.
res_brand = df.groupby(['MARQUE_NATIO','CLIENT_RECRUITMENT_CATEGORY']).agg({'NET_DEMAND':'sum'})
res_brand = flatten_soft(res_brand.unstack('CLIENT_RECRUITMENT_CATEGORY',fill_value=0))
res_brand.to_csv(os.path.join(output_rep,'recruit_brand_November_GER.csv'),sep=";")
res_brand

Unnamed: 0,MARQUE_NATIO,_NET_DEMAND_Nov 2020,_NET_DEMAND_Other,_NET_DEMAND_Other November
0,National Brand,162390.4,6441211.0,488541.1
1,Vertbaudet,3568888.0,114468700.0,9612708.0


<h3> Distribution des one-timers, two-timers, recurrings : </h3> 

In [34]:
# Distinct customers per client category (rows) and recruitment category (columns).
res_cat = flatten_soft(
    df.groupby(['CLIENT_RECRUITMENT_CATEGORY', 'CLIENT_CATEGORY'])
    .agg({'KDNR': 'nunique'})
    .unstack('CLIENT_RECRUITMENT_CATEGORY', fill_value=0)
)
res_cat.to_csv(os.path.join(output_rep, 'recruit_month_cat_GER.csv'), sep=";")
res_cat

Unnamed: 0,CLIENT_CATEGORY,_KDNR_Nov 2020,_KDNR_Other,_KDNR_Other November
0,One-timer,23784,350947,34561
1,Recurring,3949,159847,13044
2,Two-timer,5895,126261,11301


<h3> Distribution des univers d'achat : </h3>

In [35]:
# Distinct customers per first-order department (rows) and recruitment category (columns).
res_univ = flatten_soft(
    df.groupby(['CLIENT_RECRUITMENT_CATEGORY', 'DEPARTEMENT_FIRST'])
    .agg({'KDNR': 'nunique'})
    .unstack('CLIENT_RECRUITMENT_CATEGORY', fill_value=0)
)
res_univ.to_csv(os.path.join(output_rep, 'recruit_month_univ_entry_GER.csv'), sep=";")
res_univ

Unnamed: 0,DEPARTEMENT_FIRST,_KDNR_Nov 2020,_KDNR_Other,_KDNR_Other November
0,BEBE,3885,108255,9178
1,CHAMBRE ET LITERIE,3022,56976,6836
2,CHAUSSURE,364,31890,1603
3,DECO,4294,76722,7830
4,FEMME,1345,98693,6395
5,JOUETS,8881,32230,5346
6,NON PAP ENFANT,1975,23828,2783
7,PAP FILLE,2447,73087,6450
8,PAP GARCON,1941,48105,4000
9,PUERICULTURE,1143,23826,2178


<h3> Distribution du discount : </h3> 

- Merge du prix de base:

In [21]:
# Attach the reference price (VKP) of each ordered article; lines whose article
# has no match keep NaN in ARTNR / VKP.
# NOTE(review): the falsy suffixes are expected to make pandas raise if df already
# holds ARTNR / VKP (e.g. when this cell is re-run) — confirm on the pandas version in use.
df = pd.merge(df,df_base_price[['ARTNR','VKP']],
                         left_on='ARTNRERF',right_on='ARTNR',how='left',suffixes=(False,False))

# Vectorised null count: the built-in sum() would iterate over the column one
# Python object at a time.
KPI_to_print = df.ARTNR.isnull().sum()/len(df)*100
print('% Lines where discount computation is impossible :' +str(round(KPI_to_print,1))+'%')

% Lines where discount computation is impossible :10.1%


- Calcul du discount:

In [22]:
# Discount in percent of the reference price, then its band label.
# Lines without a reference price get NaN, which describe_discount maps to None.
df['DISCOUNT'] = (1 - df['PREIS'] / df['VKP']) * 100
df['CAT_DISCOUNT'] = df['DISCOUNT'].apply(describe_discount)

- Vérification:

In [23]:
# Sanity checks: share of lines with a negative discount, then the mean discount.
KPI_to_print = int((df['DISCOUNT'] < 0).sum()) / len(df) * 100
print('% Lines with negative discount :', round(KPI_to_print, 1), '%', sep='')

KPI_to_print = df['DISCOUNT'].mean()
print('% Average discount :', round(KPI_to_print, 1), '%', sep='')

% Lines with negative discount :1.7%
% Average discount :15.0%


- Étude des discounts au 1er achat :

In [24]:
# Number of first-order lines per discount band (rows) and recruitment category (columns).
res_discount_1st = flatten_soft(
    df.loc[df['ORDER_NUMBER'] == 1]
    .groupby(['CAT_DISCOUNT', 'CLIENT_RECRUITMENT_CATEGORY'])
    .agg({'ARTNRERF': 'count'})
    .unstack('CLIENT_RECRUITMENT_CATEGORY', fill_value=0)
)
res_discount_1st.to_csv(os.path.join(output_rep, 'Discount1stpurch_perRecruitCategory_GER.csv'), sep=";")
res_discount_1st

Unnamed: 0,CAT_DISCOUNT,_ARTNRERF_Nov 2020,_ARTNRERF_Other,_ARTNRERF_Other November
0,10-20%,4595,294044,16188
1,20-30%,4116,258056,29667
2,30-40%,9792,173890,12630
3,40-50%,3678,122429,12668
4,5-10%,190,25680,3756
5,50-60%,170,81346,803
6,60-70%,16,63640,1026
7,<5%,59999,1074047,77222
8,>70%,14,51899,1264


- Étude des discounts (toutes lignes de commande) :

In [40]:
# Number of order lines per discount band (rows) and recruitment category (columns).
res_discount = flatten_soft(
    df.groupby(['CAT_DISCOUNT', 'CLIENT_RECRUITMENT_CATEGORY'])
    .agg({'ARTNRERF': 'count'})
    .unstack('CLIENT_RECRUITMENT_CATEGORY', fill_value=0)
)
res_discount.to_csv(os.path.join(output_rep, 'Discount_perRecruitCategory_GER.csv'), sep=";")
res_discount

Unnamed: 0,CAT_DISCOUNT,_ARTNRERF_Nov 2020,_ARTNRERF_Other,_ARTNRERF_Other November
0,10-20%,9506,674758,44287
1,20-30%,9051,603652,58335
2,30-40%,15200,445249,34897
3,40-50%,6579,314535,29813
4,5-10%,498,54337,5728
5,50-60%,1505,200388,9894
6,60-70%,220,148840,7670
7,<5%,87906,2535079,185585
8,>70%,79,116481,6193


<h3> Distribution du délai entre 1er et 2e achat par univers d'entrée : </h3> 

In [31]:
########################### Disabled cell: bias too large ###########################
# fig, ax = plt.subplots()

# colors = []
# n = len(df.CLIENT_RECRUITMENT_CATEGORY.unique())

# colors = [ 'limegreen','deepskyblue','mediumslateblue']

# i = 0
# df = df.loc[df.CLIENT_RECRUITMENT_CATEGORY.isnull()==False]
# for univ in df.CLIENT_RECRUITMENT_CATEGORY.unique():
#     df_tmp = df.loc[(df.NB_ORDERS>=2) 
#                     & (df.ORDER_NUMBER<=2) & (df.ORDER_NUMBER>=1) & (df.CLIENT_RECRUITMENT_CATEGORY == univ)].sort_values(by='DATERF',
#                                                                                  ascending=True).drop_duplicates(subset='ID_ORDER', keep='first',inplace=False)
    
#     df_tmp = df_tmp.groupby(['KDNR','ORDER_NUMBER']).agg({'DATERF' : 'last'})

#     ######## Dataset creation
#     res_delay = flatten_soft(df_tmp.unstack('ORDER_NUMBER',fill_value=0))
# #     print(univ)
# #     print(res_delay.columns)
#     res_delay['DELAY'] = ( res_delay._DATERF_2 - res_delay._DATERF_1)
#     res_delay['DELAY_days'] = res_delay.DELAY.dt.days
#     res_delay = res_delay.loc[(res_delay.DELAY_days > 0) & (res_delay.DELAY_days < 500)]
#     plot_delay = res_delay.DELAY_days.sort_values(ascending=True)
    
#     plt.step(plot_delay, np.arange(plot_delay.size)/len(plot_delay), color = colors[i] ,label=univ,linewidth = 1.5, markersize=1) 
#     i = i+1

# plt.legend(bbox_to_anchor=(0, -0.15, 1, 0), loc=1, ncol=2, mode="expand", borderaxespad=0)


# # fig.set_size_inches(18.5, 12)
# # plt.rcParams.update({'font.size': 15})
# # plt.savefig('delay_univers_GER.png',dpi=300, bbox_inches = "tight")

# plt.show()
