In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import ticker
import matplotlib.dates as mdates
from datetime import datetime, time ,date ,timedelta
from dateutil.relativedelta import relativedelta
import seaborn as sns
pd.options.mode.chained_assignment = None

In [2]:
import warnings
warnings.filterwarnings("ignore")

<h1> Functions </h1>

<h3> Fonctions générales </h3>

In [3]:
def floatise (df, list_columns_to_floatise):
    for i in list_columns_to_floatise:
        df[i] = df[i].apply(lambda x: float(x.replace(',','.')))
    return (df)

In [4]:
def intise (df, list_columns_to_floatise):
    for i in list_columns_to_floatise:
        df[i] = df[i].astype(int)
    return (df)

In [5]:
def flatten (table):
    if type(table.columns)==pd.MultiIndex:
        columns_to_look = [name_tmp for name_tmp in table.columns]

        columns_df = [ str(t[0])+'_'+str(t[1]) for t in columns_to_look]
        columns_df.insert(0,table.index.name)

        df = pd.DataFrame(columns = columns_df)

        index = 0
        for i in table.index:
            row = [table[r][i] for r in columns_to_look]
            row.insert(0,i)
            df.loc[index] = row
            index = index + 1
        return(df)
    else :
        table = pd.DataFrame(table)
        table.reset_index(level=0, inplace=True)
        return table

In [6]:
def flatten_soft(dataframe):
    res = pd.DataFrame()
    res[dataframe.index.name] = dataframe.index
    for col in dataframe.columns:
        name_tmp=""
        for i in range(len(dataframe.columns[0])):
            name_tmp = name_tmp +'_'+str(col[i])
        res[str(name_tmp)] = dataframe[col].values
    return res

<h3> Fonctions particulières </h3>

In [7]:
def cohort_attribution (x):
    if (x<2017):
        return 2016
    else :
        return x

<h1> Dataset creation </h1>

<h3> Import de la data </h3>

In [8]:
src_root = os.getcwd()
print(src_root)

C:\Users\UgoMANTEL\Work\Github\Vertbaudet_2021\src\notebooks


- Préparation des colonnes:

In [9]:
col_names = ['AUFTRNR','KDNR','ARTNRERF','WTR','ARTGRERF','DATERF','PREISERF','PREIS','EKP','PREISNET','RABATT','RABATTSATZ',
             'MENGE','MENGERET','DEPARTEMENT','CAUFTRAGSPOS','MWST','MWSTRABATT','ANZKINDER','ANLAGEDAT',
             'DATAUFTRAG1','STRASSE','PLZ','ORT','BUNDESLAND','CLAND','CWERBESP','DATWERBESP','CLIEFERSP','DATLIEFERSP',
             'CADRESSSP','DATADRESSSP','UMSATZ1','DATLETZTAUFTR','UMSLETZTAUFTR','ANZKATLETZTAUFTR','WKOSTLETZTAUFTR',
             'DBKDNR','WKOSTENKUM','UMSATZKUM','UMSATZKUMTEL','UMSATZKUMFAX','UMSATZKUMWEB','UMSATZLIEFKUM','RUECKSTAUFTR',
             'ANZAUFTR','REFPRES','REFSTK','CODDOC','PLANEKP','RAYON','FAMILLE','DEPART','CODMARQ','ARTGROESSE','DATAUFTRAG',
             'DATERFASSUNG','DATVERARB','GUTSCHEINWERT']

In [10]:
dict_department = {'B':'BEBE','N':'PAP FILLE','U':'PAP GARCON','D':'CHAUSSURE','F':'FEMME',
'T':'TEXTILE HOME','P':'PUERICULTURE','W':'NON PAP ENFANT','K':'DECO','M':'CHAMBRE ET LITERIE','R':'JOUETS'}

<h3> Création du data </h3>

- Définition des répertoires :

In [11]:
repertoire = "C:/Users/UgoMANTEL/Work/Github/Vertbaudet_2021/data/1. Germany"


In [12]:
output_rep = "C:/Users/UgoMANTEL/eleven/Engagements - Vertbaudet/5. Analyses/3. Outputs python"

- Création de la donnée:

In [13]:
df = pd.DataFrame()
for file in os.listdir(repertoire):
    file_name_tmp = os.path.join(repertoire,file)
    df_tmp = pd.read_csv(file_name_tmp,sep=",",encoding= "unicode_escape", usecols=col_names)
    
    df_tmp = df_tmp.loc[df_tmp.CAUFTRAGSPOS!='$null$']
    df_tmp.CAUFTRAGSPOS = df_tmp.CAUFTRAGSPOS.astype(int)
    df_tmp = df_tmp.loc[df_tmp.CAUFTRAGSPOS <= 7]
    
    ################### CONVERSION DES CHAMPS ###################
    df_tmp['DATERF'] = pd.to_datetime(df_tmp.DATERF)
    df_tmp['DATAUFTRAG1'] = pd.to_datetime(df_tmp.DATAUFTRAG1)
    df_tmp.DEPARTEMENT = df_tmp.DEPARTEMENT.map(dict_department) 
    
    ################### CREATION DES CHAMPS ###################
    df_tmp['TOT_SALES'] = (df_tmp.PREIS - df_tmp.RABATT)*(df_tmp.MENGE - df_tmp.MENGERET)
    df_tmp['NET_DEMAND'] = (df_tmp.PREIS - df_tmp.RABATT) * df_tmp.MENGE
    df_tmp['GROSS_DEMAND'] = df_tmp.PREIS * df_tmp.MENGE
    df_tmp['YEAR_FIRST_ORDER'] = df_tmp.DATAUFTRAG1.dt.year
    df_tmp['YEAR_ORDER'] = df_tmp.DATERF.dt.year
    df_tmp['COHORT'] = df_tmp.YEAR_FIRST_ORDER.apply(lambda x: cohort_attribution(x))
    df_tmp['MONTH_RECRUITMENT'] = df_tmp.DATAUFTRAG1.dt.month
    df_tmp['ID_ORDER'] = df_tmp.AUFTRNR.astype(str) + '_'+ df_tmp.KDNR.astype(str)
    
    cat_order =  df_tmp.groupby(['ID_ORDER','DEPARTEMENT']).agg({'NET_DEMAND':sum}).reset_index().sort_values(by='NET_DEMAND',
    cat_order = cat_order.drop_duplicates(subset=['ID_ORDER'],keep='first')
    cat_order.columns = ['ID_ORDER','DEPARTEMENT_CMD','NET_DEMAND']
    df_tmp = pd.merge(df_tmp,cat_order[['ID_ORDER','DEPARTEMENT_CMD']], on='ID_ORDER', how='left', suffixes=(False,False))
    
    df_tmp = df_tmp[['DATERF','DATAUFTRAG1','DEPARTEMENT','TOT_SALES','NET_DEMAND','GROSS_DEMAND',
                    'YEAR_FIRST_ORDER','YEAR_ORDER','COHORT','MONTH_RECRUITMENT','ID_ORDER','KDNR','ARTNRERF',
                     'DEPARTEMENT_CMD','RABATT','RABATTSATZ','PREIS','ANZKINDER','ARTGRERF','ARTGROESSE']]
    df = pd.concat([df,df_tmp])
    del(df_tmp)

################### CREATION DE CHAMPS ###################
################### UNIVERS RECRUTEMENT
cat_rec = df.sort_values(by='DATERF',ascending=True).drop_duplicates(subset='KDNR', keep='first', inplace=False)[['KDNR','DEPARTEMENT_CMD']] 
cat_rec.columns = ['KDNR','DEPARTEMENT_FIRST']
df = pd.merge(df,cat_rec,on='KDNR',how='left',suffixes=(False,False))

################### DATE POUR LTV 12/24 MOIS
df['DATAUFTRAG1_12MONTH'] = df['DATAUFTRAG1']+ timedelta(days=365)
df['DATAUFTRAG1_24MONTH'] = df['DATAUFTRAG1']+ timedelta(days=730)

In [24]:
df.groupby(['YEAR_ORDER']).agg({'GROSS_DEMAND' : sum})

Unnamed: 0_level_0,GROSS_DEMAND
YEAR_ORDER,Unnamed: 1_level_1
2017,86300890.0
2018,76100530.0
2019,61147110.0
2020,71001060.0
2021,76443180.0


In [38]:
file_name = 'data_LTV_Germany.csv'
df.to_csv(os.path.join(output_rep,file_name),sep=";")

<h1> Sales according the product size </h1>

- Import du dataset:

In [31]:
df_size = pd.read_csv("C:/Users/UgoMANTEL/Work/Github/Vertbaudet_2021/data/3. Other/20210928_SIZE_ARTICLE_GER.tab",sep="\t",encoding= "unicode_escape")

In [34]:
df_size

Unnamed: 0,PRODUCT_ID,PRODUCT_SIZE,PRODUCT_ATTRIBUTE,PRODUCT_ATTRIBUTE_TEXT,TRANSLATION EN
0,1027384.0,92,$null$,KEIN MERKMAL,NO ATTRIBUTE
1,1027399.0,21,14,Schuhgrößen Gr. 012-040,Shoe sizes size 012-040
2,1027399.0,22,14,Schuhgrößen Gr. 012-040,Shoe sizes size 012-040
3,1027399.0,23,14,Schuhgrößen Gr. 012-040,Shoe sizes size 012-040
4,1027399.0,25,14,Schuhgrößen Gr. 012-040,Shoe sizes size 012-040
...,...,...,...,...,...
228256,9999537.0,071,$null$,KEIN MERKMAL,NO ATTRIBUTE
228257,9999537.0,074,$null$,KEIN MERKMAL,NO ATTRIBUTE
228258,9999537.0,080,$null$,KEIN MERKMAL,NO ATTRIBUTE
228259,9999537.0,086,$null$,KEIN MERKMAL,NO ATTRIBUTE


In [67]:
df.loc[df.DEPARTEMENT.isin(['PAP FILLE','PAP GARCON','PUERICULTURE'])].ARTGROESSE.unique()

array(['146', '122', '116', '128', '134', '158', '092', '110', '098',
       '140', '086', '000', 'VB', 116, 0, 140, 128, 146, 110, 86, 122, 92,
       98, 134, 158, '004', 74, 1, '001', '002', 37, '074', '044', 44, 48,
       '037', '048', '080', 80, '068', 71, 68, 62, '071', 3, 2, '062',
       '003', 'U', 'N', 4, 'RN', 5, '070', 70, '005', 'C1', 'AV', 'A5',
       'HT', '164', '170', 170, 164, 'ME', 'BM', 'L2', 'CF', 'MY', 'BA',
       'PN', 'R2', 'MK', '060', 60, 'DM', 'WD', 'SW'], dtype=object)

<h1> Sales according the number of children </h1>

- Import du dataset:

In [19]:
df_children = pd.read_csv("C:/Users/UgoMANTEL/Work/Github/Vertbaudet_2021/data/3. Other/20210927_Number_children_GER.csv",sep=",")

- Séléction des colonnes :

In [20]:
df_children['DATERSTELL'] = pd.to_datetime(df_children.DATERSTELL)
df_children.sort_values(by='DATERSTELL', ascending=True)
df_children.drop_duplicates(subset=['KDNR','GEBDATUM'], keep='last')

Unnamed: 0,DATERSTELL,BASE,KDNR,ORIGINAL,GESCHLECHT,GEBDATUM,STICHTAG
0,2018-05-03,MPLUS,5242,0,1,2010-05-01 00:00:00,2021-08-31
2,2018-11-10,MPLUS,5242,0,1,2010-11-01 00:00:00,2021-08-31
5,2017-01-22,MPLUS,5242,0,1,2011-01-01 00:00:00,2021-08-31
6,2017-02-15,MPLUS,5242,0,1,2011-02-01 00:00:00,2021-08-31
7,2018-03-24,MPLUS,5242,0,1,2011-03-01 00:00:00,2021-08-31
...,...,...,...,...,...,...,...
11568707,2021-07-16,MPLUS,36398670,0,1,2019-07-01 00:00:00,2021-08-31
11568708,2021-07-16,MPLUS,36398670,0,2,2011-07-01 00:00:00,2021-08-31
11568709,2021-07-16,MPLUS,36398670,0,2,2013-07-01 00:00:00,2021-08-31
11568710,2021-07-16,MPLUS,36398696,0,1,2011-07-01 00:00:00,2021-08-31


In [27]:
res_children = flatten(df_children.groupby(['KDNR']).agg({'STICHTAG': pd.Series.nunique}))

In [26]:
res_children.GEBDATUM.describe()

count    2.498163e+06
mean     4.101609e+00
std      4.636346e+00
min      1.000000e+00
25%      1.000000e+00
50%      3.000000e+00
75%      5.000000e+00
max      1.830000e+02
Name: GEBDATUM, dtype: float64

In [29]:
res_children.describe()

Unnamed: 0,KDNR,STICHTAG
count,2498163.0,2498163.0
mean,17415170.0,1.0
std,10544670.0,0.0
min,38.0,1.0
25%,8480833.0,1.0
50%,16592250.0,1.0
75%,26942920.0,1.0
max,36749550.0,1.0
