In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import ticker
import matplotlib.dates as mdates
from datetime import datetime, time ,date ,timedelta
from dateutil.relativedelta import relativedelta
import seaborn as sns
from random import randint
pd.options.mode.chained_assignment = None

In [3]:
import warnings
warnings.filterwarnings("ignore")

<h1> Functions </h1>

<h3> Fonctions générales </h3>

In [4]:
def floatise (df, list_columns_to_floatise):
    """Convert decimal-comma columns of ``df`` to floats, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are overwritten with their float version.
    list_columns_to_floatise : iterable
        Labels of the columns to convert.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.

    Raises
    ------
    ValueError
        If a text value is not a valid number once ',' is replaced by '.'.
    """
    def _to_float(value):
        # Only text carries a decimal comma; values that are already numeric
        # (or missing) must not go through str.replace, which they do not have.
        if isinstance(value, str):
            return float(value.replace(',', '.'))
        if value is None:
            return float('nan')
        return float(value)

    for column in list_columns_to_floatise:
        df[column] = df[column].apply(_to_float)
    return df

In [5]:
def intise (df, list_columns_to_floatise):
    """Cast the listed columns of ``df`` to ``int``, in place.

    ``list_columns_to_floatise`` holds the labels of the columns to cast
    (the parameter name is kept as-is for existing callers).
    Returns the same ``df`` object.
    """
    for column_name in list_columns_to_floatise:
        as_integers = df[column_name].astype(int)
        df[column_name] = as_integers
    return df

In [6]:
def flatten (table):
    """Return ``table`` with flat columns and its index moved into a column.

    Parameters
    ----------
    table : pandas.DataFrame or pandas.Series
        Typically the result of a ``groupby(...).agg(...)``.

    Returns
    -------
    pandas.DataFrame
        * If ``table`` has MultiIndex columns: a new frame whose first column
          (named after ``table.index.name``) holds the index values, followed
          by one column per original column, named by joining the levels of
          its label with ``'_'``. Rows are renumbered from 0.
        * Otherwise: ``table`` as a DataFrame with index level 0 reset into a
          regular column.

    The input is never modified.
    """
    if isinstance(table.columns, pd.MultiIndex):
        # Join every level of each column label (not only the first two).
        flat_names = ['_'.join(str(level) for level in label) for label in table.columns]

        # Work on a copy and rename/insert in bulk: this keeps the column
        # dtypes, works with duplicated index labels and avoids growing a
        # frame one row at a time.
        flat = table.copy()
        flat.columns = flat_names
        flat.insert(0, table.index.name, table.index.to_flat_index())
        return flat.reset_index(drop=True)

    return pd.DataFrame(table).reset_index(level=0)

In [7]:
def flatten_soft(dataframe):
    """Build a flat copy of ``dataframe``: index first, then every column.

    The first output column is named after ``dataframe.index.name`` and holds
    the index values. Each following column is named by concatenating
    ``'_' + str(part)`` for the leading parts of its label, so names start
    with an underscore (e.g. ``'_v_sum'``). The number of parts used is the
    length of the FIRST column label.
    """
    flattened = pd.DataFrame()
    flattened[dataframe.index.name] = dataframe.index
    for column_key in dataframe.columns:
        depth = len(dataframe.columns[0])
        flat_name = ''.join('_' + str(column_key[level]) for level in range(depth))
        flattened[str(flat_name)] = dataframe[column_key].values
    return flattened

<h3> Fonctions particulières </h3>

In [8]:
def cohort_attribution (x):
    """Return 2016 for any value below 2017, otherwise return ``x`` unchanged."""
    return 2016 if x < 2017 else x

In [9]:
def customer_category_attribution(x):
    """Map a count to a label: 1 -> 'One-timer', 2 -> 'Two-timer', else 'Recurring'."""
    for count, label in ((1, 'One-timer'), (2, 'Two-timer')):
        if x == count:
            return label
    # Any other value (including 0 or negatives) gets the fallback label.
    return 'Recurring'

In [10]:
def describe_discount(x):
    """Return the label of the band that contains the value ``x``.

    Bands are half-open ``[lower, upper)``: anything below 5 is ``'<5%'``,
    anything from 70 upwards is ``'>70%'``. Returns ``None`` when ``x``
    matches no band (e.g. NaN, for which every comparison is false).
    """
    if x < 5:
        return '<5%'

    bands = (
        (5, 10, '5-10%'),
        (10, 20, '10-20%'),
        (20, 30, '20-30%'),
        (30, 40, '30-40%'),
        (40, 50, '40-50%'),
        (50, 60, '50-60%'),
        (60, 70, '60-70%'),
    )
    for lower, upper, label in bands:
        if (x >= lower) and (x < upper):
            return label

    if x >= 70:
        return '>70%'
    return None

<h1> Dataset creation </h1>

- Définition des répertoires :

In [11]:
# Input (back-up) and output directories.
# The defaults are the original machine-specific locations; set the
# VERTBAUDET_BACKUP_REP / VERTBAUDET_OUTPUT_REP environment variables to run
# the notebook on another machine without editing this cell.
backup_rep = os.environ.get(
    "VERTBAUDET_BACKUP_REP",
    "C:/Users/UgoMANTEL/Work/Github/Vertbaudet_2021/data/4. Back-up",
)
output_rep = os.environ.get(
    "VERTBAUDET_OUTPUT_REP",
    "C:/Users/UgoMANTEL/eleven/Engagements - Vertbaudet/5. Analyses/3. Outputs python",
)

<h3> Import du dataset </h3>

- Chargement des données brutes (fichier CSV) :

In [12]:
# Read the semicolon-separated raw export from the back-up directory and
# discard its 'Unnamed: 0' column.
raw_csv_path = os.path.join(backup_rep,'Raw_Data_Germany.csv')
df = pd.read_csv(raw_csv_path, sep=";").drop(columns=['Unnamed: 0'])

- Conversion des champs:

In [13]:
# Parse the four date columns into pandas datetimes.
for date_column in ('DATERF', 'DATAUFTRAG1', 'DATAUFTRAG1_12MONTH', 'DATAUFTRAG1_24MONTH'):
    df[date_column] = pd.to_datetime(df[date_column])

In [14]:
# df

<h3> Rajout du nombre de commandes </h3>

In [15]:
# Number each customer's orders chronologically (1 = earliest order).
# groupby sorts by key by default, which would re-order the orders by
# ID_ORDER and discard the DATERF ordering; sort=False keeps the groups in
# order of first appearance, i.e. by date. ID_ORDER is a secondary sort key
# so that orders sharing a date get a deterministic rank.
orders_by_date = df.sort_values(by=['DATERF', 'ID_ORDER'], ascending=True)
nb_order = flatten(orders_by_date.groupby(['ID_ORDER'], sort=False).agg({'KDNR':'last'}))
nb_order['ORDER_NUMBER'] = nb_order.groupby(['KDNR']).cumcount()+1

# Drop any ORDER_NUMBER left by a previous run of this cell so that re-running
# does not create ORDER_NUMBER_x / ORDER_NUMBER_y columns.
df = df.drop(columns=['ORDER_NUMBER'], errors='ignore')
# validate= fails loudly if an ID_ORDER were ever duplicated on the right side.
df = pd.merge(df, nb_order[['ID_ORDER','ORDER_NUMBER']], on='ID_ORDER', how='left', validate='many_to_one')

<h3> Rajout de l'opt-in </h3>

- Import opt-in:

In [16]:
# Load the matching file, then keep a single row per KDNR: the one with the
# greatest DAT_DOI. DAT_DOI is sorted as read from the CSV (it is not parsed
# to datetime here).
df_optin = pd.read_csv('C:/Users/UgoMANTEL/Work/Github/Vertbaudet_2021/data/3. Other/20211012_GER_MATCHING.csv',sep=",")
df_optin = df_optin.sort_values(by='DAT_DOI', ascending=True)
df_optin = df_optin.drop_duplicates(subset='KDNR', keep='last')
df_optin

Unnamed: 0,KDNR,EMAIL,DAT_REG,NLFREQ,DAT_DOI,DAT_MATCH
676908,8651704,nicole_schittenhelm@web.de,2014-06-27 00:00:00,1 Newsletter pro Monat,2018-05-18 00:00:00,2019-08-07 00:00:00
3631415,11507410,sahrasong@web.de,2014-06-27 00:00:00,Alle Newsletter,2018-05-18 00:00:00,2019-08-07 00:00:00
2420249,27942076,dirk.bachmeier@gmx.de,2018-03-30 00:00:00,Alle Newsletter,2018-05-18 00:00:00,2019-08-07 00:00:00
1705357,2050008,sandra-sunshine@gmx.de,2014-06-27 00:00:00,Alle Newsletter,2018-05-18 00:00:00,2019-08-07 00:00:00
2421760,27945744,dsteinerx@gmail.com,2018-04-03 00:00:00,Alle Newsletter,2018-05-18 00:00:00,2019-08-07 00:00:00
...,...,...,...,...,...,...
19612466,16844015,w.piechottka@gmx.de,2014-08-11 00:00:00,Alle Newsletter,2021-10-01 00:00:00,2021-10-01 00:00:00
19612465,16843957,t.barisic@gmx.net,2016-01-13 00:00:00,Alle Newsletter,2021-10-01 00:00:00,2021-10-01 00:00:00
19612464,16843701,yvonnejaeger@gmx.net,2020-01-11 00:00:00,Alle Newsletter,2021-10-01 00:00:00,2021-10-01 00:00:00
19612477,16845143,bodo_bodo1@gmx.de,2014-06-27 00:00:00,Alle Newsletter,2021-10-01 00:00:00,2021-10-01 00:00:00


- Merge:

In [17]:
# Attach the opt-in columns to every row, then keep only the rows for which
# an EMAIL value is present after the merge.
df = df.merge(df_optin, how='left', on='KDNR')
has_email = df['EMAIL'].notna()
df = df.loc[has_email]

In [18]:
# Number of rows left after the opt-in filter.
df.shape[0]

2833093

<h3> Rajout de l'univers majoritaire </h3>

- Obtention de l'univers majoritaire:

In [19]:
# Total NET_DEMAND per (KDNR, DEPARTEMENT) pair, with the group keys turned
# back into regular columns.
univ_maj = (
    df.groupby(['KDNR','DEPARTEMENT'])
      .agg({'NET_DEMAND' : 'sum'})
      .reset_index()
)
univ_maj

Unnamed: 0,KDNR,DEPARTEMENT,NET_DEMAND
0,8956580,PAP GARCON,129.98
1,25000785,BEBE,67.98
2,25000785,NON PAP ENFANT,32.08
3,25000785,PAP GARCON,651.88
4,25000785,TEXTILE HOME,7.99
...,...,...,...
614302,34174676,NON PAP ENFANT,12.66
614303,34174676,PAP GARCON,78.89
614304,34174705,TEXTILE HOME,62.36
614305,34174713,DECO,103.90


In [20]:
# Keep, for each KDNR, the row with the highest NET_DEMAND, then relabel the
# three columns positionally.
ranked_by_demand = univ_maj.sort_values(by='NET_DEMAND', ascending=False)
univ_maj = ranked_by_demand.drop_duplicates(subset='KDNR', keep='first')
univ_maj.columns = ['KDNR','DEPARTEMENT_MAJORITAIRE','NET_DEMAND']

- Merge:

In [21]:
# Add DEPARTEMENT_MAJORITAIRE to every row of its KDNR.
majority_by_customer = univ_maj[['KDNR','DEPARTEMENT_MAJORITAIRE']]
df = df.merge(majority_by_customer, how='left', on='KDNR')

<h3> Extract </h3>

In [24]:
# One row per KDNR: the row with the latest DATERF, whose date is exposed as
# DATE_LAST_PURCHASE.
# NOTE(review): DEPARTEMENT_FIRST, MONTH_RECRUITMENT, COHORT and
# CLIENT_CATEGORY are not created in any visible cell; presumably they come
# from the raw CSV or from cells that were removed. Confirm on a fresh kernel.
survey_fields = ['KDNR','EMAIL','DEPARTEMENT_FIRST','DEPARTEMENT_MAJORITAIRE','MONTH_RECRUITMENT','COHORT','CLIENT_CATEGORY']
extract = df[survey_fields + ['DATERF']].sort_values(by='DATERF', ascending=True)
extract = extract.drop_duplicates(subset='KDNR', keep='last')
extract.columns = survey_fields + ['DATE_LAST_PURCHASE']
extract

Unnamed: 0,KDNR,EMAIL,DEPARTEMENT_FIRST,DEPARTEMENT_MAJORITAIRE,MONTH_RECRUITMENT,COHORT,CLIENT_CATEGORY,DATE_LAST_PURCHASE
590,25397160,nicole-schubert@web.de,CHAMBRE ET LITERIE,CHAMBRE ET LITERIE,1,2017,One-timer,2017-01-01
573,25378030,vogel_sandra@bluewin.ch,DECO,DECO,1,2017,One-timer,2017-01-01
491,25398523,marina.helwig@gmx.de,TEXTILE HOME,TEXTILE HOME,1,2017,One-timer,2017-01-01
459,25398687,sophia.suenboldt@gmx.de,BEBE,BEBE,1,2017,One-timer,2017-01-01
464,25398638,sjensen83@gmx.de,FEMME,FEMME,1,2017,One-timer,2017-01-01
...,...,...,...,...,...,...,...,...
1636873,29794438,karin-h.mueller@gmx.de,CHAMBRE ET LITERIE,CHAMBRE ET LITERIE,2,2019,Recurring,2021-09-14
2129959,33744457,barth.benny87@googlemail.com,JOUETS,BEBE,11,2020,Recurring,2021-09-14
1814967,30269540,halima1978@hotmail.de,FEMME,CHAMBRE ET LITERIE,6,2019,Recurring,2021-09-15
2220790,33570573,prassler.claudia@gmail.com,DECO,CHAMBRE ET LITERIE,11,2020,Two-timer,2021-09-15


In [25]:
# Write the extract as a semicolon-separated CSV in the output directory
# (the DataFrame index is written as well).
survey_csv_path = os.path.join(output_rep,'20211013_List_clients_survey_Germany.csv')
extract.to_csv(survey_csv_path, sep=";")