In [1]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# ------------------------------------------
# desabilita warnings
# ------------------------------------------
import warnings
warnings.filterwarnings('ignore')

In [3]:
# ------------------------------------------
# bibliotecas principais
# ------------------------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [4]:
# ------------------------------------------
# DATABASE: DF-CONSOLIDADO
#
# features dropadas de inicio : ID, cidade
# ------------------------------------------
df = pd.read_csv("/content/drive/Shareddrives/grupo4-rappi-hour/bases-rappi/df-consolidado.csv")
df.drop(["Unnamed: 0", "ID", "CIDADE"], axis=1, inplace=True)
print(f"Total de linhas: {df.shape[0]}")
df.info()

Total de linhas: 157627
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157627 entries, 0 to 157626
Data columns (total 47 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   GENERO                          157627 non-null  object 
 1   DATA_NASCIMENTO                 157627 non-null  float64
 2   IS_ACTIVE                       157627 non-null  object 
 3   TRANSPORTE                      157627 non-null  object 
 4   AUTO_ACEITE                     138568 non-null  object 
 5   COUNT_ORDERS_LAST_7D            157627 non-null  float64
 6   COUNT_ORDERS_LAST_30D           157627 non-null  float64
 7   COUNT_ORDERS_CANCELED_LAST_7D   157627 non-null  float64
 8   COUNT_ORDERS_CANCELED_LAST_30D  157627 non-null  float64
 9   GORJETA                         157627 non-null  float64
 10  PRIMEIRO_PEDIDO                 138568 non-null  object 
 11  ULTIMO_PEDIDO                   138568 non-null  objec

In [5]:
# ------------------------------------------
# TARGET : CHECAGEM DE INTEGRIDADE
# exibe lista de ocorrencias das variaveis
# qtd de ocorrencia de cada uma das categorias
# ------------------------------------------
print(df["IS_ACTIVE"].unique())
df["IS_ACTIVE"].value_counts(normalize=True)

['True' 'False' 'Quasi' False]


Quasi    0.636928
True     0.241412
False    0.064526
False    0.057135
Name: IS_ACTIVE, dtype: float64

In [6]:
# ------------------------------------------
# substitui False (bool) por False (string)
# ------------------------------------------
df["IS_ACTIVE"].replace({ False: "False"}, inplace=True)

In [7]:
# ------------------------------------------
# substitui dados categoricos por numericos
# ------------------------------------------
df["IS_ACTIVE"].replace({
    "False": 0,
    "True": 1,
    "Quasi": 2
}, inplace=True)

In [8]:
# ------------------------------------------
# CHECAGEM DO TRATAMENTO DO TARGET
# ------------------------------------------
print(df["IS_ACTIVE"].unique())
df["IS_ACTIVE"].value_counts(normalize=True)

[1 0 2]


2    0.636928
1    0.241412
0    0.121661
Name: IS_ACTIVE, dtype: float64

In [9]:
# ------------------------------------------
# FEATURE : GENERO
# exibe lista de ocorrencias das variaveis
# qtd de ocorrencia de cada uma das categorias
# ------------------------------------------
print(df["GENERO"].unique())
df["GENERO"].value_counts(normalize=True)

['M' 'F' 'O']


M    0.876620
F    0.123253
O    0.000127
Name: GENERO, dtype: float64

In [10]:
# ------------------------------------------
# substituicao de dados categoricos por numericos
# ------------------------------------------
df["GENERO"].replace({'M':0, 'F':1, "O": 0}, inplace=True)

In [11]:
# ------------------------------------------
# CHECAGEM DO TRATAMENTO DA FEATURE
# ------------------------------------------
print(df["GENERO"].unique())
df["GENERO"].value_counts(normalize=True)

[0 1]


0    0.876747
1    0.123253
Name: GENERO, dtype: float64

In [12]:
# ------------------------------------------
# FEATURE : DATA_NASCIMENTO
# exibe lista de ocorrencias das variaveis
# qtd de ocorrencia de cada uma das categorias
#
# existencia de dados absurdos: 1505, 1371, 3199, etc
# ------------------------------------------
df["DATA_NASCIMENTO"].unique()

array([1988., 1998., 1994., 1993., 2004., 1997., 1978., 1995., 1992.,
       1983., 1979., 1991., 1973., 1984., 1982., 1985., 1964., 1965.,
       2003., 1996., 2000., 2001., 1999., 1981., 1989., 1980., 1959.,
       1974., 1975., 1972., 1963., 1976., 1987., 1990., 2002., 1950.,
       1952., 1966., 1977., 1969., 1986., 1971., 1961., 1970., 1951.,
       1967., 1968., 1955., 1960., 1962., 1957., 1945., 1958., 1932.,
       1954., 1953., 1956., 1905., 1929., 1940., 1948., 1949., 1914.,
       1909., 1947., 1944., 1908., 1902., 1927., 1943., 1939., 1904.,
       1936., 1942., 1938., 1921., 1906., 1903., 1907., 1919., 2022.,
       1946., 2021., 1916., 1937., 1935., 1933., 1924., 1900., 1930.,
       1922., 1901., 1915., 1926., 2020., 2010., 2016., 2009., 1505.,
       2018.,  199., 2007., 2008., 1739., 2015., 1590., 2014., 2017.,
       2006., 2986., 2011., 2195., 2997., 1371., 2019., 3199., 2013.,
       2198.,  195., 1081., 1928., 1198.])

In [13]:
# ------------------------------------------
# 80% das observacoes estao situadas entre os anos de 1983 e 2002
# ------------------------------------------
nascimentos = df["DATA_NASCIMENTO"].value_counts(normalize=True).nlargest(n=20).cumsum()
print(f"Maior ano de nascimento: {nascimentos.index.max()}")
print(f"Menor ano de nascimento: {nascimentos.index.min()}")
nascimentos

Maior ano de nascimento: 2002.0
Menor ano de nascimento: 1983.0


1990.0    0.151966
1999.0    0.197415
1997.0    0.241545
2000.0    0.284748
1996.0    0.327837
1998.0    0.370914
1995.0    0.412302
1994.0    0.452790
1993.0    0.490379
2001.0    0.525703
1992.0    0.559124
1991.0    0.591460
1989.0    0.622140
2002.0    0.651855
1988.0    0.680112
1987.0    0.707956
1986.0    0.733935
1985.0    0.758715
1984.0    0.780977
1983.0    0.802274
Name: DATA_NASCIMENTO, dtype: float64

In [14]:
# ------------------------------------------
# substituicao de registros absurdos por limites maximos e minimos de idade
# max: 60 anos
# min: 18 anos
# ------------------------------------------
from datetime import date

current_year = date.today().year

df['DATA_NASCIMENTO'] = np.where(df['DATA_NASCIMENTO'].lt(current_year - 60), current_year - 60, df['DATA_NASCIMENTO'])
df['DATA_NASCIMENTO'] = np.where(df['DATA_NASCIMENTO'].gt(current_year - 18), current_year - 18, df['DATA_NASCIMENTO'])
df["DATA_NASCIMENTO"].unique()

array([1988., 1998., 1994., 1993., 2004., 1997., 1978., 1995., 1992.,
       1983., 1979., 1991., 1973., 1984., 1982., 1985., 1964., 1965.,
       2003., 1996., 2000., 2001., 1999., 1981., 1989., 1980., 1962.,
       1974., 1975., 1972., 1963., 1976., 1987., 1990., 2002., 1966.,
       1977., 1969., 1986., 1971., 1970., 1967., 1968.])

In [15]:
# ------------------------------------------
# anos de nascimento minimos e maximos apos tratamento
# ------------------------------------------
min = df['DATA_NASCIMENTO'].min()
max = df['DATA_NASCIMENTO'].max()
amp = max - min
print(min, max, amp)

1962.0 2004.0 42.0


In [16]:
# ------------------------------------------
# segmentacao em buckets
# ------------------------------------------
def birth_year_buckets(row):  
  if row['DATA_NASCIMENTO'] >= min and row['DATA_NASCIMENTO'] < min + amp/4:
      return 1
  elif row['DATA_NASCIMENTO'] >= min + amp/4 and row['DATA_NASCIMENTO'] < min + amp/4*2:
      return 2
  elif row['DATA_NASCIMENTO'] >= min + amp/4*2 and row['DATA_NASCIMENTO'] < min + amp/4*3:
      return 3
  return 4

df['DATA_NASCIMENTO_treated'] = df.apply(lambda row: birth_year_buckets(row), axis=1)

In [17]:
age_dummies = pd.get_dummies(df["DATA_NASCIMENTO_treated"])
df = pd.concat([df, age_dummies], 1)
df.rename(columns={1: "age-1st-qtile", 2: "age-2nd-qtile", 3: "age-3rd-qtile", 4: "age-4th-qtile"}, inplace=True)
df.drop(["DATA_NASCIMENTO_treated"], axis=1, inplace=True)

In [18]:
# ------------------------------------------
# CHECAGEM DO TRATAMENTO DA FEATURE
#
# DATA_NASCIMENTO : ajuste de observacoes esdruxulas
# age-1st-qtile : gerados dummies por quartil
# ------------------------------------------
df.head(2)

Unnamed: 0,GENERO,DATA_NASCIMENTO,IS_ACTIVE,TRANSPORTE,AUTO_ACEITE,COUNT_ORDERS_LAST_7D,COUNT_ORDERS_LAST_30D,COUNT_ORDERS_CANCELED_LAST_7D,COUNT_ORDERS_CANCELED_LAST_30D,GORJETA,...,DISCIPLINE_INCIDENTS,FRAUD_INCIDENTS,MANUAL_INCIDENTS,PERFORMANCE_INCIDENTS,WARNING_INCIDENTS,ORDERS_PER_HOURS_CONNECTED,age-1st-qtile,age-2nd-qtile,age-3rd-qtile,age-4th-qtile
0,0,1988.0,1,motorbike,True,1.0,1.0,0.0,0.0,0.0,...,,,,,,0.95815,0,0,1,0
1,0,1998.0,1,motorbike,True,7.0,7.0,0.0,0.0,38.0,...,,,,,,1.444707,0,0,0,1


In [19]:
# ------------------------------------------
# FEATURE : TRANSPORTE
# exibe lista de ocorrencias das variaveis
# qtd de ocorrencia de cada uma das categorias
# ------------------------------------------
print(df["TRANSPORTE"].unique())
df["TRANSPORTE"].value_counts()

['motorbike' 'bicycle' 'car' 'neither' 'cargo_van' 'motorbike_trailer']


motorbike            69686
bicycle              55375
car                  32481
neither                 77
cargo_van                6
motorbike_trailer        2
Name: TRANSPORTE, dtype: int64

In [20]:
# ------------------------------------------
# one hot encoding nos dados de transporte
# ------------------------------------------
transport_dummies = pd.get_dummies(df['TRANSPORTE'])
df = pd.concat([df, transport_dummies], axis=1)

In [21]:
# ------------------------------------------
# FEATURE : AUTO_ACEITE
# exibe lista de ocorrencias das variaveis
# qtd de ocorrencia de cada uma das categorias
# ------------------------------------------
print(df["AUTO_ACEITE"].unique())
df["AUTO_ACEITE"].value_counts(normalize = True)

[True False nan]


True     0.611649
False    0.388351
Name: AUTO_ACEITE, dtype: float64

In [22]:
# ------------------------------------------
# couriers com auto_aceite == nan deram churn definitivo
# CONCLUSAO : preencher nan com False
# ------------------------------------------
df[df["AUTO_ACEITE"].isnull()]["IS_ACTIVE"].value_counts()
df["AUTO_ACEITE"].fillna(False, inplace=True)

In [23]:
# ------------------------------------------
# substituicao de dados categoricos por numericos
# ------------------------------------------
df["AUTO_ACEITE"].replace({
    True : 1,
    False : 0
}, inplace=True) 

In [24]:
# ------------------------------------------
# CHECAGEM DO TRATAMENTO DA FEATURE
# ------------------------------------------
print(df["AUTO_ACEITE"].unique())
df["AUTO_ACEITE"].value_counts()

[1 0]


1    84755
0    72872
Name: AUTO_ACEITE, dtype: int64

In [25]:
# ------------------------------------------
# FEATURE : COUNT_ORDERS_LAST_7D
# exibe lista de ocorrencias das variaveis
# qtd de ocorrencia de cada uma das categorias
#
# 87,8% dos entregadores não possuiam entregas na semana
# ------------------------------------------
df["COUNT_ORDERS_LAST_7D"].value_counts(normalize = True)

0.0      0.878257
1.0      0.014947
2.0      0.009624
3.0      0.007518
4.0      0.006160
           ...   
175.0    0.000006
122.0    0.000006
193.0    0.000006
275.0    0.000006
182.0    0.000006
Name: COUNT_ORDERS_LAST_7D, Length: 154, dtype: float64

In [26]:
# ------------------------------------------
# maximo de entregas em uma semana
# ------------------------------------------
max = df["COUNT_ORDERS_LAST_7D"].max()
max

275.0

In [27]:
# ------------------------------------------
# segmentacao em buckets
# ------------------------------------------
def weekly_orders_buckets(row):  
  if row['COUNT_ORDERS_LAST_7D'] == 0:
      return 0
  elif row['COUNT_ORDERS_LAST_7D'] >= 1 and row['COUNT_ORDERS_LAST_7D'] < max * 0.05:
      return 1
  elif row['COUNT_ORDERS_LAST_7D'] >= max * 0.05 and row['COUNT_ORDERS_LAST_7D'] < max * 0.1:
      return 2
  elif row['COUNT_ORDERS_LAST_7D'] >= 0.1 and row['COUNT_ORDERS_LAST_7D'] < max * 0.25:
      return 3
  return 4

df['COUNT_ORDERS_LAST_7D_treated'] = df.apply(lambda row: weekly_orders_buckets(row), axis=1)
df.drop("COUNT_ORDERS_LAST_7D", axis=1, inplace=True)

In [28]:
# ------------------------------------------
# CHECAGEM DO TRATAMENTO DA FEATURE
# ------------------------------------------
print(df["COUNT_ORDERS_LAST_7D_treated"].unique())
df["COUNT_ORDERS_LAST_7D_treated"].value_counts(normalize=True)

[1 0 2 3 4]


0    0.878257
1    0.069937
2    0.024698
3    0.022851
4    0.004257
Name: COUNT_ORDERS_LAST_7D_treated, dtype: float64

In [29]:
# ------------------------------------------
# FEATURE : COUNT_ORDERS_LAST_30D
# exibe lista de ocorrencias das variaveis
# qtd de ocorrencia de cada uma das categorias
#
# 79,2% dos entregadores não possuiam entregas no mes
# ------------------------------------------
df["COUNT_ORDERS_LAST_30D"].value_counts(normalize = True)

0.0       0.792618
1.0       0.023162
2.0       0.014642
3.0       0.011020
4.0       0.008615
            ...   
425.0     0.000006
1102.0    0.000006
384.0     0.000006
430.0     0.000006
629.0     0.000006
Name: COUNT_ORDERS_LAST_30D, Length: 440, dtype: float64

In [30]:
# ------------------------------------------
# maximo de entregas em um mes
# ------------------------------------------
max = df["COUNT_ORDERS_LAST_30D"].max()
max

1102.0

In [31]:
# ------------------------------------------
# segmentacao em buckets
# ------------------------------------------
def monthly_orders_buckets(row):  
  if row['COUNT_ORDERS_LAST_30D'] == 0:
      return 0
  elif row['COUNT_ORDERS_LAST_30D'] >= 1 and row['COUNT_ORDERS_LAST_30D'] < max * 0.05:
      return 1
  elif row['COUNT_ORDERS_LAST_30D'] >= max * 0.05 and row['COUNT_ORDERS_LAST_30D'] < max * 0.1:
      return 2
  elif row['COUNT_ORDERS_LAST_30D'] >= 0.1 and row['COUNT_ORDERS_LAST_30D'] < max * 0.25:
      return 3
  return 4

df['COUNT_ORDERS_LAST_30D_treated'] = df.apply(lambda row: monthly_orders_buckets(row), axis=1)
df.drop("COUNT_ORDERS_LAST_30D", axis=1, inplace=True)

In [32]:
# ------------------------------------------
# CHECAGEM DO TRATAMENTO DA FEATURE
# ------------------------------------------
print(df["COUNT_ORDERS_LAST_30D_treated"].unique())
df["COUNT_ORDERS_LAST_30D_treated"].value_counts(normalize=True)

[1 0 2 3 4]


0    0.792618
1    0.156712
2    0.027629
3    0.020656
4    0.002385
Name: COUNT_ORDERS_LAST_30D_treated, dtype: float64

In [33]:
# ------------------------------------------
# FEATURE : COUNT_ORDERS_CANCELED_LAST_7D
# exibe lista de ocorrencias das variaveis
# qtd de ocorrencia de cada uma das categorias
#
# 94,9% dos entregadores não possuiam entregas canceladas na semana
# ------------------------------------------
df["COUNT_ORDERS_CANCELED_LAST_7D"].value_counts(normalize = True)

0.0     0.949197
1.0     0.031441
2.0     0.011185
3.0     0.004815
4.0     0.001929
5.0     0.000844
6.0     0.000343
7.0     0.000121
8.0     0.000063
9.0     0.000032
13.0    0.000019
10.0    0.000006
15.0    0.000006
Name: COUNT_ORDERS_CANCELED_LAST_7D, dtype: float64

In [34]:
# ------------------------------------------
# CHECAGEM DO TRATAMENTO DA FEATURE
#
# a feature foi dropada, por entendermos não possuir significancia para o modelo
# ------------------------------------------
df.drop("COUNT_ORDERS_CANCELED_LAST_7D", axis=1, inplace=True)

In [35]:
# ------------------------------------------
# FEATURE : COUNT_ORDERS_CANCELED_LAST_30D
# exibe lista de ocorrencias das variaveis
# qtd de ocorrencia de cada uma das categorias
#
# 88,2% dos entregadores não possuiam entregas canceladas no mes
# ------------------------------------------
df["COUNT_ORDERS_CANCELED_LAST_30D"].value_counts(normalize = True)

0.0     0.881791
1.0     0.048386
2.0     0.024171
3.0     0.015315
4.0     0.009713
5.0     0.006388
6.0     0.004187
7.0     0.003013
8.0     0.002068
9.0     0.001554
10.0    0.001053
11.0    0.000672
12.0    0.000476
13.0    0.000381
14.0    0.000241
15.0    0.000146
16.0    0.000140
17.0    0.000102
18.0    0.000063
20.0    0.000038
19.0    0.000032
24.0    0.000025
21.0    0.000013
39.0    0.000006
25.0    0.000006
32.0    0.000006
23.0    0.000006
22.0    0.000006
Name: COUNT_ORDERS_CANCELED_LAST_30D, dtype: float64

In [36]:
# ------------------------------------------
# CHECAGEM DO TRATAMENTO DA FEATURE
#
# a feature foi dropada, por entendermos não possuir significancia para o modelo
# ------------------------------------------
df.drop("COUNT_ORDERS_CANCELED_LAST_30D", axis=1, inplace=True)

In [37]:
# ------------------------------------------
# FEATURE : GORJETA
# 
# feature quantitativa de gorjetas recebidas por courier
# ------------------------------------------
df["GORJETA"].value_counts(normalize = True)

0.0       0.209158
3.0       0.046769
2.0       0.044485
6.0       0.024406
4.0       0.021817
            ...   
2380.0    0.000006
3478.0    0.000006
3282.0    0.000006
3839.0    0.000006
4001.0    0.000006
Name: GORJETA, Length: 4573, dtype: float64

In [38]:
# ------------------------------------------
# CHECAGEM DO TRATAMENTO DA FEATURE
#
# não existem campos nulos
# ------------------------------------------
df["GORJETA"].isna().value_counts()

False    157627
Name: GORJETA, dtype: int64

In [39]:
# ------------------------------------------
# FEATURES : PRIMEIRO_PEDIDO && ULTIMO_PEDIDO
# 
# registro das datas do primeiro pedido e do ultimo pedido realizado pelo courier
#
# tratamento proposto : calculo do delta entre os pedidos deve refletir a atuacao
# do courier na plataforma. Assim, substituir essas features por uma nova
# ------------------------------------------

In [40]:
from datetime import datetime

def time_delta(row):
  try:  
    first_order = datetime.strptime(df["PRIMEIRO_PEDIDO"][row.name][:10], '%Y-%m-%d').date()
    last_order = datetime.strptime(df["ULTIMO_PEDIDO"][row.name][:10], '%Y-%m-%d').date()
    return last_order - first_order
  except:
    return

df["TIME_DELTA_treated"] = df.apply(lambda row: time_delta(row), axis=1)
df.drop(["PRIMEIRO_PEDIDO", "ULTIMO_PEDIDO"], axis=1, inplace=True)

values = {"TIME_DELTA_treated": '0 days 00:00:00'}
df.fillna(value=values, inplace=True)

In [41]:
# ------------------------------------------
# CHECAGEM DO TRATAMENTO DA FEATURE
# ------------------------------------------
print(df["TIME_DELTA_treated"].isna().value_counts())
df['TIME_DELTA_treated'].value_counts()

False    157627
Name: TIME_DELTA_treated, dtype: int64


0 days      46333
1 days       5081
2 days       3011
3 days       2335
4 days       1981
            ...  
303 days       93
297 days       90
324 days       82
299 days       75
307 days       73
Name: TIME_DELTA_treated, Length: 366, dtype: int64

In [42]:
# ------------------------------------------
# FEATURE : COUNT_ORDERS_RESTAURANTES
# exibe lista de ocorrencias das variaveis
# qtd de ocorrencia de cada uma das categorias
#
# 35,0% dos entregadores não possuiam entregas para restaurantes no periodo
# ------------------------------------------
df["COUNT_ORDERS_RESTAURANTES"].value_counts(normalize = True)

0.0       0.350118
1.0       0.091577
2.0       0.052700
3.0       0.036491
4.0       0.029208
            ...   
923.0     0.000006
1255.0    0.000006
1354.0    0.000006
1473.0    0.000006
1401.0    0.000006
Name: COUNT_ORDERS_RESTAURANTES, Length: 1312, dtype: float64

In [43]:
# ------------------------------------------
# maximo de entregas em um mes
# ------------------------------------------
max = df["COUNT_ORDERS_RESTAURANTES"].max()
max

2411.0

In [44]:
# ------------------------------------------
# segmentacao em buckets
# ------------------------------------------
def restaurant_orders_buckets(row):  
  if row['COUNT_ORDERS_RESTAURANTES'] == 0:
      return 0
  elif row['COUNT_ORDERS_RESTAURANTES'] >= 1 and row['COUNT_ORDERS_RESTAURANTES'] < max * 0.05:
      return 1
  elif row['COUNT_ORDERS_RESTAURANTES'] >= max * 0.05 and row['COUNT_ORDERS_RESTAURANTES'] < max * 0.1:
      return 2
  elif row['COUNT_ORDERS_RESTAURANTES'] >= 0.1 and row['COUNT_ORDERS_RESTAURANTES'] < max * 0.25:
      return 3
  return 4

df['COUNT_ORDERS_RESTAURANTES_treated'] = df.apply(lambda row: restaurant_orders_buckets(row), axis=1)
df.drop("COUNT_ORDERS_RESTAURANTES", axis=1, inplace=True)

In [45]:
# ------------------------------------------
# CHECAGEM DO TRATAMENTO DA FEATURE
# ------------------------------------------
print(df["COUNT_ORDERS_RESTAURANTES_treated"].unique())
df["COUNT_ORDERS_RESTAURANTES_treated"].value_counts(normalize=True)

[1 0 2 3 4]


1    0.553807
0    0.350118
2    0.046616
3    0.037316
4    0.012143
Name: COUNT_ORDERS_RESTAURANTES_treated, dtype: float64

In [46]:
# ------------------------------------------
# FEATURE : COUNT_ORDERS_MERCADO
# exibe lista de ocorrencias das variaveis
# qtd de ocorrencia de cada uma das categorias
#
# 49,5% dos entregadores não possuiam entregas para restaurantes no periodo
# ------------------------------------------
df["COUNT_ORDERS_MERCADO"].value_counts(normalize = True)

0.0       0.495017
1.0       0.091666
2.0       0.047739
3.0       0.031651
4.0       0.024127
            ...   
1016.0    0.000006
856.0     0.000006
706.0     0.000006
758.0     0.000006
659.0     0.000006
Name: COUNT_ORDERS_MERCADO, Length: 784, dtype: float64

In [47]:
# ------------------------------------------
# maximo de entregas em um mes
# ------------------------------------------
max = df["COUNT_ORDERS_MERCADO"].max()
max

1845.0

In [48]:
# ------------------------------------------
# segmentacao em buckets
# ------------------------------------------
def market_orders_buckets(row):  
  if row['COUNT_ORDERS_MERCADO'] == 0:
      return 0
  elif row['COUNT_ORDERS_MERCADO'] >= 1 and row['COUNT_ORDERS_MERCADO'] < max * 0.05:
      return 1
  elif row['COUNT_ORDERS_MERCADO'] >= max * 0.05 and row['COUNT_ORDERS_MERCADO'] < max * 0.1:
      return 2
  elif row['COUNT_ORDERS_MERCADO'] >= 0.1 and row['COUNT_ORDERS_MERCADO'] < max * 0.25:
      return 3
  return 4

df['COUNT_ORDERS_MERCADO_treated'] = df.apply(lambda row: market_orders_buckets(row), axis=1)
df.drop("COUNT_ORDERS_MERCADO", axis=1, inplace=True)

In [49]:
# ------------------------------------------
# CHECAGEM DO TRATAMENTO DA FEATURE
# ------------------------------------------
print(df["COUNT_ORDERS_MERCADO_treated"].unique())
df["COUNT_ORDERS_MERCADO_treated"].value_counts(normalize=True)

[0 1 2 3 4]


0    0.495017
1    0.447937
2    0.031524
3    0.021513
4    0.004009
Name: COUNT_ORDERS_MERCADO_treated, dtype: float64

In [50]:
# ------------------------------------------
# FEATURE : COUNT_ORDERS_FARMACIA
# exibe lista de ocorrencias das variaveis
# qtd de ocorrencia de cada uma das categorias
#
# 46,0% dos entregadores não possuiam entregas para restaurantes no periodo
# ------------------------------------------
df["COUNT_ORDERS_FARMACIA"].value_counts(normalize = True)

0.0      0.460016
1.0      0.104233
2.0      0.053436
3.0      0.035432
4.0      0.026277
           ...   
750.0    0.000006
960.0    0.000006
800.0    0.000006
539.0    0.000006
590.0    0.000006
Name: COUNT_ORDERS_FARMACIA, Length: 605, dtype: float64

In [51]:
# ------------------------------------------
# maximo de entregas em um mes
# ------------------------------------------
max = df["COUNT_ORDERS_FARMACIA"].max()
max

1452.0

In [52]:
# ------------------------------------------
# segmentacao em buckets
# ------------------------------------------
def pharmacy_orders_buckets(row):  
  if row['COUNT_ORDERS_FARMACIA'] == 0:
      return 0
  elif row['COUNT_ORDERS_FARMACIA'] >= 1 and row['COUNT_ORDERS_FARMACIA'] < max * 0.05:
      return 1
  elif row['COUNT_ORDERS_FARMACIA'] >= max * 0.05 and row['COUNT_ORDERS_FARMACIA'] < max * 0.1:
      return 2
  elif row['COUNT_ORDERS_FARMACIA'] >= 0.1 and row['COUNT_ORDERS_FARMACIA'] < max * 0.25:
      return 3
  return 4

df['COUNT_ORDERS_FARMACIA_treated'] = df.apply(lambda row: pharmacy_orders_buckets(row), axis=1)
df.drop("COUNT_ORDERS_FARMACIA", axis=1, inplace=True)

In [53]:
# ------------------------------------------
# CHECAGEM DO TRATAMENTO DA FEATURE
# ------------------------------------------
print(df["COUNT_ORDERS_FARMACIA_treated"].unique())
df["COUNT_ORDERS_FARMACIA_treated"].value_counts(normalize=True)

[0 1 2 3 4]


1    0.474874
0    0.460016
2    0.038572
3    0.023518
4    0.003020
Name: COUNT_ORDERS_FARMACIA_treated, dtype: float64

In [54]:
# ------------------------------------------
# FEATURE : COUNT_ORDERS_EXPRESS
# exibe lista de ocorrencias das variaveis
# qtd de ocorrencia de cada uma das categorias
#
# 55,4% dos entregadores não possuiam entregas para restaurantes no periodo
# ------------------------------------------
df["COUNT_ORDERS_EXPRESS"].value_counts(normalize = True)

0.0      0.553649
1.0      0.092510
2.0      0.048298
3.0      0.030889
4.0      0.023562
           ...   
532.0    0.000006
289.0    0.000006
700.0    0.000006
402.0    0.000006
386.0    0.000006
Name: COUNT_ORDERS_EXPRESS, Length: 462, dtype: float64

In [55]:
# ------------------------------------------
# maximo de entregas em um mes
# ------------------------------------------
max = df["COUNT_ORDERS_EXPRESS"].max()
max

1072.0

In [56]:
# ------------------------------------------
# segmentacao em buckets
# ------------------------------------------
def express_orders_buckets(row):  
  if row['COUNT_ORDERS_EXPRESS'] == 0:
      return 0
  elif row['COUNT_ORDERS_EXPRESS'] >= 1 and row['COUNT_ORDERS_EXPRESS'] < max * 0.05:
      return 1
  elif row['COUNT_ORDERS_EXPRESS'] >= max * 0.05 and row['COUNT_ORDERS_EXPRESS'] < max * 0.1:
      return 2
  elif row['COUNT_ORDERS_EXPRESS'] >= 0.1 and row['COUNT_ORDERS_EXPRESS'] < max * 0.25:
      return 3
  return 4

df['COUNT_ORDERS_EXPRESS_treated'] = df.apply(lambda row: express_orders_buckets(row), axis=1)
df.drop("COUNT_ORDERS_EXPRESS", axis=1, inplace=True)

In [57]:
# ------------------------------------------
# CHECAGEM DO TRATAMENTO DA FEATURE
# ------------------------------------------
print(df["COUNT_ORDERS_EXPRESS_treated"].unique())
df["COUNT_ORDERS_EXPRESS_treated"].value_counts(normalize=True)

[0 1 2 3 4]


0    0.553649
1    0.394996
2    0.031124
3    0.017624
4    0.002607
Name: COUNT_ORDERS_EXPRESS_treated, dtype: float64

In [58]:
# ------------------------------------------
# FEATURE : COUNT_ORDERS_ECOMMERCE
# exibe lista de ocorrencias das variaveis
# qtd de ocorrencia de cada uma das categorias
#
# 59,3% dos entregadores não possuiam entregas para restaurantes no periodo
# ------------------------------------------
df["COUNT_ORDERS_ECOMMERCE"].value_counts(normalize = True)

0.0      0.592612
1.0      0.128190
2.0      0.063629
3.0      0.039403
4.0      0.027726
           ...   
137.0    0.000007
247.0    0.000007
118.0    0.000007
102.0    0.000007
149.0    0.000007
Name: COUNT_ORDERS_ECOMMERCE, Length: 141, dtype: float64

In [59]:
# ------------------------------------------
# maximo de entregas em um mes
# ------------------------------------------
max = df["COUNT_ORDERS_ECOMMERCE"].max()
max

315.0

In [60]:
# ------------------------------------------
# segmentacao em buckets
# ------------------------------------------
def ecomm_orders_buckets(row):  
  if row['COUNT_ORDERS_ECOMMERCE'] == 0:
      return 0
  elif row['COUNT_ORDERS_ECOMMERCE'] >= 1 and row['COUNT_ORDERS_ECOMMERCE'] < max * 0.05:
      return 1
  elif row['COUNT_ORDERS_ECOMMERCE'] >= max * 0.05 and row['COUNT_ORDERS_ECOMMERCE'] < max * 0.1:
      return 2
  elif row['COUNT_ORDERS_ECOMMERCE'] >= 0.1 and row['COUNT_ORDERS_ECOMMERCE'] < max * 0.25:
      return 3
  return 4

df['COUNT_ORDERS_ECOMMERCE_treated'] = df.apply(lambda row: ecomm_orders_buckets(row), axis=1)
df.drop("COUNT_ORDERS_ECOMMERCE", axis=1, inplace=True)

In [61]:
# ------------------------------------------
# CHECAGEM DO TRATAMENTO DA FEATURE
# ------------------------------------------
print(df["COUNT_ORDERS_ECOMMERCE_treated"].unique())
df["COUNT_ORDERS_ECOMMERCE_treated"].value_counts(normalize=True)

[0 1 2 3 4]


0    0.520958
1    0.319457
4    0.121946
2    0.026252
3    0.011388
Name: COUNT_ORDERS_ECOMMERCE_treated, dtype: float64

In [62]:
# ------------------------------------------
# FEATURE : COUNT_ORDERS_ANTOJO
# exibe lista de ocorrencias das variaveis
# qtd de ocorrencia de cada uma das categorias
#
# 79,6% dos entregadores não possuiam entregas para restaurantes no periodo
# ------------------------------------------
df["COUNT_ORDERS_ANTOJO"].value_counts(normalize = True)

0.0      0.795409
1.0      0.074086
2.0      0.033294
3.0      0.019362
4.0      0.013475
           ...   
93.0     0.000006
113.0    0.000006
100.0    0.000006
80.0     0.000006
91.0     0.000006
Name: COUNT_ORDERS_ANTOJO, Length: 104, dtype: float64

In [63]:
# ------------------------------------------
# CHECAGEM DO TRATAMENTO DA FEATURE
# 
# drop da feature por considerar insignificante para o modelo
# ------------------------------------------
df.drop("COUNT_ORDERS_ANTOJO", axis=1, inplace=True)

In [64]:
# ------------------------------------------
# FEATURE : FRETE_MEDIO
# 
# preenchimento de dados missing com média
# ------------------------------------------
df["FRETE_MEDIO"].fillna((df['FRETE_MEDIO'].mean()), inplace=True)

In [65]:
# ------------------------------------------
# CHECAGEM DO TRATAMENTO DA FEATURE
# ------------------------------------------
print(df["FRETE_MEDIO"].isna().unique())
df["FRETE_MEDIO"].value_counts(normalize=True)

[False]


41.400223    0.120925
44.055000    0.010480
61.855000    0.009484
44.455500    0.006281
62.255500    0.006274
               ...   
38.587583    0.000006
34.175390    0.000006
50.142293    0.000006
50.078393    0.000006
33.712545    0.000006
Name: FRETE_MEDIO, Length: 82906, dtype: float64

In [66]:
# ------------------------------------------
# FEATURE : COOKING_TIME_MEDIO
# 
# preenchimento de dados missing com média
# ------------------------------------------
df["COOKING_TIME_MEDIO"].fillna((df['COOKING_TIME_MEDIO'].mean()), inplace=True)

In [67]:
# ------------------------------------------
# CHECAGEM DO TRATAMENTO DA FEATURE
# ------------------------------------------
print(df["COOKING_TIME_MEDIO"].isna().unique())
df["COOKING_TIME_MEDIO"].value_counts(normalize=True)

[False]


23.691503     0.208651
120.000000    0.022686
12.000000     0.012143
8.000000      0.011216
10.000000     0.010956
                ...   
0.489796      0.000006
18.707031     0.000006
37.657895     0.000006
46.600000     0.000006
11.650307     0.000006
Name: COOKING_TIME_MEDIO, Length: 54096, dtype: float64

In [68]:
# ------------------------------------------
# FEATURE : ITENS_MEDIO
# 
# preenchimento de dados missing com média
# ------------------------------------------
df["ITENS_MEDIO"].fillna((df['ITENS_MEDIO'].mean()), inplace=True)

In [69]:
# ------------------------------------------
# CHECAGEM DO TRATAMENTO DA FEATURE
# ------------------------------------------
print(df["ITENS_MEDIO"].isna().unique())
df["ITENS_MEDIO"].value_counts(normalize=True)

[False]


7.789481     0.122378
1.000000     0.047295
2.000000     0.041947
3.000000     0.025186
4.000000     0.015384
               ...   
4.516260     0.000006
45.875000    0.000006
14.305051    0.000006
15.018182    0.000006
4.215951     0.000006
Name: ITENS_MEDIO, Length: 46359, dtype: float64

In [70]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157627 entries, 0 to 157626
Data columns (total 53 columns):
 #   Column                             Non-Null Count   Dtype          
---  ------                             --------------   -----          
 0   GENERO                             157627 non-null  int64          
 1   DATA_NASCIMENTO                    157627 non-null  float64        
 2   IS_ACTIVE                          157627 non-null  int64          
 3   TRANSPORTE                         157627 non-null  object         
 4   AUTO_ACEITE                        157627 non-null  int64          
 5   GORJETA                            157627 non-null  float64        
 6   FRETE_MEDIO                        157627 non-null  float64        
 7   COOKING_TIME_MEDIO                 157627 non-null  float64        
 8   ITENS_MEDIO                        157627 non-null  float64        
 9   DISTANCE_TO_USER                   146836 non-null  float64        
 10  ORDERS_D

In [71]:
df.head()

Unnamed: 0,GENERO,DATA_NASCIMENTO,IS_ACTIVE,TRANSPORTE,AUTO_ACEITE,GORJETA,FRETE_MEDIO,COOKING_TIME_MEDIO,ITENS_MEDIO,DISTANCE_TO_USER,...,motorbike_trailer,neither,COUNT_ORDERS_LAST_7D_treated,COUNT_ORDERS_LAST_30D_treated,TIME_DELTA_treated,COUNT_ORDERS_RESTAURANTES_treated,COUNT_ORDERS_MERCADO_treated,COUNT_ORDERS_FARMACIA_treated,COUNT_ORDERS_EXPRESS_treated,COUNT_ORDERS_ECOMMERCE_treated
0,0,1988.0,1,motorbike,1,0.0,62.2555,10.0,1.0,3.161503,...,0,0,1,1,0 days,1,0,0,0,0
1,0,1998.0,1,motorbike,1,38.0,43.444714,23.142857,3.0,4.07912,...,0,0,1,1,0 days,1,1,0,0,0
2,1,1994.0,1,motorbike,1,15.0,47.23675,7.25,1.25,4.229596,...,0,0,1,1,0 days,1,0,0,0,0
3,0,1993.0,1,motorbike,1,0.0,35.5555,10.0,2.5,2.464864,...,0,0,1,1,0 days,1,0,0,0,0
4,1,2004.0,1,bicycle,1,0.0,57.8055,23.691503,7.789481,1.986454,...,0,0,0,0,0 days,0,0,0,0,0


In [72]:
# ------------------------------------------
# FEATURE : ORDERS_DONE
# 
# preenchimento de dados missing com média
# ------------------------------------------
df["ORDERS_DONE"].fillna((df['ORDERS_DONE'].mean()), inplace = True)

In [73]:
# ------------------------------------------
# CHECAGEM DO TRATAMENTO DA FEATURE
# ------------------------------------------
print(df["ORDERS_DONE"].isna().unique())
df["ORDERS_DONE"].value_counts(normalize=True)

[False]


192.198982     0.091501
1.000000       0.080056
2.000000       0.047257
0.000000       0.042588
3.000000       0.034892
                 ...   
2342.000000    0.000006
2090.000000    0.000006
2005.000000    0.000006
3231.000000    0.000006
3710.000000    0.000006
Name: ORDERS_DONE, Length: 3122, dtype: float64

In [74]:
# ------------------------------------------
# FEATURE : ORDERS_CANCEL
# 
# preenchimento de dados missing com média
# ------------------------------------------
df["ORDERS_CANCEL"].fillna((df['ORDERS_CANCEL'].mean()), inplace = True)

In [75]:
# ------------------------------------------
# CHECAGEM DO TRATAMENTO DA FEATURE
# ------------------------------------------
print(df["ORDERS_CANCEL"].isna().unique())
df["ORDERS_CANCEL"].value_counts(normalize=True)

[False]


0.000000      0.257576
1.000000      0.171259
6.706084      0.091501
2.000000      0.088716
3.000000      0.058664
                ...   
272.000000    0.000006
179.000000    0.000006
135.000000    0.000006
139.000000    0.000006
163.000000    0.000006
Name: ORDERS_CANCEL, Length: 183, dtype: float64

In [76]:
# ------------------------------------------
# FEATURE : CANCELS_OPS_RT
# 
# preenchimento de dados missing com 0, consideramos que 
# para os dados faltantes, o entregador não teve pedidos
# cancelados pelo time de operação 
# ------------------------------------------
df["CANCELS_OPS_RT"].fillna(value = 0, inplace = True)

In [77]:
# ------------------------------------------
# CHECAGEM DO TRATAMENTO DA FEATURE
# ------------------------------------------
print(df["CANCELS_OPS_RT"].isna().unique())
df["CANCELS_OPS_RT"].value_counts(normalize=True)

[False]


0.0     0.575212
1.0     0.182050
2.0     0.079174
3.0     0.044320
4.0     0.028599
          ...   
69.0    0.000006
49.0    0.000006
60.0    0.000006
83.0    0.000006
84.0    0.000006
Name: CANCELS_OPS_RT, Length: 74, dtype: float64

In [78]:
# ------------------------------------------
# FEATURE : EARNINGS 
# 
# preenchimento de dados missing com 0, considerando que 
# os dados faltantes representam ~11,3% da coluna
# desta forma, pretendemos não enviesar o dataset 
# com novos valores
# ------------------------------------------
df["EARNINGS"].fillna(value = 0, inplace = True)

In [79]:
# ------------------------------------------
# CHECAGEM DO TRATAMENTO DA FEATURE
# ------------------------------------------
print(df["EARNINGS"].isna().unique())
df["EARNINGS"].value_counts(normalize=True)

[False]


0.000000      0.113274
5.000000      0.027432
10.000000     0.008406
15.000000     0.003553
4.500000      0.001865
                ...   
97.408333     0.000006
531.753333    0.000006
950.345556    0.000006
449.051111    0.000006
3.710000      0.000006
Name: EARNINGS, Length: 84517, dtype: float64

In [80]:
# ------------------------------------------
# FEATURE : Tips 
# 
# preenchimento de dados missing com 0, considerando que 
# os dados faltantes representam ~11,2% da coluna,
# desta forma, pretendemos não enviesar o dataset 
# com novos valores
# ------------------------------------------
df["TIPS"].fillna(value = 0, inplace = True)

In [81]:
# ------------------------------------------
# CHECAGEM DO TRATAMENTO DA FEATURE
# ------------------------------------------
print(df["TIPS"].isna().unique())
df["TIPS"].value_counts(normalize=True)

[False]


0.000000      0.198773
3.000000      0.048386
2.000000      0.035686
6.000000      0.022534
4.000000      0.021583
                ...   
69.125000     0.000006
65.636364     0.000006
390.666667    0.000006
304.333333    0.000006
83.195000     0.000006
Name: TIPS, Length: 12687, dtype: float64

In [82]:
# Unindo as colunas EARNINGS + TIPS
df['total_earnings'] = df["EARNINGS"] + df["TIPS"]
df.drop(["EARNINGS", "TIPS"], axis=1, inplace=True)

In [83]:
# ------------------------------------------
# FEATURE : DISTANCE_TO_USER
# 
# preenchimento de dados missing com 0, tendo em vista 
# que os dados faltantes representam ~7% da coluna e 
# pretendemos não enviesar o dataset com novos valores.
# ------------------------------------------
df["DISTANCE_TO_USER"].fillna(value = 0, inplace = True)

In [84]:
# ------------------------------------------
# FEATURE : PUNISHMENT_MINUTES
# 
# preenchimento de dados missing com 0, tendo em vista 
# que os dados faltantes representam ~22,3% da coluna e 
# pretendemos não enviesar o dataset com novos valores.
# ------------------------------------------
df["PUNISHMENT_MINUTES"].fillna(value = 0, inplace = True)

In [85]:
# ------------------------------------------
# CHECAGEM DO TRATAMENTO DA FEATURE
# ------------------------------------------
print(df["PUNISHMENT_MINUTES"].isna().unique())
df["PUNISHMENT_MINUTES"].value_counts(normalize=True)

[False]


0.0            0.306705
30.0           0.036193
15.0           0.033751
60.0           0.024158
360.0          0.021957
                 ...   
108008880.0    0.000006
129601980.0    0.000006
144000450.0    0.000006
21633120.0     0.000006
144008640.0    0.000006
Name: PUNISHMENT_MINUTES, Length: 5650, dtype: float64

In [86]:
# ------------------------------------------
# Decorre que o tratamento das seguintes features é feito, 
# levando em consideração a hipótese que para os dados faltantes
# não houve nenhum tipo de penalização do entregador, além do
# mais, por representarem mais de 20% do dataset, seu preenchimento
# com quaisquer outros valores numéricos poderia prejudicar o
# processo de machine learning
# ------------------------------------------

In [87]:
# ------------------------------------------
# FEATURE : TEMPORARY_BLOCKS
# 
# preenchimento de dados missing com 0, tendo em vista 
# que os dados faltantes representam ~22,2% da coluna e 
# pretendemos não enviesar o dataset com novos valores.
# ------------------------------------------
df["TEMPORARY_BLOCKS"].fillna(value = 0, inplace = True)

In [88]:
# ------------------------------------------
# CHECAGEM DO TRATAMENTO DA FEATURE
# ------------------------------------------
print(df["TEMPORARY_BLOCKS"].isna().unique())

[False]


In [89]:
# ------------------------------------------
# FEATURE : WARNINGS
# 
# preenchimento de dados missing com 0, tendo em vista 
# que os dados faltantes representam ~22,2% da coluna e 
# pretendemos não enviesar o dataset com novos valores.
# ------------------------------------------
df["WARNINGS"].fillna(value = 0, inplace = True)

In [90]:
# ------------------------------------------
# CHECAGEM DO TRATAMENTO DA FEATURE
# ------------------------------------------
print(df["WARNINGS"].isna().unique())

[False]


In [91]:
# ------------------------------------------
# ------------------------------------------
# ------------------------------------------
#
# SECAO DE TRATAMENTO DE INCIDENTES
#
# AS FEATURES FORAM TRATADAS DE MODO CONJUNTO E AGRUPADOS COM PESOS PROPORCIONAIS
# A GRAVIDADE A RECORRENCIA DOS INCIDENTES
# ------------------------------------------
# ------------------------------------------
# ------------------------------------------

In [92]:
# ------------------------------------------
# dados faltantes representam ~22,2% dos incidentes
# ------------------------------------------
pct_na = round(df["DISCIPLINE_INCIDENTS"].describe()[0] / df.shape[0] * 100, 1)
print(f"Dados missing - discipline incidents: {pct_na}%")
# ------------------------------------------
pct_na = round(df["FRAUD_INCIDENTS"].describe()[0] / df.shape[0] * 100, 1)
print(f"Dados missing - fraud incidents: {pct_na}%")
# ------------------------------------------
pct_na = round(df["MANUAL_INCIDENTS"].describe()[0] / df.shape[0] * 100, 1)
print(f"Dados missing - manual incidents: {pct_na}%")
# ------------------------------------------
pct_na = round(df["PERFORMANCE_INCIDENTS"].describe()[0] / df.shape[0] * 100, 1)
print(f"Dados missing - performance incidents: {pct_na}%")
# ------------------------------------------
pct_na = round(df["WARNING_INCIDENTS"].describe()[0] / df.shape[0] * 100, 1)
print(f"Dados missing - warning incidents: {pct_na}%")

Dados missing - discipline incidents: 77.7%
Dados missing - fraud incidents: 77.7%
Dados missing - manual incidents: 77.7%
Dados missing - performance incidents: 77.7%


In [93]:
# ------------------------------------------
# discipline incidents apresenta couriers com alto numero de penalidades
# ------------------------------------------
df["DISCIPLINE_INCIDENTS"].describe()

count    122518.000000
mean         14.324467
std          25.191736
min           0.000000
25%           2.000000
50%           5.000000
75%          15.000000
max         487.000000
Name: DISCIPLINE_INCIDENTS, dtype: float64

In [94]:
# ------------------------------------------
# segmentacao em buckets de discipline incidents
#
# CLASSE 0 == dados nulos
# ------------------------------------------
max = df["DISCIPLINE_INCIDENTS"].describe()[7]

def discipline_incidents_buckets(row):  
  if row['DISCIPLINE_INCIDENTS'] == 0:
      return 1
  elif row['DISCIPLINE_INCIDENTS'] > 0 and row['DISCIPLINE_INCIDENTS'] < 6:
      return 2
  elif row['DISCIPLINE_INCIDENTS'] >= 6 and row['DISCIPLINE_INCIDENTS'] < 11:
      return 3
  elif row['DISCIPLINE_INCIDENTS'] >= 11 and row['DISCIPLINE_INCIDENTS'] < 21:
      return 4
  elif row['DISCIPLINE_INCIDENTS'] >= 21 and row['DISCIPLINE_INCIDENTS'] < max:
      return 5
  return 0

df['DISCIPLINE_INCIDENTS_treated'] = df.apply(lambda row: discipline_incidents_buckets(row), axis=1)

In [95]:
# ------------------------------------------
# fraud incidents apresenta couriers com alto numero de penalidades
# ------------------------------------------
df["FRAUD_INCIDENTS"].describe()

count    122518.000000
mean          0.431586
std           1.656604
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max         139.000000
Name: FRAUD_INCIDENTS, dtype: float64

In [96]:
# ------------------------------------------
# segmentacao em buckets de fraud incidents
#
# CLASSE 0 == dados nulos
# ------------------------------------------
max = df["FRAUD_INCIDENTS"].describe()[7]

def discipline_incidents_buckets(row):  
  if row['FRAUD_INCIDENTS'] == 0:
      return 1
  elif row['FRAUD_INCIDENTS'] > 0 and row['FRAUD_INCIDENTS'] < 6:
      return 2
  elif row['FRAUD_INCIDENTS'] >= 6 and row['FRAUD_INCIDENTS'] < 11:
      return 3
  elif row['FRAUD_INCIDENTS'] >= 11 and row['FRAUD_INCIDENTS'] < 21:
      return 4
  elif row['FRAUD_INCIDENTS'] >= 21 and row['FRAUD_INCIDENTS'] < max:
      return 5
  return 0

df['FRAUD_INCIDENTS_treated'] = df.apply(lambda row: discipline_incidents_buckets(row), axis=1)

In [97]:
# ------------------------------------------
# manual incidents NAO apresenta couriers com alto numero de penalidades
# PORTANTO, sera dropado do dataset
# ------------------------------------------
df["MANUAL_INCIDENTS"].value_counts()

0.0     107229
1.0      11290
2.0       2645
3.0        825
4.0        309
5.0        121
6.0         54
7.0         24
8.0         14
10.0         5
9.0          2
Name: MANUAL_INCIDENTS, dtype: int64

In [98]:
# ------------------------------------------
# performance incidents NAO apresenta couriers com alto numero de penalidades
# PORTANTO, sera dropado do dataset
# ------------------------------------------
df["PERFORMANCE_INCIDENTS"].value_counts()

0.0    120982
1.0       831
2.0       400
3.0       304
5.0         1
Name: PERFORMANCE_INCIDENTS, dtype: int64

In [99]:
# ------------------------------------------
# performance incidents NAO apresenta couriers com alto numero de penalidades
# PORTANTO, sera dropado do dataset
# ------------------------------------------
df["WARNING_INCIDENTS"].value_counts()

0.0    104263
1.0      4522
2.0      3694
3.0      3212
4.0      3056
5.0      3039
6.0       590
7.0        90
8.0        51
9.0         1

In [100]:
# ------------------------------------------
# checagem de dados nulos
# ------------------------------------------
na_before = sum(df["DISCIPLINE_INCIDENTS"].isna())
na_after = sum(df["DISCIPLINE_INCIDENTS_treated"] == 0)
print(f"Discipline incidents - Dados nulos antes do ajuste: {na_before}")
print(f"Discipline incidents - Dados nulos depois do ajuste: {na_after}")
# ------------------------------------------
na_before = sum(df["FRAUD_INCIDENTS"].isna())
na_after = sum(df["FRAUD_INCIDENTS_treated"] == 0)
print(f"Fraud incidents - Dados nulos antes do ajuste: {na_before}")
print(f"Fraud incidents - Dados nulos depois do ajuste: {na_after}")

Discipline incidents - Dados nulos antes do ajuste: 35109
Discipline incidents - Dados nulos depois do ajuste: 35110
Fraud incidents - Dados nulos antes do ajuste: 35109
Fraud incidents - Dados nulos depois do ajuste: 35110


In [101]:
# ------------------------------------------
# drop das features nao relevantes
# ------------------------------------------
df.drop(["DISCIPLINE_INCIDENTS", "FRAUD_INCIDENTS", "MANUAL_INCIDENTS", "PERFORMANCE_INCIDENTS", "WARNING_INCIDENTS"], axis=1, inplace=True)

In [102]:
df.columns

Index(['GENERO', 'DATA_NASCIMENTO', 'IS_ACTIVE', 'TRANSPORTE', 'AUTO_ACEITE',
       'GORJETA', 'FRETE_MEDIO', 'COOKING_TIME_MEDIO', 'ITENS_MEDIO',
       'DISTANCE_TO_USER', 'ORDERS_DONE', 'ORDERS_CANCEL', 'CANCELS_OPS_RT',
       'ORDERS', 'GMV_TOTAL', 'COMPENSATIONS', 'LEVEL_NAME', 'TOTAL_TICKETS',
       'RES_TIME_TOTAL', 'RES_TIME_MEAN', 'ACCEPTANCE_RATE', 'PRODUCT_RETURNS',
       'N°_PEDIDOS', 'PUNISHMENT_MINUTES', 'PERMANENT_BLOCK',
       'age-1st-qtile', 'age-2nd-qtile', 'age-3rd-qtile', 'age-4th-qtile',
       'bicycle', 'car', 'cargo_van', 'motorbike', 'motorbike_trailer',
       'neither', 'COUNT_ORDERS_LAST_7D_treated',
       'COUNT_ORDERS_LAST_30D_treated', 'TIME_DELTA_treated',
       'COUNT_ORDERS_RESTAURANTES_treated', 'COUNT_ORDERS_MERCADO_treated',
       'COUNT_ORDERS_FARMACIA_treated', 'COUNT_ORDERS_EXPRESS_treated',
       'COUNT_ORDERS_ECOMMERCE_treated', 'total_earnings',
       'DISCIPLINE_INCIDENTS_treated', 'FRAUD_INCIDENTS_treated'],
      dtype='object')

In [103]:
# ------------------------------------------
# CHECAGEM DO TRATAMENTO DA FEATURE
# ------------------------------------------
df['total_incidents'] = df["DISCIPLINE_INCIDENTS_treated"] + df["FRAUD_INCIDENTS_treated"]
incidents_dummies = pd.get_dummies(df["total_incidents"])
df = pd.concat([df, incidents_dummies], 1)
df.rename(columns={
    0: "incidents_na",
    1: "incidents_1",
    2: "incidents_2",
    3: "incidents_3",
    4: "incidents_4",
    5: "incidents_5",
    6: "incidents_6",
    7: "incidents_7",
    8: "incidents_8",
    9: "incidents_9",
    10: "incidents_10"
    }, inplace=True)
df.drop(["DISCIPLINE_INCIDENTS_treated", "FRAUD_INCIDENTS_treated", "total_incidents"], axis=1, inplace=True)

In [104]:
df.columns

Index(['GENERO', 'DATA_NASCIMENTO', 'IS_ACTIVE', 'TRANSPORTE', 'AUTO_ACEITE',
       'GORJETA', 'FRETE_MEDIO', 'COOKING_TIME_MEDIO', 'ITENS_MEDIO',
       'DISTANCE_TO_USER', 'ORDERS_DONE', 'ORDERS_CANCEL', 'CANCELS_OPS_RT',
       'ORDERS', 'GMV_TOTAL', 'COMPENSATIONS', 'LEVEL_NAME', 'TOTAL_TICKETS',
       'RES_TIME_TOTAL', 'RES_TIME_MEAN', 'ACCEPTANCE_RATE', 'PRODUCT_RETURNS',
       'N°_PEDIDOS', 'PUNISHMENT_MINUTES', 'PERMANENT_BLOCK',
       'age-1st-qtile', 'age-2nd-qtile', 'age-3rd-qtile', 'age-4th-qtile',
       'bicycle', 'car', 'cargo_van', 'motorbike', 'motorbike_trailer',
       'neither', 'COUNT_ORDERS_LAST_7D_treated',
       'COUNT_ORDERS_LAST_30D_treated', 'TIME_DELTA_treated',
       'COUNT_ORDERS_RESTAURANTES_treated', 'COUNT_ORDERS_MERCADO_treated',
       'COUNT_ORDERS_FARMACIA_treated', 'COUNT_ORDERS_EXPRESS_treated',
       'COUNT_ORDERS_ECOMMERCE_treated', 'total_earnings', 'incidents_na',
       'incidents_1', 'incidents_2', 'incidents_3', 'incidents_4',
      

In [105]:
# ------------------------------------------
# FEATURE : ACCEPTANCE_RATE
# 
# preenchimento de dados missing com a média
# ------------------------------------------
df["ACCEPTANCE_RATE"].fillna((df.ACCEPTANCE_RATE.mean()), inplace = True)

In [106]:
# ------------------------------------------
# CHECAGEM DO TRATAMENTO DA FEATURE
# ------------------------------------------
print(df["ACCEPTANCE_RATE"].isna().unique())

[False]


In [107]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157627 entries, 0 to 157626
Data columns (total 58 columns):
 #   Column                             Non-Null Count   Dtype          
---  ------                             --------------   -----          
 0   GENERO                             157627 non-null  int64          
 1   DATA_NASCIMENTO                    157627 non-null  float64        
 2   IS_ACTIVE                          157627 non-null  int64          
 3   TRANSPORTE                         157627 non-null  object         
 4   AUTO_ACEITE                        157627 non-null  int64          
 5   GORJETA                            157627 non-null  float64        
 6   FRETE_MEDIO                        157627 non-null  float64        
 7   COOKING_TIME_MEDIO                 157627 non-null  float64        
 8   ITENS_MEDIO                        157627 non-null  float64        
 9   DISTANCE_TO_USER                   157627 non-null  float64        
 10  ORDERS_D

In [108]:
# ------------------------------------------
# FEATURE : ACCEPTANCE_RATE
# 
# preenchimento de dados missing com a média
# ------------------------------------------
df["ACCEPTANCE_RATE"].fillna((df.ACCEPTANCE_RATE.mean()), inplace = True)

In [109]:
# ------------------------------------------
# CHECAGEM DO TRATAMENTO DA FEATURE
# ------------------------------------------
print(df["ACCEPTANCE_RATE"].isna().unique())

[False]


In [110]:
# ------------------------------------------
# FEATURE : ORDERS_PER_HOURS_CONNECTED
# 
# preenchimento de dados missing com a média
# ------------------------------------------
df.drop(df[df.ORDERS_PER_HOURS_CONNECTED > 3].index, inplace=True)
df["ORDERS_PER_HOURS_CONNECTED"].fillna((df.ORDERS_PER_HOURS_CONNECTED.mean()), inplace = True)

In [111]:
# ------------------------------------------
# CHECAGEM DO TRATAMENTO DA FEATURE
# ------------------------------------------
print(df["ORDERS_PER_HOURS_CONNECTED"].isna().unique())

[False]


In [112]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 157470 entries, 0 to 157626
Data columns (total 58 columns):
 #   Column                             Non-Null Count   Dtype          
---  ------                             --------------   -----          
 0   GENERO                             157470 non-null  int64          
 1   DATA_NASCIMENTO                    157470 non-null  float64        
 2   IS_ACTIVE                          157470 non-null  int64          
 3   TRANSPORTE                         157470 non-null  object         
 4   AUTO_ACEITE                        157470 non-null  int64          
 5   GORJETA                            157470 non-null  float64        
 6   FRETE_MEDIO                        157470 non-null  float64        
 7   COOKING_TIME_MEDIO                 157470 non-null  float64        
 8   ITENS_MEDIO                        157470 non-null  float64        
 9   DISTANCE_TO_USER                   157470 non-null  float64        
 10  ORDERS_D

In [113]:
# ---------------------------------------------------------
# Drop de colunas que foram julgadas desisnteressantes para 
# o desenvolvimento do modelo, pois apresentam informações
# que não implicam diretamente na probabilidade de churn 
# ou são irrelevantes dado a quantidade de dados faltantes
# ---------------------------------------------------------
df.drop(columns=['TOTAL_TICKETS', 'ORDERS', 'ORDERS_DONE', 'ORDERS_CANCEL', 'COMPENSATIONS', 
                 'GMV_TOTAL', 'GORJETA', 'RES_TIME_TOTAL', 'RES_TIME_MEAN', 'PERMANENT_BLOCK', 
                 'PRODUCT_RETURNS', 'N°_PEDIDOS', 'LEVEL_NAME', 'GENERO', 'DATA_NASCIMENTO',
                 'FRETE_MEDIO', 'COUNT_ORDERS_LAST_7D_treated', 'COUNT_ORDERS_LAST_30D_treated',
                 'TIME_DELTA_treated', 'COUNT_ORDERS_RESTAURANTES_treated', 'TRANSPORTE',
                 'COUNT_ORDERS_MERCADO_treated', 'COUNT_ORDERS_FARMACIA_treated',
                 'COUNT_ORDERS_EXPRESS_treated', 'COUNT_ORDERS_ECOMMERCE_treated'], inplace=True)

In [114]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 157470 entries, 0 to 157626
Data columns (total 33 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   IS_ACTIVE                   157470 non-null  int64  
 1   AUTO_ACEITE                 157470 non-null  int64  
 2   COOKING_TIME_MEDIO          157470 non-null  float64
 3   ITENS_MEDIO                 157470 non-null  float64
 4   DISTANCE_TO_USER            157470 non-null  float64
 5   CANCELS_OPS_RT              157470 non-null  float64
 6   ACCEPTANCE_RATE             157470 non-null  float64
 7   PUNISHMENT_MINUTES          157470 non-null  float64
 8   TEMPORARY_BLOCKS            157470 non-null  float64
 10  ORDERS_PER_HOURS_CONNECTED  157470 non-null  float64
 11  age-1st-qtile               157470 non-null  uint8  
 12  age-2nd-qtile               157470 non-null  uint8  
 13  age-3rd-qtile               157470 non-null  uint8  
 14  age-4th-qtile 

In [115]:
# ------------------------------------------
# Transformação do dataframe em um arquivo csv 
# reutilizável e sem informações comprometedores 
# dos RT's
# ------------------------------------------
df.to_csv("/content/drive/Shareddrives/grupo4-rappi-hour/bases-rappi/df-oficial.csv")