# Importações

In [561]:
import string
# Lowercase ASCII alphabet as a list of single-character strings.
letters = list(string.ascii_lowercase)

import warnings
# NOTE(review): this silences every warning for the whole notebook, including
# pandas' SettingWithCopyWarning; consider narrowing the filter.
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Do not truncate the column list when displaying wide frames.
pd.options.display.max_columns = None
# Apply seaborn's default plotting theme.
sns.set_theme()

# Ler dados

In [562]:
# Load the preprocessed dataset; the path is relative to the working directory.
df = pd.read_csv('../data/arquivo_preprocessado.csv')
# Cell output: (number of rows, number of columns).
df.shape

(299999, 42)

In [563]:
df.head()

Unnamed: 0,legId,searchDate,flightDate,startingAirport,destinationAirport,fareBasisCode,travelDuration,elapsedDays,isBasicEconomy,isRefundable,isNonStop,baseFare,totalFare,seatsRemaining,totalTravelDistance,segmentsDepartureTimeEpochSeconds,segmentsDepartureTimeRaw,segmentsArrivalTimeEpochSeconds,segmentsArrivalTimeRaw,segmentsArrivalAirportCode,segmentsDepartureAirportCode,segmentsAirlineName,segmentsAirlineCode,segmentsEquipmentDescription,segmentsDurationInSeconds,segmentsDistance,segmentsCabinCode,daysDifferenceBetweenSearchAndFlightDate,travelDurationInSeconds,taxesFare,segment1AirlineName,segment2AirlineName,segment3AirlineName,segment4AirlineName,segment1CabinCode,segment2CabinCode,segment3CabinCode,segment4CabinCode,segment1EquipmentDescription,segment2EquipmentDescription,segment3EquipmentDescription,segment4EquipmentDescription
0,9ca0e81111c683bec1012473feefd28f,2022-04-16,2022-04-17,ATL,BOS,LA0NX0MC,PT2H29M,0,False,False,True,217.67,248.6,9,947.0,1650214620,2022-04-17T12:57:00.000-04:00,1650223560,2022-04-17T15:26:00.000-04:00,BOS,ATL,Delta,DL,Airbus A321,8940,947,coach,1,8940.0,30.93,Delta,,,,coach,,,,Airbus A321,,,
1,98685953630e772a098941b71906592b,2022-04-16,2022-04-17,ATL,BOS,LA0NX0MC,PT2H30M,0,False,False,True,217.67,248.6,4,947.0,1650191400,2022-04-17T06:30:00.000-04:00,1650200400,2022-04-17T09:00:00.000-04:00,BOS,ATL,Delta,DL,Airbus A321,9000,947,coach,1,9000.0,30.93,Delta,,,,coach,,,,Airbus A321,,,
2,98d90cbc32bfbb05c2fc32897c7c1087,2022-04-16,2022-04-17,ATL,BOS,LA0NX0MC,PT2H30M,0,False,False,True,217.67,248.6,9,947.0,1650209700,2022-04-17T11:35:00.000-04:00,1650218700,2022-04-17T14:05:00.000-04:00,BOS,ATL,Delta,DL,Boeing 757-200,9000,947,coach,1,9000.0,30.93,Delta,,,,coach,,,,Boeing 757-200,,,
3,969a269d38eae583f455486fa90877b4,2022-04-16,2022-04-17,ATL,BOS,LA0NX0MC,PT2H32M,0,False,False,True,217.67,248.6,8,947.0,1650218340,2022-04-17T13:59:00.000-04:00,1650227460,2022-04-17T16:31:00.000-04:00,BOS,ATL,Delta,DL,Airbus A321,9120,947,coach,1,9120.0,30.93,Delta,,,,coach,,,,Airbus A321,,,
4,980370cf27c89b40d2833a1d5afc9751,2022-04-16,2022-04-17,ATL,BOS,LA0NX0MC,PT2H34M,0,False,False,True,217.67,248.6,9,947.0,1650203940,2022-04-17T09:59:00.000-04:00,1650213180,2022-04-17T12:33:00.000-04:00,BOS,ATL,Delta,DL,Airbus A321,9240,947,coach,1,9240.0,30.93,Delta,,,,coach,,,,Airbus A321,,,


In [564]:
# Name of the column that is later split off from the features as `y`.
TARGET = 'totalFare'

## Remover colunas

Removemos as colunas com os segmentos concatenados por '||', pois os dados de segmento necessários já foram separados em colunas próprias no pré-processamento. Também removemos legId, fareBasisCode e travelDuration.

In [565]:
# Remove legId, fareBasisCode, travelDuration and every raw `segments*` column
# from df in place.
df.drop(
    columns=[
        'legId',
        'fareBasisCode',
        'travelDuration',
        'segmentsDepartureTimeEpochSeconds',
        'segmentsDepartureTimeRaw',
        'segmentsArrivalTimeEpochSeconds',
        'segmentsArrivalTimeRaw',
        'segmentsArrivalAirportCode',
        'segmentsDepartureAirportCode',
        'segmentsAirlineName',
        'segmentsAirlineCode',
        'segmentsEquipmentDescription',
        'segmentsDurationInSeconds',
        'segmentsDistance',
        'segmentsCabinCode',
    ],
    inplace=True,
)

In [566]:
df.head()

Unnamed: 0,searchDate,flightDate,startingAirport,destinationAirport,elapsedDays,isBasicEconomy,isRefundable,isNonStop,baseFare,totalFare,seatsRemaining,totalTravelDistance,daysDifferenceBetweenSearchAndFlightDate,travelDurationInSeconds,taxesFare,segment1AirlineName,segment2AirlineName,segment3AirlineName,segment4AirlineName,segment1CabinCode,segment2CabinCode,segment3CabinCode,segment4CabinCode,segment1EquipmentDescription,segment2EquipmentDescription,segment3EquipmentDescription,segment4EquipmentDescription
0,2022-04-16,2022-04-17,ATL,BOS,0,False,False,True,217.67,248.6,9,947.0,1,8940.0,30.93,Delta,,,,coach,,,,Airbus A321,,,
1,2022-04-16,2022-04-17,ATL,BOS,0,False,False,True,217.67,248.6,4,947.0,1,9000.0,30.93,Delta,,,,coach,,,,Airbus A321,,,
2,2022-04-16,2022-04-17,ATL,BOS,0,False,False,True,217.67,248.6,9,947.0,1,9000.0,30.93,Delta,,,,coach,,,,Boeing 757-200,,,
3,2022-04-16,2022-04-17,ATL,BOS,0,False,False,True,217.67,248.6,8,947.0,1,9120.0,30.93,Delta,,,,coach,,,,Airbus A321,,,
4,2022-04-16,2022-04-17,ATL,BOS,0,False,False,True,217.67,248.6,9,947.0,1,9240.0,30.93,Delta,,,,coach,,,,Airbus A321,,,


In [567]:
df.dtypes

searchDate                                   object
flightDate                                   object
startingAirport                              object
destinationAirport                           object
elapsedDays                                   int64
isBasicEconomy                                 bool
isRefundable                                   bool
isNonStop                                      bool
baseFare                                    float64
totalFare                                   float64
seatsRemaining                                int64
totalTravelDistance                         float64
daysDifferenceBetweenSearchAndFlightDate      int64
travelDurationInSeconds                     float64
taxesFare                                   float64
segment1AirlineName                          object
segment2AirlineName                          object
segment3AirlineName                          object
segment4AirlineName                          object
segment1Cabi

## Categorização

Na análise univariada, identificamos que os dados numéricos são múltiplos de 5. Por isso, usamos 5 como número de quantis (`q=5`) na categorização.

In [568]:
def create_labels(bins):
    """Build one text label per interval defined by consecutive bin edges.

    Each label has the form ``"<prefix>_(<left>_<right>]"``, where both edges
    are rounded to two decimals and ``<prefix>`` is a lowercase-letter counter
    (``a``, ``b``, ...), so the labels sort alphabetically in bin order.

    With up to 26 intervals the prefix is a single letter. With more, a
    fixed-width multi-letter prefix (``aa``, ``ab``, ...) is used instead of
    failing with ``IndexError``; the fixed width keeps the sort order intact.

    Parameters
    ----------
    bins : sequence of numbers
        Bin edges; ``len(bins) - 1`` labels are produced.

    Returns
    -------
    list of str
        One label per interval (empty when fewer than two edges are given).
    """
    alphabet = string.ascii_lowercase
    n_intervals = len(bins) - 1

    # Smallest prefix width able to number every interval.
    width = 1
    while len(alphabet) ** width < n_intervals:
        width += 1

    labels = []
    for i in range(n_intervals):
        # Write i in base 26 using exactly `width` letters (most significant first).
        digits = []
        remainder = i
        for _ in range(width):
            remainder, pos = divmod(remainder, len(alphabet))
            digits.append(alphabet[pos])
        prefix = "".join(reversed(digits))

        labels.append(f"{prefix}_({round(bins[i], 2)}_{round(bins[i+1], 2)}]")

    return labels

def categorize_column(data, column, q=5, bins=None, suffix="cat", skip_columns=("elapsedDays",)):
    """Add a labelled, binned version of a numeric column to ``data``.

    The new column is named ``f"{column}_{suffix}"`` and holds the labels built
    by ``create_labels``. ``data`` is modified in place and also returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains ``column``; receives the new column.
    column : str
        Name of the numeric column to bin.
    q : int, default 5
        Number of quantiles passed to ``pd.qcut`` when ``bins`` is not given.
    bins : sequence of numbers, optional
        Explicit bin edges. When omitted, quantile edges are computed and the
        outermost edges are widened to -inf/+inf.
    suffix : str, default "cat"
        Suffix of the new column name.
    skip_columns : collection of str, default ("elapsedDays",)
        Columns that are returned untouched. The default reproduces the
        previously hard-coded exclusion of ``'elapsedDays'``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object.
    """
    # A bare string would turn `in` into a substring test; treat it as one name.
    if isinstance(skip_columns, str):
        skip_columns = (skip_columns,)

    if column in skip_columns:
        return data

    if bins is None:
        _, bins = pd.qcut(data[column], q=q, retbins=True, duplicates='drop')
        # Open the outer edges so values outside the observed range still get a bin.
        bins[0] = -np.inf
        bins[-1] = np.inf

    labels = create_labels(bins)
    binned = pd.cut(data[column], bins=bins, labels=labels)

    # Convert to object rather than str: astype(str) would turn missing values
    # into the literal category 'nan', hiding them from later missing-value
    # handling (e.g. fillna).
    data[f"{column}_{suffix}"] = binned.astype(object)

    return data

In [569]:
# Every numeric column except the target and the fare components gets a
# quantile-based `_cat` column.
numeric_cols = df.select_dtypes(include='number').columns.tolist()
for excluded in (TARGET, 'baseFare', 'taxesFare'):
    numeric_cols.remove(excluded)

for col in numeric_cols:
    categorize_column(df, col)

In [570]:
df.head()

Unnamed: 0,searchDate,flightDate,startingAirport,destinationAirport,elapsedDays,isBasicEconomy,isRefundable,isNonStop,baseFare,totalFare,seatsRemaining,totalTravelDistance,daysDifferenceBetweenSearchAndFlightDate,travelDurationInSeconds,taxesFare,segment1AirlineName,segment2AirlineName,segment3AirlineName,segment4AirlineName,segment1CabinCode,segment2CabinCode,segment3CabinCode,segment4CabinCode,segment1EquipmentDescription,segment2EquipmentDescription,segment3EquipmentDescription,segment4EquipmentDescription,seatsRemaining_cat,totalTravelDistance_cat,daysDifferenceBetweenSearchAndFlightDate_cat,travelDurationInSeconds_cat
0,2022-04-16,2022-04-17,ATL,BOS,0,False,False,True,217.67,248.6,9,947.0,1,8940.0,30.93,Delta,,,,coach,,,,Airbus A321,,,,e_(8.0_inf],b_(776.0_1238.0],a_(-inf_4.0],a_(-inf_13020.0]
1,2022-04-16,2022-04-17,ATL,BOS,0,False,False,True,217.67,248.6,4,947.0,1,9000.0,30.93,Delta,,,,coach,,,,Airbus A321,,,,b_(2.0_4.0],b_(776.0_1238.0],a_(-inf_4.0],a_(-inf_13020.0]
2,2022-04-16,2022-04-17,ATL,BOS,0,False,False,True,217.67,248.6,9,947.0,1,9000.0,30.93,Delta,,,,coach,,,,Boeing 757-200,,,,e_(8.0_inf],b_(776.0_1238.0],a_(-inf_4.0],a_(-inf_13020.0]
3,2022-04-16,2022-04-17,ATL,BOS,0,False,False,True,217.67,248.6,8,947.0,1,9120.0,30.93,Delta,,,,coach,,,,Airbus A321,,,,d_(7.0_8.0],b_(776.0_1238.0],a_(-inf_4.0],a_(-inf_13020.0]
4,2022-04-16,2022-04-17,ATL,BOS,0,False,False,True,217.67,248.6,9,947.0,1,9240.0,30.93,Delta,,,,coach,,,,Airbus A321,,,,e_(8.0_inf],b_(776.0_1238.0],a_(-inf_4.0],a_(-inf_13020.0]


In [571]:
df.dtypes

searchDate                                       object
flightDate                                       object
startingAirport                                  object
destinationAirport                               object
elapsedDays                                       int64
isBasicEconomy                                     bool
isRefundable                                       bool
isNonStop                                          bool
baseFare                                        float64
totalFare                                       float64
seatsRemaining                                    int64
totalTravelDistance                             float64
daysDifferenceBetweenSearchAndFlightDate          int64
travelDurationInSeconds                         float64
taxesFare                                       float64
segment1AirlineName                              object
segment2AirlineName                              object
segment3AirlineName                             

# Feature Engineering

Abaixo, definimos os bins manualmente, com os limites que consideramos mais adequados para o modelo.

Na coluna daysDifferenceBetweenSearchAndFlightDate, utilizamos 7 limites de bins (6 intervalos).

In [572]:
# Show the quantile-based labels and the observed min/max of the column.
print(df['daysDifferenceBetweenSearchAndFlightDate_cat'].sort_values().unique())
print(df['daysDifferenceBetweenSearchAndFlightDate'].min())
print(df['daysDifferenceBetweenSearchAndFlightDate'].max())

['a_(-inf_4.0]' 'b_(4.0_7.0]' 'c_(7.0_11.0]' 'd_(11.0_17.0]'
 'e_(17.0_inf]']
1
24


In [573]:
# Manual edges every 4 days; the outer edges are infinite so no value falls outside.
bins = [-np.inf, 4, 8, 12, 16, 20, np.inf]
# Adds 'daysDifferenceBetweenSearchAndFlightDate_mcat' to df in place; the returned frame is discarded.
_ = categorize_column(df, 'daysDifferenceBetweenSearchAndFlightDate', bins=bins, suffix='mcat')

Para seatsRemaining, utilizamos 6 limites de bins (5 intervalos).

In [574]:
# Show the quantile-based labels and the observed min/max of the column.
print(df['seatsRemaining_cat'].sort_values().unique())
print(df['seatsRemaining'].min())
print(df['seatsRemaining'].max())

['a_(-inf_2.0]' 'b_(2.0_4.0]' 'c_(4.0_7.0]' 'd_(7.0_8.0]' 'e_(8.0_inf]']
0
10


In [575]:
# Manual edges every 2 seats; the outer edges are infinite so no value falls outside.
bins = [-np.inf, 2, 4, 6, 8, np.inf]
# Adds 'seatsRemaining_mcat' to df in place; the returned frame is discarded.
_ = categorize_column(df, 'seatsRemaining', bins=bins, suffix='mcat')

Para totalTravelDistance, utilizamos 7 bins de mesma largura entre o mínimo e o máximo observados, obtendo intervalos menores para os dados.

In [576]:
# Show the quantile-based labels and the observed min/max of the column.
print(df['totalTravelDistance_cat'].sort_values().unique())
print(df['totalTravelDistance'].min())
print(df['totalTravelDistance'].max())

['a_(-inf_776.0]' 'b_(776.0_1238.0]' 'c_(1238.0_1702.0]'
 'd_(1702.0_2469.0]' 'e_(2469.0_inf]' np.str_('nan')]
97.0
3958.0


In [577]:
# Seven equal-width intervals between the observed min (97.0) and max (3958.0),
# with edges rounded to two decimals, plus an open-ended bin on each side.
# The first bin, (-inf, 97.0], can only hold values equal to the minimum.
bins = [
    -np.inf,
    97.0,
    648.57,
    1200.14,
    1751.71,
    2303.29,
    2854.86,
    3406.43,
    3958.0,
    np.inf
]
# Adds 'totalTravelDistance_mcat' to df in place; the returned frame is discarded.
_ = categorize_column(data=df, column='totalTravelDistance', bins=bins, suffix='mcat')

E também, 7 bins para travelDurationInSeconds.

In [578]:
# Show the quantile-based labels and the observed min/max of the column.
print(df['travelDurationInSeconds_cat'].sort_values().unique())
print(df['travelDurationInSeconds'].min())
print(df['travelDurationInSeconds'].max())

['a_(-inf_13020.0]' 'b_(13020.0_21120.0]' 'c_(21120.0_27180.0]'
 'd_(27180.0_35340.0]' 'e_(35340.0_inf]']
2760.0
96240.0


In [579]:
# Seven equal-width intervals between the observed min (2760.0) and max (96240.0),
# with edges rounded to one decimal, plus an open-ended bin on each side.
# The first bin, (-inf, 2760.0], can only hold values equal to the minimum.
bins = [
    -np.inf,
    2760.0,
    16114.3,
    29468.6,
    42822.9,
    56177.1,
    69531.4,
    82885.7,
    96240.0,
    np.inf
]
# Adds 'travelDurationInSeconds_mcat' to df in place; the returned frame is discarded.
_ = categorize_column(data=df, column='travelDurationInSeconds', bins=bins, suffix='mcat')

In [580]:
df.head()

Unnamed: 0,searchDate,flightDate,startingAirport,destinationAirport,elapsedDays,isBasicEconomy,isRefundable,isNonStop,baseFare,totalFare,seatsRemaining,totalTravelDistance,daysDifferenceBetweenSearchAndFlightDate,travelDurationInSeconds,taxesFare,segment1AirlineName,segment2AirlineName,segment3AirlineName,segment4AirlineName,segment1CabinCode,segment2CabinCode,segment3CabinCode,segment4CabinCode,segment1EquipmentDescription,segment2EquipmentDescription,segment3EquipmentDescription,segment4EquipmentDescription,seatsRemaining_cat,totalTravelDistance_cat,daysDifferenceBetweenSearchAndFlightDate_cat,travelDurationInSeconds_cat,daysDifferenceBetweenSearchAndFlightDate_mcat,seatsRemaining_mcat,totalTravelDistance_mcat,travelDurationInSeconds_mcat
0,2022-04-16,2022-04-17,ATL,BOS,0,False,False,True,217.67,248.6,9,947.0,1,8940.0,30.93,Delta,,,,coach,,,,Airbus A321,,,,e_(8.0_inf],b_(776.0_1238.0],a_(-inf_4.0],a_(-inf_13020.0],a_(-inf_4],e_(8_inf],c_(648.57_1200.14],b_(2760.0_16114.3]
1,2022-04-16,2022-04-17,ATL,BOS,0,False,False,True,217.67,248.6,4,947.0,1,9000.0,30.93,Delta,,,,coach,,,,Airbus A321,,,,b_(2.0_4.0],b_(776.0_1238.0],a_(-inf_4.0],a_(-inf_13020.0],a_(-inf_4],b_(2_4],c_(648.57_1200.14],b_(2760.0_16114.3]
2,2022-04-16,2022-04-17,ATL,BOS,0,False,False,True,217.67,248.6,9,947.0,1,9000.0,30.93,Delta,,,,coach,,,,Boeing 757-200,,,,e_(8.0_inf],b_(776.0_1238.0],a_(-inf_4.0],a_(-inf_13020.0],a_(-inf_4],e_(8_inf],c_(648.57_1200.14],b_(2760.0_16114.3]
3,2022-04-16,2022-04-17,ATL,BOS,0,False,False,True,217.67,248.6,8,947.0,1,9120.0,30.93,Delta,,,,coach,,,,Airbus A321,,,,d_(7.0_8.0],b_(776.0_1238.0],a_(-inf_4.0],a_(-inf_13020.0],a_(-inf_4],d_(6_8],c_(648.57_1200.14],b_(2760.0_16114.3]
4,2022-04-16,2022-04-17,ATL,BOS,0,False,False,True,217.67,248.6,9,947.0,1,9240.0,30.93,Delta,,,,coach,,,,Airbus A321,,,,e_(8.0_inf],b_(776.0_1238.0],a_(-inf_4.0],a_(-inf_13020.0],a_(-inf_4],e_(8_inf],c_(648.57_1200.14],b_(2760.0_16114.3]


## Feature Selection

In [581]:
df.columns

Index(['searchDate', 'flightDate', 'startingAirport', 'destinationAirport',
       'elapsedDays', 'isBasicEconomy', 'isRefundable', 'isNonStop',
       'baseFare', 'totalFare', 'seatsRemaining', 'totalTravelDistance',
       'daysDifferenceBetweenSearchAndFlightDate', 'travelDurationInSeconds',
       'taxesFare', 'segment1AirlineName', 'segment2AirlineName',
       'segment3AirlineName', 'segment4AirlineName', 'segment1CabinCode',
       'segment2CabinCode', 'segment3CabinCode', 'segment4CabinCode',
       'segment1EquipmentDescription', 'segment2EquipmentDescription',
       'segment3EquipmentDescription', 'segment4EquipmentDescription',
       'seatsRemaining_cat', 'totalTravelDistance_cat',
       'daysDifferenceBetweenSearchAndFlightDate_cat',
       'travelDurationInSeconds_cat',
       'daysDifferenceBetweenSearchAndFlightDate_mcat', 'seatsRemaining_mcat',
       'totalTravelDistance_mcat', 'travelDurationInSeconds_mcat'],
      dtype='object')

Selecionamos as colunas que podem ser adequadas para o modelo e excluímos aquelas cujos valores não possuem variação significativa ou que não fazem sentido para a análise do modelo.

In [582]:
# Columns kept for modelling, with TARGET as the last entry.
# Commented-out names are columns of df that are deliberately left out.
cols_model = [
    'searchDate',
    'flightDate',
    'startingAirport',
    'destinationAirport',
    'elapsedDays',
    'isBasicEconomy',
    # 'isRefundable',
    # 'isNonStop',
    # 'baseFare',
    # 'totalFare',
    'seatsRemaining',
    # 'totalTravelDistance',
    # 'daysDifferenceBetweenSearchAndFlightDate',
    # 'travelDurationInSeconds',
    # 'taxesFare',
    'segment1AirlineName',
    'segment2AirlineName',
    'segment3AirlineName',
    'segment4AirlineName',
    'segment1CabinCode',
    'segment2CabinCode',
    'segment3CabinCode',
    'segment4CabinCode',
    'segment1EquipmentDescription',
    'segment2EquipmentDescription',
    'segment3EquipmentDescription',
    'segment4EquipmentDescription',
    # 'seatsRemaining_cat',
    # 'totalTravelDistance_cat',
    # 'daysDifferenceBetweenSearchAndFlightDate_cat',
    # 'travelDurationInSeconds_cat',
    'daysDifferenceBetweenSearchAndFlightDate_mcat',
    # 'seatsRemaining_mcat',
    'totalTravelDistance_mcat',
    'travelDurationInSeconds_mcat',
    TARGET
]

In [583]:
# Take an explicit copy: `x` is mutated below (pop, column assignment), and
# without .copy() pandas treats it as a slice of df, which triggers
# SettingWithCopyWarning (currently hidden by the global warnings filter).
x = df[cols_model].copy()
# Split off the target; `x` keeps only the feature columns.
y = x.pop(TARGET)

## Encode

Abaixo identificamos as colunas que podem ser utilizadas no OrdinalEncoder.

In [584]:
# Columns passed to the OrdinalEncoder: airports, per-segment airline, cabin
# and equipment descriptors, and the manually binned `_mcat` columns.
oe_cat = [
    'startingAirport',
    'destinationAirport',
    'segment1AirlineName',
    'segment2AirlineName',
    'segment3AirlineName',
    'segment4AirlineName',
    'segment1CabinCode',
    'segment2CabinCode',
    'segment3CabinCode',
    'segment4CabinCode',
    'segment1EquipmentDescription',
    'segment2EquipmentDescription',
    'segment3EquipmentDescription',
    'segment4EquipmentDescription',
    'daysDifferenceBetweenSearchAndFlightDate_mcat',
    'totalTravelDistance_mcat',
    'travelDurationInSeconds_mcat'
]

In [585]:
# NOTE(review): default settings are used, so categories not seen during fit
# presumably raise at transform time; confirm this is acceptable.
oe_enc = OrdinalEncoder()
# Learn the category-to-integer mapping for each listed column.
oe_enc.fit(x[oe_cat])

In [586]:
# Replace the categorical columns with their ordinal codes; missing entries
# stay NaN here (see the displayed output below).
x[oe_cat] = oe_enc.transform(x[oe_cat])

Fizemos o mesmo para OneHotEncoder.

In [587]:
# Columns passed to the OneHotEncoder.
ohe_cat = [
    'isBasicEconomy'
]

In [588]:
# drop='if_binary' keeps a single output column for two-category features;
# handle_unknown='ignore' avoids errors on categories unseen during fit.
ohe_enc = OneHotEncoder(handle_unknown='ignore', drop='if_binary')
ohe_enc.fit(x[ohe_cat])

In [589]:
# transform() returns a sparse matrix; densify it before writing it back into x.
x[ohe_cat] = ohe_enc.transform(x[ohe_cat]).toarray()

In [590]:
x.head()

Unnamed: 0,searchDate,flightDate,startingAirport,destinationAirport,elapsedDays,isBasicEconomy,seatsRemaining,segment1AirlineName,segment2AirlineName,segment3AirlineName,segment4AirlineName,segment1CabinCode,segment2CabinCode,segment3CabinCode,segment4CabinCode,segment1EquipmentDescription,segment2EquipmentDescription,segment3EquipmentDescription,segment4EquipmentDescription,daysDifferenceBetweenSearchAndFlightDate_mcat,totalTravelDistance_mcat,travelDurationInSeconds_mcat
0,2022-04-16,2022-04-17,0.0,1.0,0,0.0,9,5.0,,,,1.0,,,,7.0,,,,0.0,2.0,1.0
1,2022-04-16,2022-04-17,0.0,1.0,0,0.0,4,5.0,,,,1.0,,,,7.0,,,,0.0,2.0,1.0
2,2022-04-16,2022-04-17,0.0,1.0,0,0.0,9,5.0,,,,1.0,,,,20.0,,,,0.0,2.0,1.0
3,2022-04-16,2022-04-17,0.0,1.0,0,0.0,8,5.0,,,,1.0,,,,7.0,,,,0.0,2.0,1.0
4,2022-04-16,2022-04-17,0.0,1.0,0,0.0,9,5.0,,,,1.0,,,,7.0,,,,0.0,2.0,1.0


# Salvar dados

In [591]:
# Re-read the preprocessed file, replacing the `df` mutated in the sections above.
df = pd.read_csv('../data/arquivo_preprocessado.csv')
# Cell output: (number of rows, number of columns).
df.shape

(299999, 42)

In [592]:
# Same target column name as defined earlier in the notebook.
TARGET = 'totalFare'

Readicionamos as colunas categóricas.

In [593]:
# Recreate the three `_mcat` columns on the freshly loaded frame, using the
# same manual edges as in the Feature Engineering section.
# Each call adds its column to df in place; the returned frame is discarded.
bins = [-np.inf, 4, 8, 12, 16, 20, np.inf]
_ = categorize_column(df, 'daysDifferenceBetweenSearchAndFlightDate', bins=bins, suffix='mcat')

# totalTravelDistance edges.
bins = [
    -np.inf,
    97.0,
    648.57,
    1200.14,
    1751.71,
    2303.29,
    2854.86,
    3406.43,
    3958.0,
    np.inf
]
_ = categorize_column(data=df, column='totalTravelDistance', bins=bins, suffix='mcat')

# travelDurationInSeconds edges.
bins = [
    -np.inf,
    2760.0,
    16114.3,
    29468.6,
    42822.9,
    56177.1,
    69531.4,
    82885.7,
    96240.0,
    np.inf
]
_ = categorize_column(data=df, column='travelDurationInSeconds', bins=bins, suffix='mcat')

In [594]:
# Columns written to the exported file, with TARGET as the last entry.
# searchDate and flightDate are kept but are not in either encoder list below.
cols_model = [
    'searchDate',
    'flightDate',
    'startingAirport',
    'destinationAirport',
    'elapsedDays',
    'isBasicEconomy',
    'seatsRemaining',
    'segment1AirlineName',
    'segment2AirlineName',
    'segment3AirlineName',
    'segment4AirlineName',
    'segment1CabinCode',
    'segment2CabinCode',
    'segment3CabinCode',
    'segment4CabinCode',
    'segment1EquipmentDescription',
    'segment2EquipmentDescription',
    'segment3EquipmentDescription',
    'segment4EquipmentDescription',
    'daysDifferenceBetweenSearchAndFlightDate_mcat',
    'totalTravelDistance_mcat',
    'travelDurationInSeconds_mcat',
    TARGET
]

In [595]:
# Keep only the modelling columns. The explicit copy matters because df is
# modified in place afterwards (column assignment, drop(inplace=True)); without
# it pandas treats df as a slice and emits SettingWithCopyWarning, which the
# global warnings filter currently hides.
df = df[cols_model].copy()

In [596]:
# Columns passed to the OneHotEncoder.
ohe_cat = [
    'isBasicEconomy'
]

# Columns passed to the OrdinalEncoder.
oe_cat = [
    'startingAirport',
    'destinationAirport',
    'segment1AirlineName',
    'segment2AirlineName',
    'segment3AirlineName',
    'segment4AirlineName',
    'segment1CabinCode',
    'segment2CabinCode',
    'segment3CabinCode',
    'segment4CabinCode',
    'segment1EquipmentDescription',
    'segment2EquipmentDescription',
    'segment3EquipmentDescription',
    'segment4EquipmentDescription',
    'daysDifferenceBetweenSearchAndFlightDate_mcat',
    'totalTravelDistance_mcat',
    'travelDurationInSeconds_mcat'
]

In [597]:
# This fitted encoder is the one pickled at the end of the notebook.
# NOTE(review): default settings are used, so categories not seen during fit
# presumably raise at transform time; confirm this is acceptable for reuse.
oe_enc = OrdinalEncoder()
oe_enc.fit(df[oe_cat])

In [598]:
# Replace the categorical columns with their ordinal codes.
df[oe_cat] = oe_enc.transform(df[oe_cat])
# Missing entries come back as NaN; mark them with -1.0 instead.
df[oe_cat] = df[oe_cat].fillna(-1.0)

In [599]:
# This fitted encoder is the one pickled at the end of the notebook.
# drop='if_binary' keeps a single output column for two-category features.
ohe_enc = OneHotEncoder(handle_unknown='ignore', drop='if_binary')
ohe_enc.fit(df[ohe_cat])

Criamos as colunas codificadas.

In [600]:
def create_encoded_cols(columns, encoder):
    """Build the output column names of a fitted one-hot style encoder.

    Names have the form ``f"{column}_{category}"``, listed feature by feature
    in the order of ``encoder.categories_``.

    When the encoder exposes ``drop_idx_`` (as a fitted scikit-learn
    ``OneHotEncoder`` does), the category it drops for each feature is left
    out, so the names match the transformed columns for any ``drop`` setting.
    For encoders without that attribute the previous heuristic is kept: a
    feature with exactly two categories yields a single name, built from the
    second category.

    Parameters
    ----------
    columns : sequence of str
        Input feature names, in the order the encoder was fitted on.
    encoder : object
        Fitted encoder with a ``categories_`` attribute.

    Returns
    -------
    list of str
        Encoded column names.
    """
    # Sentinel to tell "attribute missing" apart from "drop_idx_ is None".
    unset = object()
    drop_idx = getattr(encoder, "drop_idx_", unset)

    encoded_cols = []

    for i, col in enumerate(columns):
        categories = list(encoder.categories_[i])

        if drop_idx is unset:
            # Legacy behaviour: assume binary features were collapsed to the
            # second category.
            if len(categories) == 2:
                categories = categories[1:]
        elif drop_idx is not None and drop_idx[i] is not None:
            # The encoder removed exactly this category from the feature.
            del categories[int(drop_idx[i])]
        # drop_idx is None -> nothing dropped; keep every category.

        encoded_cols.extend(f"{col}_{category}" for category in categories)

    return encoded_cols

In [601]:
# Names for the one-hot output columns (here: 'isBasicEconomy_True', as shown
# in the frame displayed below).
ohe_cols = create_encoded_cols(ohe_cat, ohe_enc)
# transform() returns a sparse matrix; densify it before adding the columns.
df[ohe_cols] = ohe_enc.transform(df[ohe_cat]).toarray()

In [602]:
# The original columns are no longer needed once their one-hot versions exist.
df.drop(columns=ohe_cat, inplace=True)

In [603]:
df.head()

Unnamed: 0,searchDate,flightDate,startingAirport,destinationAirport,elapsedDays,seatsRemaining,segment1AirlineName,segment2AirlineName,segment3AirlineName,segment4AirlineName,segment1CabinCode,segment2CabinCode,segment3CabinCode,segment4CabinCode,segment1EquipmentDescription,segment2EquipmentDescription,segment3EquipmentDescription,segment4EquipmentDescription,daysDifferenceBetweenSearchAndFlightDate_mcat,totalTravelDistance_mcat,travelDurationInSeconds_mcat,totalFare,isBasicEconomy_True
0,2022-04-16,2022-04-17,0.0,1.0,0,9,5.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,7.0,-1.0,-1.0,-1.0,0.0,2.0,1.0,248.6,0.0
1,2022-04-16,2022-04-17,0.0,1.0,0,4,5.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,7.0,-1.0,-1.0,-1.0,0.0,2.0,1.0,248.6,0.0
2,2022-04-16,2022-04-17,0.0,1.0,0,9,5.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,20.0,-1.0,-1.0,-1.0,0.0,2.0,1.0,248.6,0.0
3,2022-04-16,2022-04-17,0.0,1.0,0,8,5.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,7.0,-1.0,-1.0,-1.0,0.0,2.0,1.0,248.6,0.0
4,2022-04-16,2022-04-17,0.0,1.0,0,9,5.0,-1.0,-1.0,-1.0,1.0,-1.0,-1.0,-1.0,7.0,-1.0,-1.0,-1.0,0.0,2.0,1.0,248.6,0.0


In [604]:
# Write the encoded frame to disk without the row index.
df.to_csv('../data/arquivo_fe.csv', index=False)

In [605]:
import pickle

# Persist both fitted encoders so they can be loaded again outside this notebook.
for pickle_path, fitted_encoder in (
    ('../pkls/oe.pickle', oe_enc),
    ('../pkls/ohe.pickle', ohe_enc),
):
    with open(pickle_path, 'wb') as handle:
        pickle.dump(fitted_encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)