In [1]:
import warnings
# Silences every Python warning for the whole kernel session (no category or
# module filter is given), so deprecation and convergence warnings raised by
# the libraries used below will not be shown.
warnings.filterwarnings('ignore')

import matplotlib.pyplot as pl
import numpy as np
from numpy import array
import pandas as pd
import sklearn as sk

from data_clear import DataClear
from data_transform import DataTransform

from nn import NN
from regression import Regression
from sgd_regression import SGDRegression
from linear_svr_regression import LinearSVRRegression

%matplotlib inline
# Record the library versions used for this run, one per line:
# pandas first (0.23.4 when authored), then scikit-learn (0.20.1 when authored).
print(pd.__version__, sk.__version__, sep="\n")

Using TensorFlow backend.


0.23.4
0.20.1


In [None]:
# Helper instances shared by the cells below: one for loading/cleaning the
# raw data, one for feature transformation (dummies, PCA, split, scaling).
data_clear, data_transform = DataClear(), DataTransform()

In [None]:
# Cleaning pipeline: every step receives the frame returned by the previous
# step and rebinds `df`. It uses the `data_clear` instance created in the
# previous cell instead of building a second, identical one here.
df = data_clear.readData('data/data.csv')  # load the raw data
df = data_clear.clearNotBrazilianFlights(df)  # drop flights whose origin is not Brazil
df = data_clear.createColumns(df)  # create the new columns (seats, inter and passengers)
df = data_clear.clearData(df)  # remove bad records and outliers
df = data_clear.removeNotUsedColumns(df)  # drop the columns that will not be used
df = data_clear.renameColumsn(df)  # rename the columns
df = data_clear.convertTypes(df)  # convert the column types
df = data_clear.groupBy(df)  # group by year, month, origin, destination and inter

#### Exemplo dos dados

In [None]:
# Preview the first rows of the frame returned by the cleaning pipeline.
df.head()

#### Dados estatísticos

In [None]:
# Summary statistics of the cleaned frame (pandas `describe` defaults).
df.describe()

#### Gerando Dummies
Convertendo as colunas de texto em representação numérica

In [None]:
# Build the dummy-encoded frame from `df`. The encoding itself lives in
# DataTransform.getDummies (data_transform.py); presumably the text columns
# become 0/1 indicator columns such as ORIGEM_* / DESTINO_* -- confirm there.
df_dummies = data_transform.getDummies(df)

#### Dados com Dummies

In [None]:
# Sample of the encoded rows: year 2017, origin flagged as NORDESTE and
# destination flagged as AMÉRICA DO NORTE.
df_dummies.loc[
    lambda frame: (frame['ANO'] == 2017)
    & (frame['ORIGEM_NORDESTE'] == 1)
    & (frame['DESTINO_AMÉRICA DO NORTE'] == 1)
].head()

#### Aplicando PCA nos dados numéricos

In [None]:
# Project the encoded frame with DataTransform.aplyPCA and plot the result.
# The returned frame is expected to expose the columns 'x1' and 'x2', which
# are used as the scatter axes.
df_pca = data_transform.aplyPCA(df_dummies)
df_pca.plot(kind='scatter', x='x1', y='x2', figsize=(15, 10))

#### Dividindo os dados em treino e teste e realizando a normalização

In [None]:
# Split the encoded frame into train/test features and targets (the split
# rule lives in DataTransform.train_test_split).
X_train, X_test, y_train, y_test = data_transform.train_test_split(df_dummies)

# The normalizer is fitted with the training features only, then applied to
# the training partition first and the test partition second.
data_transform.fitNormalizeData(X_train)
X_scaled, X_test_scaled = map(data_transform.normalizeData, (X_train, X_test))

#### Treinar Regressão

In [None]:
# Run the project's Regression wrapper (regression.py) on the scaled
# train/test features and their targets, then report the `best_score` it
# exposes. NOTE(review): presumably `estimate` fits and selects the best
# model, stored as `best_clt` -- confirm in regression.py.
regression = Regression()
regression.estimate(X_scaled, X_test_scaled, y_train, y_test)
print("score {}".format(regression.best_score))

#### Treinar Regressão com SGD (Stochastic Gradient Descent)

In [None]:
# Run the project's SGDRegression wrapper (sgd_regression.py) with the same
# scaled inputs as the other models and report the `best_score` it exposes.
sgd_regression = SGDRegression()
sgd_regression.estimate(X_scaled, X_test_scaled, y_train, y_test)
print("score {}".format(sgd_regression.best_score))

#### Treinar Linear Support Vector Regression

In [None]:
# Run the project's LinearSVRRegression wrapper (linear_svr_regression.py)
# with the same scaled inputs as the other models and report the
# `best_score` it exposes.
linear_svr = LinearSVRRegression()
linear_svr.estimate(X_scaled, X_test_scaled, y_train, y_test)
print("score {}".format(linear_svr.best_score))

#### Estimativa

In [None]:
# A single hand-built sample (1 row x 14 features) used for the predictions
# below. NOTE(review): the values must follow the feature-column order of
# `df_dummies` without PASSAGEIROS (year first, then presumably month and the
# inter/origin/destination indicator columns) -- verify against the frame.
estimativa = np.array([
    [2017, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0],
])

In [None]:
# Pass the hand-built sample through the same `data_transform` instance whose
# normalizer was fitted with X_train, so it is scaled like the training
# features (presumably normalizeData reuses that fitted state -- confirm in
# data_transform.py).
estimativa_t = data_transform.normalizeData(estimativa)

In [None]:
# Show the prediction of each fitted wrapper's `best_clt` for the normalized
# sample, in the order: regression, SGD regression, linear SVR.
for fitted_wrapper in (regression, sgd_regression, linear_svr):
    display(fitted_wrapper.best_clt.predict(estimativa_t))

#### Neural Network

In [None]:
# Build the project's neural-network wrapper (nn.py) and its baseline model.
nn = NN()
nn.baseline_model()

# Target is the PASSAGEIROS column; every other column of the encoded frame
# is used as a feature. NOTE(review): unlike the scikit-learn models above,
# the network is fitted on the full, unscaled `df_dummies` rather than on the
# normalized X_train split -- confirm this is intended.
y = df_dummies['PASSAGEIROS']
X = df_dummies.drop('PASSAGEIROS', axis=1)

# NOTE(review): the positional values 1500, 32, 1 are presumably epochs,
# batch size and verbosity -- confirm against NN.fit in nn.py.
nn.fit(X, y, 1500, 32, 1)

In [None]:
# Predict with the network for the raw (non-normalized) sample; this matches
# the unscaled features the network was fitted on, which is why `estimativa`
# is used here instead of `estimativa_t`.
display(nn.predict(estimativa))