# Dependências

In [17]:
%matplotlib inline

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import BayesianRidge
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

## Carrega dados de treinamento

In [2]:
# Load the full training table. The `production` column is the regression
# target; every other column is used as an input feature.
data = pd.read_csv('data/data_train_fields.csv', sep=',')

x = np.array(data.drop('production', axis=1))
y = np.array(data['production'])

# Hold out 10% of the rows for validation; the fixed seed keeps the split repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    x, y, test_size=0.1, random_state=42
)

print(x_train.shape, x_val.shape, y_train.shape, y_val.shape)

(4718, 16) (525, 16) (4718,) (525,)


In [3]:
# Standardise the features: the scaler is fitted on the training split only
# and then applied unchanged to the validation split.
# NOTE: x_train / x_val are overwritten here, so re-running this cell without
# re-running the split cell would scale already-scaled data.
std = StandardScaler()
x_train = std.fit_transform(x_train)
x_val = std.transform(x_val)


In [4]:
### Regressão Linear

In [5]:
# Fit a linear regression on the training split and report its mean absolute
# error on the held-out validation split.
clf = LinearRegression().fit(x_train, y_train)
y_pred = clf.predict(x_val)

print(mean_absolute_error(y_true=y_val, y_pred=y_pred))

0.08893285602559807


In [7]:
# Score the test file with the model currently held in `clf`.
df_test = pd.read_csv('data/data_test_fields.csv', sep=",")

# Every column of the test file is fed to the scaler fitted on the training split.
x_test = std.transform(df_test.values)
y_pred = clf.predict(x_test)

# The predictions are min-max rescaled over the test set itself before being
# stored as the `production` column.
# NOTE(review): this changes the values relative to the raw predictions that
# were scored on the validation split — confirm this is intended.
df_test['production'] = MinMaxScaler().fit_transform(y_pred.reshape(-1, 1))

In [8]:
# Persist only the `Id` and `production` columns, without the row index.
df_test.to_csv(
    'data/submission_LinearRegression.csv',
    sep=",",
    columns=['Id', 'production'],
    index=False,
)

In [9]:
### Regressão Bayesiana

In [10]:
# Fit a Bayesian ridge regression (with score tracking enabled) on the training
# split and report its mean absolute error on the validation split.
clf = BayesianRidge(compute_score=True).fit(x_train, y_train)
y_pred = clf.predict(x_val)

print(mean_absolute_error(y_true=y_val, y_pred=y_pred))

0.08878819677050159


In [11]:
# Reload the test file and score it with the model currently held in `clf`.
df_test = pd.read_csv('data/data_test_fields.csv', sep=",")

# Reuse the scaler fitted on the training split; all test columns are passed through it.
x_test = std.transform(df_test.values)
y_pred = clf.predict(x_test)

# Store the predictions after min-max rescaling them over the test set itself.
# NOTE(review): the rescaled values differ from the raw predictions that were
# evaluated on the validation split — confirm this is intended.
df_test['production'] = MinMaxScaler().fit_transform(y_pred.reshape(-1, 1))

In [12]:
# Persist only the `Id` and `production` columns, without the row index.
df_test.to_csv(
    'data/submission_BayesianRegression.csv',
    sep=",",
    columns=['Id', 'production'],
    index=False,
)

In [13]:
### Regressão com Random Forest (todos os atributos)

In [14]:
# Random Forest fitted on the current training split, scored by mean absolute
# error on the validation split.
# random_state is fixed (same seed as the train/validation split) so that the
# forest, and therefore the printed MAE, is reproducible across re-runs.
clf = RandomForestRegressor(random_state=42)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_val)

print(mean_absolute_error(y_val, y_pred))

0.05109354305987547


In [15]:
### Seleção de atributos com RFE e Random Forest

In [18]:
# Recursive feature elimination driven by a Random Forest.
# The estimator is created here explicitly (and seeded) instead of reusing the
# global `clf`, which is reassigned to several different models in this
# notebook; this keeps the cell independent of execution order and makes the
# selected features reproducible.
selector = RFE(RandomForestRegressor(random_state=42))
selector = selector.fit(x_train, y_train)

# Boolean mask of the retained features, then the elimination ranking
# (rank 1 marks a selected feature).
print(selector.support_)
print(selector.ranking_)

[ True  True  True False False False False False  True  True  True  True
 False False False  True]
[1 1 1 3 4 8 6 9 1 1 1 1 7 5 2 1]


In [19]:
### Arquivo com atributos selecionados

In [20]:
# Load the reduced training table (tab-separated). As before, `production` is
# the target and the remaining columns are the input features.
data = pd.read_csv('data/data_train_RFE.csv', sep='\t')

x = np.array(data.drop('production', axis=1))
y = np.array(data['production'])

# Same 90/10 split and seed as used for the full feature set.
x_train, x_val, y_train, y_val = train_test_split(
    x, y, test_size=0.1, random_state=42
)

print(x_train.shape, x_val.shape, y_train.shape, y_val.shape)

(4718, 4) (525, 4) (4718,) (525,)


In [21]:
# Fit a fresh scaler on the reduced training split; this replaces the scaler
# that was fitted on the full feature set.
# NOTE: x_train / x_val are overwritten, so re-running this cell without
# re-running the split cell would scale already-scaled data.
std = StandardScaler().fit(x_train)
x_train, x_val = std.transform(x_train), std.transform(x_val)

In [22]:
# Random Forest fitted on the reduced feature set, scored by mean absolute
# error on the validation split.
# random_state is fixed (same seed as the train/validation split) so that the
# printed MAE and the predictions written later are reproducible across re-runs.
clf = RandomForestRegressor(random_state=42)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_val)

print(mean_absolute_error(y_val, y_pred))

0.0742306908964974


In [23]:
# Load the reduced test file (tab-separated) and score it with the model
# currently held in `clf`.
df_test = pd.read_csv('data/data_test_RFE.csv', sep="\t")

# All test columns go through the scaler fitted on the reduced training split.
x_test = std.transform(df_test.values)
y_pred = clf.predict(x_test)

# Store the predictions after min-max rescaling them over the test set itself.
# NOTE(review): the rescaled values differ from the raw predictions that were
# evaluated on the validation split — confirm this is intended.
df_test['production'] = MinMaxScaler().fit_transform(y_pred.reshape(-1, 1))

In [24]:
# Persist only the `Id` and `production` columns, without the row index.
df_test.to_csv(
    'data/submission_rfe.csv',
    sep=",",
    columns=['Id', 'production'],
    index=False,
)