# Introdução à Ciência dos Dados - Atividade 3

<span style="position: absolute; top: 10px; right: 10px; background: green; padding: 0.5em; color: white; border-radius: 0.25em; font-weight: bold">Vaux Gomes</span>

#### Bibliotecas

In [1]:
# Imports
import numpy as np
import pandas as pd

# Regression
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

# Regression metrics
from sklearn.metrics import mean_squared_error as mse

# Data transformation
from sklearn.preprocessing import OneHotEncoder
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler


# Tuning & Split
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Others
import time

import warnings
warnings.filterwarnings('ignore')

from tabulate import tabulate

In [2]:
# Timed Experiment
def timed_experiment(clf, X, y):
  """Fit ``clf`` on ``(X, y)``, print how long the fit took, and return ``clf``.

  Args:
    clf: Any object exposing a ``fit(X, y)`` method.
    X: Training features, passed to ``clf.fit`` unchanged.
    y: Training targets, passed to ``clf.fit`` unchanged.

  Returns:
    The same ``clf`` object that was passed in (the return value of
    ``clf.fit`` is ignored).
  """
  # perf_counter is monotonic and high-resolution, so the measured interval
  # cannot be distorted by system clock adjustments the way time.time() can.
  start = time.perf_counter()
  clf.fit(X, y)
  elapsed = time.perf_counter() - start

  print(f'Experiment time: {elapsed:.2f}')

  return clf

#### Loading

In [3]:
# Read the CSV dataset into a DataFrame.
df = pd.read_csv('../data/iphone-olx-ce-ft-eng.csv')

# Report the dimensions and the column labels of the loaded frame.
print(f'Shape: {df.shape}', f'Columns: {df.columns}', sep='\n')

Shape: (1194, 20)
Columns: Index(['adDate', 'region', 'zipcode', 'areaCode', 'category', 'paidPromotion',
       'subject', 'eletronicsModel', 'storage', 'color', 'batteryHealth',
       'eletronicsCondition', 'price', 'pictures', 'sellerName', 'storageGB',
       'subjectLength', 'picturesPerc', 'modelOrder', 'macroRegion'],
      dtype='object')


#### Preparação das colunas

In [4]:
# Columns discarded before modelling.
dropped_columns = [
    'adDate', 'region', 'zipcode', 'areaCode', 'category',
    'subject', 'storage', 'sellerName', 'picturesPerc', 'modelOrder',
]

# Columns expanded into one indicator column per distinct value.
dummy_columns = [
    'paidPromotion', 'eletronicsModel', 'color',
    'batteryHealth', 'eletronicsCondition', 'macroRegion',
]

df = df.drop(columns=dropped_columns)

# Keep a copy taken before encoding; later cells read the original
# 'eletronicsModel' column from it.
df_ = df.copy()

df = pd.get_dummies(df, columns=dummy_columns)

df.columns

Index(['price', 'pictures', 'storageGB', 'subjectLength',
       'paidPromotion_False', 'paidPromotion_True',
       'eletronicsModel_IPHONE 11', 'eletronicsModel_IPHONE 11 PRO',
       'eletronicsModel_IPHONE 11 PRO MAX', 'eletronicsModel_IPHONE 12',
       'eletronicsModel_IPHONE 12 MINI', 'eletronicsModel_IPHONE 12 PRO',
       'eletronicsModel_IPHONE 12 PRO MAX', 'eletronicsModel_IPHONE 13',
       'eletronicsModel_IPHONE 13 MINI', 'eletronicsModel_IPHONE 13 PRO',
       'eletronicsModel_IPHONE 13 PRO MAX', 'eletronicsModel_IPHONE 14',
       'eletronicsModel_IPHONE 14 PLUS', 'eletronicsModel_IPHONE 14 PRO',
       'eletronicsModel_IPHONE 14 PRO MAX', 'eletronicsModel_IPHONE 6',
       'eletronicsModel_IPHONE 6S', 'eletronicsModel_IPHONE 6S PLUS',
       'eletronicsModel_IPHONE 7', 'eletronicsModel_IPHONE 7 PLUS',
       'eletronicsModel_IPHONE 8', 'eletronicsModel_IPHONE 8 PLUS',
       'eletronicsModel_IPHONE SE 2020', 'eletronicsModel_IPHONE X',
       'eletronicsModel_IPHONE XR

## Tarefa de Regressão

In [5]:
# Target vector: the 'price' column.
y = df['price'].values

# Feature matrix: every column except the target.
X = df.drop(columns=['price']).values

print(X.shape, y.shape)

(1194, 60) (1194,)


#### Split inicial

- 70% Training
- 30% Testing

In [6]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

print(
    f'Train X: {X_train.shape}, y: {y_train.shape}',
    f' Test X: {X_test.shape}, y: {y_test.shape}',
    sep='\n',
)

Train X: (835, 60), y: (835,)
 Test X: (359, 60), y: (359,)


#### Linear Regression (LR)

In [7]:
# Baseline: LinearRegression is fitted with its defaults, so no
# hyperparameter search is run for it.
lr = LinearRegression()
lr = timed_experiment(lr, X_train, y_train)
lr_pred = lr.predict(X_test)

# Root mean squared error on the held-out test set. The square root is taken
# explicitly because the `squared=False` keyword of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6.
lr_mse = np.sqrt(mse(y_test, lr_pred))

Experiment time: 0.01


#### Decision Tree Regressor (DTR)

In [None]:
# Grid search over the decision-tree hyperparameters using 10-fold
# cross-validation; the best configuration is refitted on the full training set.
dtr = GridSearchCV(
  # Seeded so that the 'random' splitter (and tie-breaking in 'best') gives
  # the same tree on every run.
  estimator=DecisionTreeRegressor(random_state=42),
  param_grid={
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 2, 5, 10]
  },
  cv=10,
  refit=True,
  n_jobs=4,
  scoring='neg_mean_squared_error'
)

dtr = timed_experiment(dtr, X_train, y_train)
dtr_pred = dtr.predict(X_test)

# Root mean squared error on the held-out test set. The square root is taken
# explicitly because the `squared=False` keyword of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6.
dtr_mse = np.sqrt(mse(y_test, dtr_pred))

#### Support Vector Regression (SVR)

In [None]:
# Grid search over the SVR hyperparameters using 10-fold cross-validation;
# the best configuration is refitted on the full training set.
svr = GridSearchCV(
  estimator=SVR(),
  param_grid={
    # Powers of ten written as float literals. In Python `^` is bitwise XOR,
    # so `10^-3`, `10^-1`, `10^3`, `10^5` evaluate to -9, -11, 9 and 15
    # rather than 0.001, 0.1, 1000 and 100000.
    'gamma': ['auto', 'scale', 1e-3, 1e-1, 1e1, 1e3, 1e5],
    'C': [1, 2, 3, 4, 5]
  },
  cv=10,
  refit=True,
  n_jobs=4,
  scoring='neg_mean_squared_error'
)

svr = timed_experiment(svr, X_train, y_train)
svr_pred = svr.predict(X_test)

# Root mean squared error on the held-out test set. The square root is taken
# explicitly because the `squared=False` keyword of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6.
svr_mse = np.sqrt(mse(y_test, svr_pred))

### Resultados I

In [None]:
# Test-set RMSE of the three models trained on the original training split.
print(
    'Root Mean Squared Error',
    '-' * 40,
    f'LR:  {lr_mse:.2f}',
    f'DTR: {dtr_mse:.2f}',
    f'SVR: {svr_mse:.2f}',
    sep='\n',
)

---

### Undersampling

#### Split

In [None]:
# Append the price as the last column so it is carried along with each row
# when the rows are resampled.
X_with_price = np.hstack((X, y.reshape(-1, 1)))

# Integer code of each phone model; used as the label array for this split.
model_codes = pd.Categorical(df_['eletronicsModel']).codes

# Same test_size and random_state as the initial split.
X_train, X_test, y_train, y_test = train_test_split(
    X_with_price, model_codes, test_size=0.3, random_state=42
)

print(
    f'Train X: {X_train.shape}, y: {y_train.shape}',
    f' Test X: {X_test.shape}, y: {y_test.shape}',
    sep='\n',
)

#### Undersampling

In [None]:
# Undersample the training rows, using the phone-model codes as class labels.
rus = RandomUnderSampler(random_state=42)

# The preceding split stores the model codes in `y_train`; the previously
# referenced `y_train_` is never defined and raised a NameError.
# The resampled labels are not needed: the price travels as the last column
# of X_train and is extracted in the next cell.
X_under, _ = rus.fit_resample(X_train, y_train)

In [None]:
# The last column is the price that was appended before the split: peel it
# off as the regression target and keep the remaining columns as features.
X_train, y_train = X_under[:, :-1], X_under[:, -1]

# Same separation for the (non-resampled) test rows. The right-hand side is
# evaluated before X_test is rebound, so both slices read the full matrix.
X_test, y_test = X_test[:, :-1], X_test[:, -1]

#### Linear Regression (LR)

In [None]:
# LinearRegression is fitted with its defaults, so no hyperparameter search
# is run for it.
lr = LinearRegression()
lr = timed_experiment(lr, X_train, y_train)
lr_pred_under = lr.predict(X_test)

# Root mean squared error on the test set. The square root is taken
# explicitly because the `squared=False` keyword of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6.
lr_mse_under = np.sqrt(mse(y_test, lr_pred_under))

#### Decision Tree Regressor (DTR)

In [None]:
# Grid search over the decision-tree hyperparameters using 10-fold
# cross-validation; the best configuration is refitted on the full training set.
dtr = GridSearchCV(
  # Seeded so that the 'random' splitter (and tie-breaking in 'best') gives
  # the same tree on every run.
  estimator=DecisionTreeRegressor(random_state=42),
  param_grid={
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 2, 5, 10]
  },
  cv=10,
  refit=True,
  n_jobs=4,
  scoring='neg_mean_squared_error'
)

dtr = timed_experiment(dtr, X_train, y_train)
dtr_pred_under = dtr.predict(X_test)

# Root mean squared error on the test set. The square root is taken
# explicitly because the `squared=False` keyword of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6.
dtr_mse_under = np.sqrt(mse(y_test, dtr_pred_under))

#### Support Vector Regression (SVR)

In [None]:
# Grid search over the SVR hyperparameters using 10-fold cross-validation;
# the best configuration is refitted on the full training set.
svr = GridSearchCV(
  estimator=SVR(),
  param_grid={
    # Powers of ten written as float literals. In Python `^` is bitwise XOR,
    # so `10^-3`, `10^-1`, `10^3`, `10^5` evaluate to -9, -11, 9 and 15
    # rather than 0.001, 0.1, 1000 and 100000.
    'gamma': ['auto', 'scale', 1e-3, 1e-1, 1e1, 1e3, 1e5],
    'C': [1, 2, 3, 4, 5]
  },
  cv=10,
  refit=True,
  n_jobs=4,
  scoring='neg_mean_squared_error'
)

svr = timed_experiment(svr, X_train, y_train)
svr_pred_under = svr.predict(X_test)

# Root mean squared error on the test set. The square root is taken
# explicitly because the `squared=False` keyword of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6.
svr_mse_under = np.sqrt(mse(y_test, svr_pred_under))

### Resultados II

In [None]:
# Undersampled RMSE per model, with the baseline RMSE in parentheses.
print(
    'Root Mean Squared Error',
    '-' * 40,
    f'LR:  {lr_mse_under:.2f} ({lr_mse:.2f})',
    f'DTR: {dtr_mse_under:.2f} ({dtr_mse:.2f})',
    f'SVR: {svr_mse_under:.2f} ({svr_mse:.2f})',
    sep='\n',
)

---

### Oversampling

#### Split

In [None]:
# Append the price as the last column so it is carried along with each row
# when the rows are resampled.
X_with_price = np.hstack((X, y.reshape(-1, 1)))

# Integer code of each phone model; used as the label array for this split.
model_codes = pd.Categorical(df_['eletronicsModel']).codes

# Same test_size and random_state as the initial split.
X_train, X_test, y_train, y_test = train_test_split(
    X_with_price, model_codes, test_size=0.3, random_state=42
)

print(
    f'Train X: {X_train.shape}, y: {y_train.shape}',
    f' Test X: {X_test.shape}, y: {y_test.shape}',
    sep='\n',
)

#### Oversampling

In [None]:
# Oversample the training rows, using the phone-model codes as class labels.
# With sampling_strategy='minority' only the least-frequent class is resampled.
# Seeded so the duplicated rows are the same on every run, in line with the
# other random_state=42 settings in this notebook.
ros = RandomOverSampler(sampling_strategy='minority', random_state=42)

# The preceding split stores the model codes in `y_train`; the previously
# referenced `y_train_` is never defined and raised a NameError.
# The resampled labels are not needed: the price travels as the last column
# of X_train and is extracted in the next cell.
X_over, _ = ros.fit_resample(X_train, y_train)

In [None]:
# The last column is the price that was appended before the split: peel it
# off as the regression target and keep the remaining columns as features.
X_train, y_train = X_over[:, :-1], X_over[:, -1]

# Same separation for the (non-resampled) test rows. The right-hand side is
# evaluated before X_test is rebound, so both slices read the full matrix.
X_test, y_test = X_test[:, :-1], X_test[:, -1]

#### Linear Regression (LR)

In [None]:
# LinearRegression is fitted with its defaults, so no hyperparameter search
# is run for it.
lr = LinearRegression()
lr = timed_experiment(lr, X_train, y_train)
lr_pred_over = lr.predict(X_test)

# Root mean squared error on the test set. The square root is taken
# explicitly because the `squared=False` keyword of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6.
lr_mse_over = np.sqrt(mse(y_test, lr_pred_over))

#### Decision Tree Regressor (DTR)

In [None]:
# Grid search over the decision-tree hyperparameters using 10-fold
# cross-validation; the best configuration is refitted on the full training set.
dtr = GridSearchCV(
  # Seeded so that the 'random' splitter (and tie-breaking in 'best') gives
  # the same tree on every run.
  estimator=DecisionTreeRegressor(random_state=42),
  param_grid={
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 2, 5, 10]
  },
  cv=10,
  refit=True,
  n_jobs=4,
  scoring='neg_mean_squared_error'
)

dtr = timed_experiment(dtr, X_train, y_train)
dtr_pred_over = dtr.predict(X_test)

# Root mean squared error on the test set. The square root is taken
# explicitly because the `squared=False` keyword of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6.
dtr_mse_over = np.sqrt(mse(y_test, dtr_pred_over))

#### Support Vector Regression (SVR)

In [None]:
# Grid search over the SVR hyperparameters using 10-fold cross-validation;
# the best configuration is refitted on the full training set.
svr = GridSearchCV(
  estimator=SVR(),
  param_grid={
    # Powers of ten written as float literals. In Python `^` is bitwise XOR,
    # so `10^-3`, `10^-1`, `10^3`, `10^5` evaluate to -9, -11, 9 and 15
    # rather than 0.001, 0.1, 1000 and 100000.
    'gamma': ['auto', 'scale', 1e-3, 1e-1, 1e1, 1e3, 1e5],
    'C': [1, 2, 3, 4, 5]
  },
  cv=10,
  refit=True,
  n_jobs=4,
  scoring='neg_mean_squared_error'
)

svr = timed_experiment(svr, X_train, y_train)
svr_pred_over = svr.predict(X_test)

# Root mean squared error on the test set. The square root is taken
# explicitly because the `squared=False` keyword of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6.
svr_mse_over = np.sqrt(mse(y_test, svr_pred_over))

### Resultados III

In [None]:
# Oversampled RMSE per model, with the baseline RMSE in parentheses.
print(
    'Root Mean Squared Error',
    '-' * 40,
    f'LR:  {lr_mse_over:.2f} ({lr_mse:.2f})',
    f'DTR: {dtr_mse_over:.2f} ({dtr_mse:.2f})',
    f'SVR: {svr_mse_over:.2f} ({svr_mse:.2f})',
    sep='\n',
)

---

In [None]:
# Summary of the test-set RMSE for every model under each sampling regime.
header = ["Algorithm", "Unbal.", "Under", "Over"]

# One entry per model: label followed by its baseline, undersampled and
# oversampled scores.
scores = [
    ("Linear Regression", lr_mse, lr_mse_under, lr_mse_over),
    ("Decision Tree", dtr_mse, dtr_mse_under, dtr_mse_over),
    ("Support Vector", svr_mse, svr_mse_under, svr_mse_over),
]

# Scores are pre-formatted to two decimals before being handed to tabulate.
table = [header] + [
    [label] + [f'{value:.2f}' for value in values]
    for label, *values in scores
]

print(tabulate(table, headers="firstrow"))