# Introdução a Ciência dos Dados - Atividade 3

<span style="position: absolute; top: 10px; right: 10px; background: green; padding: 0.5em; color: white; border-radius: 0.25em; font-weight: bold">Vaux Gomes</span>

## Introdução

Para os dados que possuo duas tarefas são possíveis:
- Regressão: Predizer o valor dos aparelhos com base nos atributos
- Classificação: Predizer o modelo do aparelho com base nos atributos

Para cada uma das tarefas vamos usar 3 diferentes algoritmos e compará-los

#### Bibliotecas

In [1]:
# Imports
import numpy as np
import pandas as pd

# Regression
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

# Regression metrics
from sklearn.metrics import mean_squared_error as mse

# Classification
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Classification Metrics
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

# Data transformation
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
# from sklearn.preprocessing import StandardScaler

# Tunning & Split
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Others
import time
import warnings
warnings.filterwarnings('ignore')

# from tabulate import tabulate

In [2]:
# Timed Experiment
def timed_experiment(clf, X, y):
  start = time.time()
  clf.fit(X, y)
  end = time.time()

  print(f'Experiment time: {end - start:.2f}')
  
  return clf


#### Loading

In [3]:
df = pd.read_csv('../data/iphone-olx-ce-ft-eng.csv')

print(f'Shape: {df.shape}')
print(f'Columns: {df.columns}')

Shape: (1194, 20)
Columns: Index(['adDate', 'region', 'zipcode', 'areaCode', 'category', 'paidPromotion',
       'subject', 'eletronicsModel', 'storage', 'color', 'batteryHealth',
       'eletronicsCondition', 'price', 'pictures', 'sellerName', 'storageGB',
       'subjectLength', 'picturesPerc', 'modelOrder', 'macroRegion'],
      dtype='object')


#### Preparação dos dados

In [4]:
# Dropping
df = df.drop(columns=['adDate', 'region', 'zipcode', 'areaCode', 'category', 'subject', 'storage', 'sellerName', 'picturesPerc', 'modelOrder'])
df_ = df.copy()

In [5]:
# Categorical Features Labeling
le = LabelEncoder()

for col in ['paidPromotion', 'eletronicsModel', 'color', 'batteryHealth', 'eletronicsCondition', 'macroRegion']:
  df[col] = le.fit_transform(df[col])

## Tarefa de Regressão

In [6]:
X = df.drop(columns='price').values
y = df['price'].values

#### Split inicial

- 70% Training
- 30% Testing

In [7]:
X_train, X_test, y_train, y_test = \
  train_test_split(X, y, test_size=0.3, random_state=42)

print(f'Train X: {X_train.shape}, y: {y_train.shape}')
print(f' Test X: {X_test.shape}, y: {y_test.shape}')

Train X: (835, 9), y: (835,)
 Test X: (359, 9), y: (359,)


#### Linear Regression (LR)

In [8]:
lr = LinearRegression() # Não paramétrico
lr = timed_experiment(lr, X_train, y_train)
lr_pred = lr.predict(X_test)

Experiment time: 0.00


#### Decision Tree Regressor (DTR)

In [9]:
dtr = GridSearchCV(
  estimator=DecisionTreeRegressor(), 
  param_grid={
    'criterion': ['squared_error', 'friedman_mse', 'absolute_error'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 2, 5, 10]
  },
  cv=10,
  refit=True,
  n_jobs=4,
  scoring='neg_mean_squared_error'
)

dtr = timed_experiment(dtr, X_train, y_train)
dtr_pred = dtr.predict(X_test)

Experiment time: 0.88


#### Support Vector Regression (SVR)

In [10]:
svr = GridSearchCV(
  estimator=SVR(), 
  param_grid={
    'gamma': ['auto', 'scale', 10^-3, 10^-1, 10, 10^3, 10^5],
    'C': [1, 2, 3, 4, 5]
  },
  cv=10,
  refit=True,
  n_jobs=4,
  scoring='neg_mean_squared_error'
)

svr = timed_experiment(svr, X_train, y_train)
svr_pred = svr.predict(X_test)

Experiment time: 1.56


### Resultados

In [19]:
print('Root Mean Squared Error')
print('-'*40)
print(f'LR:  {mse(y_test, lr_pred,  squared=False):.2f}')
print(f'DTR: {mse(y_test, dtr_pred, squared=False):.2f}')
print(f'SVR: {mse(y_test, svr_pred, squared=False):.2f}')

Root Mean Squared Error
----------------------------------------
LR:  261.53
DTR: 269.20
SVR: 682.87


---

## Tarefa de Classificação

In [22]:
df = df[df['eletronicsModel'].isin(['IPHONE 11', 'IPHONE XR', 'IPHONE 8 PLUS', 'IPHONE 7', 'IPHONE 7 PLUS'])]

In [12]:
df = pd.get_dummies(df, columns=['paidPromotion', 'eletronicsModel', 'color', 
  'batteryHealth', 'eletronicsCondition', 'macroRegion'])

df.columns

Index(['price', 'pictures', 'storageGB', 'subjectLength',
       'paidPromotion_False', 'paidPromotion_True',
       'eletronicsModel_IPHONE 11', 'eletronicsModel_IPHONE 7',
       'eletronicsModel_IPHONE 7 PLUS', 'eletronicsModel_IPHONE 8 PLUS',
       'eletronicsModel_IPHONE XR', 'color_Amarelo', 'color_Azul',
       'color_Branco', 'color_Bronze', 'color_Cinza', 'color_Dourado',
       'color_Laranja', 'color_Outra', 'color_Prata', 'color_Preto',
       'color_Rosa', 'color_Roxo', 'color_Verde', 'color_Vermelho',
       'batteryHealth_Boa (80% até 94%)', 'batteryHealth_OK (60% até 79%)',
       'batteryHealth_Perfeita (95% até 100%)',
       'batteryHealth_Ruim (40% até 59%)', 'eletronicsCondition_Com defeito',
       'eletronicsCondition_Novo', 'eletronicsCondition_Recondicionado',
       'eletronicsCondition_Usado - Bom',
       'eletronicsCondition_Usado - Excelente', 'macroRegion_Cariri',
       'macroRegion_Fortaleza', 'macroRegion_Litoral Leste',
       'macroRegion_Norte', 'm

In [57]:
X = df.drop(columns=[
  #'eletronicsModel',
  'eletronicsModel_IPHONE 11', 'eletronicsModel_IPHONE 11 PRO',
  'eletronicsModel_IPHONE 11 PRO MAX', 'eletronicsModel_IPHONE 12',
  'eletronicsModel_IPHONE 12 MINI', 'eletronicsModel_IPHONE 12 PRO',
  'eletronicsModel_IPHONE 12 PRO MAX', 'eletronicsModel_IPHONE 13',
  'eletronicsModel_IPHONE 13 MINI', 'eletronicsModel_IPHONE 13 PRO',
  'eletronicsModel_IPHONE 13 PRO MAX', 'eletronicsModel_IPHONE 14',
  'eletronicsModel_IPHONE 14 PLUS', 'eletronicsModel_IPHONE 14 PRO',
  'eletronicsModel_IPHONE 14 PRO MAX', 'eletronicsModel_IPHONE 6',
  'eletronicsModel_IPHONE 6S', 'eletronicsModel_IPHONE 6S PLUS',
  'eletronicsModel_IPHONE 7', 'eletronicsModel_IPHONE 7 PLUS',
  'eletronicsModel_IPHONE 8', 'eletronicsModel_IPHONE 8 PLUS',
  'eletronicsModel_IPHONE SE 2020', 'eletronicsModel_IPHONE X',
  'eletronicsModel_IPHONE XR', 'eletronicsModel_IPHONE XS',
  'eletronicsModel_IPHONE XS MAX'
]).values

# Categorical Features Labeling
le = LabelEncoder()
y = le.fit_transform(classes)

#### Split inicial

- 70% Training
- 30% Testing

In [58]:
X_train, X_test, y_train, y_test = \
  train_test_split(X, y, test_size=0.3, random_state=42)

print(f'Train X: {X_train.shape}, y: {y_train.shape}')
print(f' Test X: {X_test.shape}, y: {y_test.shape}')

Train X: (835, 34), y: (835,)
 Test X: (359, 34), y: (359,)


#### SGDClassifier (SGD)

In [59]:
sgd = GridSearchCV(
  estimator=SGDClassifier(), 
  param_grid={
    'loss': ['log_loss', 'squared_error', 'hinge'],
    'penalty': ['l2', 'l1', 'elasticnet'],
    'learning_rate': ['optimal'],
    'random_state': [42],
    'average': [0]
  },
  cv=10,
  refit=True,
  n_jobs=4,
  scoring='f1_macro'
)

sgd = timed_experiment(sgd, X_train, y_train)
sgd_pred = sgd.predict(X_test)

Experiment time: 4.62


#### KNeighborsClassifier (KNN)

In [61]:
knn = GridSearchCV(
  estimator=KNeighborsClassifier(), 
  param_grid={
    'n_neighbors': [2**i + 1 for i in range(10)],
    'metric': ['euclidean', 'manhattan', 'cosine', 'nan_euclidean'],
  },
  cv=10,
  refit=True,
  n_jobs=4,
  scoring='f1_macro'
)

knn = timed_experiment(knn, X_train, y_train)
knn_pred = knn.predict(X_test)

Experiment time: 1.64


#### Support Vector Machine Classifier (SVC)

In [62]:
svc = GridSearchCV(
  estimator=SVC(), 
  param_grid={
    'gamma': ['auto', 'scale', 10^-3, 10^-1, 10, 10^3, 10^5],
    'C': [1, 2, 3, 4, 5]
  },
  cv=10,
  refit=True,
  n_jobs=4,
  scoring='f1_macro'
)

svc = timed_experiment(svc, X_train, y_train)
svc_pred = svc.predict(X_test)

Experiment time: 5.23


#### Decision Tree Classifier (DTC)

In [63]:
dtc = GridSearchCV(
  estimator=DecisionTreeClassifier(), 
  param_grid={
    'criterion': ['gini', 'entropy', 'log_loss'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 2, 5, 10]
  },
  cv=10,
  refit=True,
  n_jobs=4,
  scoring='f1_macro'
)

dtc = timed_experiment(dtc, X_train, y_train)
dtc_pred = dtc.predict(X_test)

Experiment time: 0.32


#### Random Forest Classifier (RF)

In [64]:
rf = GridSearchCV(
  estimator=RandomForestClassifier(), 
  param_grid={
    'n_estimators': [10**i for i in range(1, 4)],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': [None, 2, 5, 10]
  },
  cv=10,
  refit=True,
  n_jobs=4,
  scoring='f1_macro'
)

rf = timed_experiment(rf, X_train, y_train)
rf_pred = rf.predict(X_test)

Experiment time: 56.23


### Resultados

In [65]:
print('='*40)
print('Accuracy Score')
print('-'*40)
print(f'SGD: {accuracy_score(y_test, sgd_pred):.4f}')
print(f'KNN: {accuracy_score(y_test, knn_pred):.4f}')
print(f'SVC: {accuracy_score(y_test, svc_pred):.4f}')
print(f'DTC: {accuracy_score(y_test, dtc_pred):.4f}')
print(f'RF:  {accuracy_score(y_test, rf_pred):.4f}')

print('')
print('='*40)

print('F1 Score')
print('-'*40)
print(f'SGD: {f1_score(y_test, sgd_pred,  average="macro"):.4f}')
print(f'KNN: {f1_score(y_test, knn_pred, average="macro"):.4f}')
print(f'SVC: {f1_score(y_test, svc_pred, average="macro"):.4f}')
print(f'DTC: {f1_score(y_test, dtc_pred, average="macro"):.4f}')
print(f'RF:  {f1_score(y_test, rf_pred, average="macro"):.4f}')

Accuracy Score
----------------------------------------
SGD: 0.1337
KNN: 0.5404
SVC: 0.4596
DTC: 0.5487
RF:  0.5599

F1 Score
----------------------------------------
SGD: 0.0190
KNN: 0.3206
SVC: 0.2526
DTC: 0.3666
RF:  0.3402
