#### Conteúdos
0. sklearn workflow
1. preparar dados
2. escolher o modelo
3. treinar o modelo e fazer previsões nos dados
4. avaliar o modelo
5. ajustar o modelo
6. salvar e carregar um modelo
7. juntar tudo

## 1. preparar dados

In [13]:
import pandas as pd
import numpy as np
heart_disease = pd.read_csv('heart-disease.csv')

X = heart_disease.drop('target', axis=1)
y = heart_disease['target']

## 2. escolher o modelo

In [6]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()

## 3. treinar o modelo e fazer previsões nos dados

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [12]:
clf.fit(X_train, y_train);

In [17]:
# fazer uma previsão
y_preds = clf.predict(X_test)

## 4. avaliar o modelo

In [19]:
# avaliar o modelo no conjunto de treinamento e teste
clf.score(X_train, y_train)

1.0

In [21]:
clf.score(X_test, y_test)

0.7704918032786885

In [28]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.92      0.66      0.77        35
           1       0.67      0.92      0.77        26

    accuracy                           0.77        61
   macro avg       0.79      0.79      0.77        61
weighted avg       0.81      0.77      0.77        61



In [25]:
confusion_matrix(y_test, y_preds)

array([[23, 12],
       [ 2, 24]], dtype=int64)

In [29]:
accuracy_score(y_test, y_preds)

0.7704918032786885

## 5. ajustar o modelo

In [38]:
np.random.seed(42)

for i in range(10, 100, 10):
    print(f'Trying model with {i} estimators')
    clf = RandomForestClassifier(n_estimators=i).fit(X_train, y_train)
    print(f'Model accuracy on test set: {clf.score(X_test, y_test) * 100:.2f}% \n')

Trying model with 10 estimators
Model accuracy on test set: 80.33% 

Trying model with 20 estimators
Model accuracy on test set: 75.41% 

Trying model with 30 estimators
Model accuracy on test set: 77.05% 

Trying model with 40 estimators
Model accuracy on test set: 78.69% 

Trying model with 50 estimators
Model accuracy on test set: 77.05% 

Trying model with 60 estimators
Model accuracy on test set: 80.33% 

Trying model with 70 estimators
Model accuracy on test set: 78.69% 

Trying model with 80 estimators
Model accuracy on test set: 78.69% 

Trying model with 90 estimators
Model accuracy on test set: 80.33% 



## 6. salvar e carregar um modelo

In [41]:
import pickle

pickle.dump(clf, open('random_forest_model_1.pkl', 'wb'))

In [43]:
loaded_model = pickle.load(open("random_forest_model_1.pkl", 'rb'))

loaded_model.score(X_test, y_test)

0.8032786885245902

## 1. preparar dados (em detalhes)

1. Dividir os dados em X e y
2. Converter valores não numéricos em valores numéricos (feature encoding)
3. Preencher (imputing) ou eliminar valores nulos

In [144]:
car_sales = pd.read_csv('scikit-learn-data/car-sales-extended.csv')

X = car_sales.drop('Price', axis=1)
y = car_sales['Price']

In [145]:
# transformar categorias em números

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorcal_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", 
                                   one_hot,
                                   categorcal_features)],
                                   remainder="passthrough")

transformed_X = transformer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size=0.2)

In [146]:
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)
model = RandomForestRegressor()
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.29444524256551574

### 1.3.2 trabalhando com datasets com dados faltantes

In [1]:
# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [166]:
car_sales_missing = pd.read_csv("https://raw.githubusercontent.com/mrdbourke/zero-to-mastery-ml/master/data/car-sales-extended-missing-data.csv")
car_sales_missing.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [167]:
# Check missing values
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [168]:
# Drop the rows with no labels
car_sales_missing.dropna(subset=["Price"], inplace=True)
car_sales_missing.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [169]:
from sklearn.model_selection import train_test_split

# Split into X & y
X = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

# Split data into train and test
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2)

In [170]:
# Check missing values
X.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
dtype: int64

In [171]:
# Fill missing values with Scikit-Learn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Fill categorical values with 'missing' & numerical values with mean
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")
door_imputer = SimpleImputer(strategy="constant", fill_value=4)
num_imputer = SimpleImputer(strategy="mean")

# Define columns
cat_features = ["Make", "Colour"]
door_feature = ["Doors"]
num_features = ["Odometer (KM)"]

# Create an imputer (something that fills missing data)
imputer = ColumnTransformer([
    ("cat_imputer", cat_imputer, cat_features),
    ("door_imputer", door_imputer, door_feature),
    ("num_imputer", num_imputer, num_features)
])

# Fill train and test values separately
filled_X_train = imputer.fit_transform(X_train) # fit_transform imputes the missing values from the training set and fills them simultaneously
filled_X_test = imputer.transform(X_test) # tranform takes the imputing missing values from the training set and fills the test set with them

# Check filled X_train
filled_X_train

array([['Honda', 'White', 4.0, 71934.0],
       ['Toyota', 'Red', 4.0, 162665.0],
       ['Honda', 'White', 4.0, 42844.0],
       ...,
       ['Toyota', 'White', 4.0, 196225.0],
       ['Honda', 'Blue', 4.0, 133117.0],
       ['Honda', 'missing', 4.0, 150582.0]], dtype=object)

In [172]:
# Get our transformed data array's back into DataFrame's
car_sales_filled_train = pd.DataFrame(filled_X_train, 
                                      columns=["Make", "Colour", "Doors", "Odometer (KM)"])

car_sales_filled_test = pd.DataFrame(filled_X_test, 
                                     columns=["Make", "Colour", "Doors", "Odometer (KM)"])

# Check missing data in training set
car_sales_filled_train.isna().sum()

Make             0
Colour           0
Doors            0
Odometer (KM)    0
dtype: int64

In [173]:
# Check missing data in test set
car_sales_filled_test.isna().sum()

Make             0
Colour           0
Doors            0
Odometer (KM)    0
dtype: int64

In [174]:
# Check to see the original... still missing values
car_sales_missing.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [175]:
# Import OneHotEncoder class from sklearn
from sklearn.preprocessing import OneHotEncoder

# Now let's one hot encode the features with the same code as before 
categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", 
                                 one_hot, 
                                 categorical_features)],
                                 remainder="passthrough")

# Fill train and test values separately
transformed_X_train = transformer.fit_transform(car_sales_filled_train) # fit and transform the training data
transformed_X_test = transformer.transform(car_sales_filled_test) # transform the test data

# Check transformed and filled X_train
transformed_X_train.toarray()

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 7.19340e+04],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.62665e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 4.28440e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.96225e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.33117e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.50582e+05]])

In [176]:
# Now we've transformed X, let's see if we can fit a model
np.random.seed(42)
from sklearn.ensemble import RandomForestRegressor

# Setup model
model = RandomForestRegressor()

# Make sure to use transformed (filled and one-hot encoded X data)
model.fit(transformed_X_train, y_train)
model.score(transformed_X_test, y_test)

0.21229043336119102

## 2. escolher o modelo

Mapa de modelos do scikit-learn: https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

### 2.1 para um problema de regressão

In [12]:
# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [17]:
# dataset de imóveis da Califórnia
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
housing_df = pd.DataFrame(housing['data'], columns=housing['feature_names'])
housing_df['target'] = housing["target"]

In [20]:
from sklearn.model_selection import train_test_split

np.random.seed(42)

X = housing_df.drop("target", axis=1)
y = housing_df['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [22]:
from sklearn.linear_model import Ridge
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

clf = Ridge()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.5758549611440126

In [31]:
# utilizando modelos de ensemble
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(oob_score=True, max_depth=100)
rf.fit(X_train, y_train)
print(rf.score(X_test, y_test))
print(rf.oob_score_)

0.8071211142561433
0.8065744954925429


### 2.2 para um problema de classificação

In [55]:
heart_disease = pd.read_csv('scikit-learn-data/heart-disease.csv')

In [56]:
np.random.seed(42)

X = heart_disease.drop('target', axis=1)
y = heart_disease['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [57]:
## com LinearSVC
from sklearn.svm import LinearSVC

svc = LinearSVC()
svc.fit(X_train, y_train)
svc.score(X_test, y_test)



0.8688524590163934

In [74]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(ccp_alpha=1E-1)
rfc.fit(X_train, y_train)
rfc.score(X_test, y_test)

0.9016393442622951

Dica:
1. Se os dados são estruturados, usar modelo de ensemble
2. Se os dados não são estruturados, usar deep learning ou transfer learning

### 3. treinar o modelo e fazer previsões nos dados

#### 3.1 treinar o modelo

In [79]:
np.random.seed(42)

X = heart_disease.drop('target', axis=1)
y = heart_disease['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(ccp_alpha=1E-1)
rfc.fit(X_train, y_train)
rfc.score(X_test, y_test)

0.8688524590163934

### 3.2 fazer previsões usando um modelo treinado e ver métricas de acurácia

In [85]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

y_preds = rfc.predict(X_test)

print(accuracy_score(y_preds, y_test)) 
print(confusion_matrix(y_preds, y_test))
print(classification_report(y_preds, y_test))

0.8688524590163934
[[24  3]
 [ 5 29]]
              precision    recall  f1-score   support

           0       0.83      0.89      0.86        27
           1       0.91      0.85      0.88        34

    accuracy                           0.87        61
   macro avg       0.87      0.87      0.87        61
weighted avg       0.87      0.87      0.87        61



In [96]:
y_preds_prob = rfc.predict_proba(X_test) # retorna probabilidades da classificação ordenadas pela label das classes

y_preds_prob[:5]

array([[0.51531024, 0.48468976],
       [0.41835722, 0.58164278],
       [0.4688158 , 0.5311842 ],
       [0.57923256, 0.42076744],
       [0.42750656, 0.57249344]])