# Exercício

# Regressão

**Exercício: Prever o gasto com seguro**

**Arquivo insurance.csv**

**Passos:**

1. Carregue o conjunto de dados em um DataFrame usando a biblioteca pandas.
2. Explore e visualize os dados para entender suas características.
3. Divida os dados em recursos (X) e rótulos (y).
4. Divida o conjunto de dados em conjuntos de treinamento e teste.
5. Utilização do KNN (`KNeighborsRegressor`).
6. Utilização da regressão linear (`LinearRegression`).
7. Utilização da árvore de decisão (`DecisionTreeRegressor`).

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn import preprocessing
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, r2_score
from sklearn.model_selection import train_test_split
import math
from sklearn.neighbors import DistanceMetric, KNeighborsClassifier
from sklearn import neighbors
from sklearn.metrics import mean_squared_error,mean_absolute_error, mean_absolute_percentage_error, r2_score
from sklearn.cluster import KMeans
from sklearn import tree
import graphviz
from scipy import stats

# Regressão

In [3]:
# Load the raw insurance table from the shared Datasets folder.
insurance_csv_path = '../../Datasets/insurance.csv'
df_regression = pd.read_csv(insurance_csv_path, sep=",", low_memory=False)

# Discard the first column, which is treated here as a row id.
# NOTE(review): the frame shown after this step has no `age` column. If the CSV's
# first column is actually a feature rather than an id, this positional drop
# silently loses it -- confirm against the file header.
first_column = df_regression.columns[0]
df_regression = df_regression.drop(columns=[first_column])
df_regression

Unnamed: 0,sex,bmi,children,smoker,region,charges
0,female,27.900,0,yes,southwest,16884.92400
1,male,33.770,1,no,southeast,1725.55230
2,male,33.000,3,no,southeast,4449.46200
3,male,22.705,0,no,northwest,21984.47061
4,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...
1333,male,30.970,3,no,northwest,10600.54830
1334,female,31.920,0,no,northeast,2205.98080
1335,female,36.850,0,no,southeast,1629.83350
1336,female,25.800,0,no,southwest,2007.94500


In [5]:
# Count rows that are exact copies of an earlier row.
df_regression.duplicated(keep='first').sum()

1

In [6]:
# Keep the first occurrence of each duplicated row. Rebinding the name (instead of
# mutating in place) yields the same frame and keeps the cell safe to re-run.
df_regression = df_regression.drop_duplicates(keep='first')

In [7]:
# Total number of missing cells: per-column counts first, then their grand total.
missing_per_column = df_regression.isna().sum()
missing_per_column.sum()

0

In [8]:
# Print a summary of the frame: index range, column dtypes, non-null counts, memory use.
df_regression.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1337 entries, 0 to 1337
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   sex       1337 non-null   object 
 1   bmi       1337 non-null   float64
 2   children  1337 non-null   int64  
 3   smoker    1337 non-null   object 
 4   region    1337 non-null   object 
 5   charges   1337 non-null   float64
dtypes: float64(2), int64(1), object(3)
memory usage: 73.1+ KB


In [9]:
# Integer-encode every object-dtype (text) column in place so the estimators can use it.
# NOTE(review): the integer codes impose an arbitrary order on nominal columns such as
# `region`; confirm that is acceptable for the linear and distance-based models below.
le = preprocessing.LabelEncoder()
for column in df_regression.columns:
    # Numeric columns are left untouched.
    if df_regression[column].dtypes != 'object':
        continue
    df_regression[column] = le.fit_transform(df_regression[column])

df_regression.head()

Unnamed: 0,sex,bmi,children,smoker,region,charges
0,0,27.9,0,1,3,16884.924
1,1,33.77,1,0,2,1725.5523
2,1,33.0,3,0,2,4449.462
3,1,22.705,0,0,1,21984.47061
4,1,28.88,0,0,1,3866.8552


In [10]:
# Pairwise Pearson correlation between the columns, rendered as a colour-graded table.
corr = df_regression.corr(method='pearson')
corr_styler = corr.style
corr_styler.background_gradient(cmap='coolwarm')

Unnamed: 0,sex,bmi,children,smoker,region,charges
sex,1.0,0.046397,0.017848,0.076596,0.004936,0.058044
bmi,0.046397,1.0,0.012755,0.003746,0.157574,0.198401
children,0.017848,0.012755,1.0,0.007331,0.016258,0.067389
smoker,0.076596,0.003746,0.007331,1.0,-0.002358,0.787234
region,0.004936,0.157574,0.016258,-0.002358,1.0,-0.006547
charges,0.058044,0.198401,0.067389,0.787234,-0.006547,1.0


### Separando o alvo (charges) dos atributos

In [11]:
# Target as a single-column DataFrame holding only `charges`.
target_regression = df_regression[["charges"]]

In [12]:
# Features: every column except the target. Dropping `charges` by name (rather than by
# the positional index 5) stays correct if columns are added, removed or reordered
# upstream, and matches how the target itself is selected.
X_regression = df_regression.drop(columns=["charges"])
# Target: the single-column `charges` frame.
y_regression = target_regression

In [13]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
(X_train_regression, X_test_regression,
 y_train_regression, y_test_regression) = train_test_split(
    X_regression,
    y_regression,
    test_size=0.2,
    random_state=0,
)

In [14]:
def results_regression(y_test, y_pred):
    """Print the regression error metrics for a set of predictions.

    Args:
        y_test: Ground-truth target values.
        y_pred: Predicted values to compare against ``y_test``.

    The function prints MSE, RMSE (square root of the MSE), MAE, MAPE and the
    R^2 score, one per line. It returns nothing.
    """
    # The metrics are independent of each other, except RMSE which derives from MSE.
    r2 = r2_score(y_test, y_pred)
    mape = mean_absolute_percentage_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = math.sqrt(mse)

    report = f"mse: {mse}\n rmse: {rmse}\n mae: {mae}\n mape: {mape}\n r2_score {r2}"
    print(report)

### Regressão Linear

In [15]:
from sklearn.linear_model import LinearRegression

In [16]:
def regressaoLinear(X_train, y_train, X_test, y_test):
    """Fit a ``LinearRegression`` model and print its metrics on the test data.

    Args:
        X_train: Training features.
        y_train: Training targets.
        X_test: Features to predict on.
        y_test: True targets for ``X_test``; used only for the printed metrics.

    Returns:
        Tuple ``(y_pred, lr)``: the predictions for ``X_test`` and the fitted
        estimator.
    """
    # fit() returns the estimator itself, so construction and training are chained.
    lr = LinearRegression().fit(X_train, y_train)
    y_pred = lr.predict(X_test)

    # Metrics are printed as a side effect.
    results_regression(y_test, y_pred)
    return y_pred, lr

In [18]:
# Baseline: linear regression on every feature, before any outlier filtering.
y_pred, lr = regressaoLinear(
    X_train_regression,
    y_train_regression,
    X_test_regression,
    y_test_regression,
)

mse: 58853899.606580175
 rmse: 7671.629527458959
 mae: 5714.915504776208
 mape: 0.7698688580793931
 r2_score 0.6501572186968587


#### Removendo outliers utilizando o z-score

In [19]:
# Keep only the rows whose absolute z-score is below 3 in every column.
# NOTE(review): the filter runs on the whole frame, so it also screens the target
# `charges`, and it is applied before the train/test split -- confirm that is intended.
z_scores = np.abs(stats.zscore(df_regression))
within_three_sigma = (z_scores < 3).all(axis=1)
df_without_outliers_regression_z = df_regression[within_three_sigma]

Separando o alvo (charges) dos atributos

In [20]:
# Target of the outlier-filtered frame, as a single-column DataFrame holding `charges`.
target_regression_z = df_without_outliers_regression_z[["charges"]]

In [21]:
# Features of the outlier-filtered frame: every column except the target. Dropping
# `charges` by name (rather than by the positional index 5) stays correct if columns
# are added, removed or reordered upstream.
X_regression_z = df_without_outliers_regression_z.drop(columns=["charges"])
# Target: the single-column `charges` frame of the filtered data.
y_regression_z = target_regression_z

In [22]:
# Same 80/20 split and seed as before, now on the outlier-filtered data.
(X_train_regression_z, X_test_regression_z,
 y_train_regression_z, y_test_regression_z) = train_test_split(
    X_regression_z,
    y_regression_z,
    test_size=0.2,
    random_state=0,
)

Simples (todos os atributos, sem padronização)

In [23]:
# Linear regression on every feature of the outlier-filtered data.
y_pred, lr = regressaoLinear(
    X_train_regression_z,
    y_train_regression_z,
    X_test_regression_z,
    y_test_regression_z,
)

mse: 47257168.10383655
 rmse: 6874.384925492356
 mae: 5331.426220476478
 mape: 0.7425245420184374
 r2_score 0.693963912262842


### Usando apenas as colunas mais correlacionadas

In [24]:
# Restrict the features to `smoker` and `bmi`, then split with the same ratio and seed.
selected_features = ['smoker', 'bmi']
(X_train_regression_z_corr, X_test_regression_z_corr,
 y_train_regression_z_corr, y_test_regression_z_corr) = train_test_split(
    X_regression_z[selected_features],
    y_regression_z,
    test_size=0.2,
    random_state=0,
)

In [25]:
# Linear regression using only the `smoker` and `bmi` features.
y_pred, lr = regressaoLinear(
    X_train_regression_z_corr,
    y_train_regression_z_corr,
    X_test_regression_z_corr,
    y_test_regression_z_corr,
)

mse: 46870935.36705851
 rmse: 6846.235123559409
 mae: 5327.864605562998
 mape: 0.7710904254183321
 r2_score 0.6964651445724008


Aplicando o StandardScaler

In [26]:
# Standardize the features. The scaler learns its statistics from the training split
# only and then applies them unchanged to both splits.
sc = StandardScaler()
sc.fit(X_train_regression_z_corr)
X_train_regression_standard_z_corr = sc.transform(X_train_regression_z_corr)
X_test_regression_standard_z_corr = sc.transform(X_test_regression_z_corr)

In [27]:
# Linear regression on the standardized `smoker` / `bmi` features.
y_pred, lr = regressaoLinear(
    X_train_regression_standard_z_corr,
    y_train_regression_z_corr,
    X_test_regression_standard_z_corr,
    y_test_regression_z_corr,
)

mse: 46870935.367058516
 rmse: 6846.235123559409
 mae: 5327.864605562998
 mape: 0.7710904254183323
 r2_score 0.6964651445724008


### Árvore de Decisão

In [46]:
def computeRegressionDecisionTree(X_train, y_train, X_test, y_test, random_state=0):
    """Fit a decision-tree regressor and print its metrics on the test data.

    Args:
        X_train: Training features.
        y_train: Training targets.
        X_test: Features to predict on.
        y_test: True targets for ``X_test``; used only for the printed metrics.
        random_state: Seed forwarded to ``DecisionTreeRegressor``. An unseeded tree
            can choose differently among equally good splits on each run, which
            makes the printed metrics change between executions. The default of 0
            matches the seed used for the train/test splits in this notebook;
            pass ``None`` to get the unseeded behaviour back.

    Returns:
        Whatever ``results_regression`` returns; the metrics themselves are
        printed by that helper.
    """
    arvore_regressao = tree.DecisionTreeRegressor(random_state=random_state)
    arvore_regressao.fit(X_train, y_train)
    y_pred = arvore_regressao.predict(X_test)

    return results_regression(y_test, y_pred)

In [47]:
# Decision tree on every feature of the outlier-filtered data.
computeRegressionDecisionTree(
    X_train_regression_z,
    y_train_regression_z,
    X_test_regression_z,
    y_test_regression_z,
)

mse: 62988246.25878726
 rmse: 7936.513482555628
 mae: 6078.76910782061
 mape: 0.9099885539636255
 r2_score 0.5920898938313877


In [51]:
# Decision tree using only the `smoker` and `bmi` features.
computeRegressionDecisionTree(
    X_train_regression_z_corr,
    y_train_regression_z_corr,
    X_test_regression_z_corr,
    y_test_regression_z_corr,
)

mse: 51273315.83334256
 rmse: 7160.53879490521
 mae: 5341.174954407125
 mape: 0.8067870163889442
 r2_score 0.6679554528432708


In [52]:
# Decision tree on the standardized `smoker` / `bmi` features.
computeRegressionDecisionTree(
    X_train_regression_standard_z_corr,
    y_train_regression_z_corr,
    X_test_regression_standard_z_corr,
    y_test_regression_z_corr,
)

mse: 50554414.54213072
 rmse: 7110.162764812822
 mae: 5329.665634407125
 mape: 0.8036512526930207
 r2_score 0.6726110373283216


### KNN

In [28]:
def computeKNNRegression(X_train, y_train, X_test, y_test, printResults=True, n_neighbors=3, algorithm='auto'):
    """Fit a k-nearest-neighbours regressor and predict the test data.

    Args:
        X_train: Training features.
        y_train: Training targets.
        X_test: Features to predict on.
        y_test: True targets for ``X_test``; only used when metrics are printed.
        printResults: When true, print the metrics via ``results_regression``.
        n_neighbors: Number of neighbours passed to ``KNeighborsRegressor``.
        algorithm: Neighbour-search algorithm passed to ``KNeighborsRegressor``.

    Returns:
        Tuple ``(y_pred, knn)``: the predictions for ``X_test`` and the fitted
        estimator.
    """
    knn_options = {"n_neighbors": n_neighbors, "algorithm": algorithm}
    knn = neighbors.KNeighborsRegressor(**knn_options)
    knn.fit(X_train, y_train)

    y_pred = knn.predict(X_test)
    if printResults:
        results_regression(y_test, y_pred)

    return y_pred, knn

Simples (todos os atributos, sem padronização)

In [32]:
# KNN (k=5) on every feature of the outlier-filtered data, without scaling.
y_pred_regression, knn = computeKNNRegression(
    X_train_regression_z,
    y_train_regression_z,
    X_test_regression_z,
    y_test_regression_z,
    n_neighbors=5,
)

mse: 82262113.91039817
 rmse: 9069.846410518658
 mae: 6751.009462309161
 mape: 0.8413361128031683
 r2_score 0.46727287054505373


##### KNN usando apenas as colunas mais correlacionadas

In [33]:
# KNN (k=5) using only the `smoker` and `bmi` features, without scaling.
y_pred_regression, knn = computeKNNRegression(
    X_train_regression_z_corr,
    y_train_regression_z_corr,
    X_test_regression_z_corr,
    y_test_regression_z_corr,
    n_neighbors=5,
)

mse: 43761037.73820491
 rmse: 6615.212599622548
 mae: 4785.875581942749
 mape: 0.7946766364180167
 r2_score 0.7166047539012161


Aplicando o StandardScaler

In [43]:
# KNN on the standardized `smoker` / `bmi` features.
# NOTE(review): this run uses n_neighbors=9 while the unscaled KNN runs use 5, so the
# comparison changes the scaling and k at the same time -- confirm that is intended.
y_pred_regression, knn = computeKNNRegression(
    X_train_regression_standard_z_corr,
    y_train_regression_z_corr,
    X_test_regression_standard_z_corr,
    y_test_regression_z_corr,
    n_neighbors=9,
)

mse: 34010927.39196345
 rmse: 5831.888835700099
 mae: 4434.748487159033
 mape: 0.7627148404722448
 r2_score 0.7797461934985473
