# Aula 3 - Projeto de Aprendizado de Máquina

Professor: Erneson A. Oliveira<br>
Curso: MBA em Ciência de Dados<br>
Universidade: Universidade de Fortaleza<br>
Data: 8 de Fevereiro de 2020

# 1. Base de dados

In [1]:
import pandas as pd

housing=pd.read_csv('housing.csv',sep=';',encoding='utf-8')

# 2. Conjunto de treinamento e conjunto de teste

In [2]:
from sklearn.model_selection import train_test_split # Use if data is large enough

test_size=0.2 # Test set size: 20%
random_state=42 # Random seed

train_set,test_set=train_test_split(housing,test_size=test_size,random_state=random_state)

# 3. Preparar o dado para modelos de AM

In [3]:
housing=train_set.drop('median_house_value',axis=1) # X
housing_labels=train_set['median_house_value'].copy() # y

from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X[self.attribute_names].values

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import FeatureUnion

housing_cat=['ocean_proximity'] # Categorical columns only
housing_num=housing.drop(housing_cat,axis=1) # Numerical columns only

num_attribs = list(housing_num)
cat_attribs = housing_cat

num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs)),
        ('cat_encoder', OneHotEncoder(sparse=False)),
    ])

full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])

housing_prepared=full_pipeline.fit_transform(housing)

# 4. Treinar o modelo

In [4]:
X=housing_prepared
y=housing_labels

from sklearn.linear_model import LinearRegression

model=LinearRegression()

lr=model.fit(X,y)

print(model.score(X,y)) # R^2
print(lr.coef_) # w_1, w_2, ...
print(lr.intercept_) # w_0

0.6496324087252945
[-5.29409680e+04 -5.42246730e+04  1.37762447e+04 -1.31629870e+04
  4.29362062e+04 -4.33337904e+04  1.84967887e+04  7.51809312e+04
  1.41487684e+17  1.41487684e+17  1.41487684e+17  1.41487684e+17
  1.41487684e+17]
-1.4148768429185621e+17


# 5. Previsão para o conjunto de treinamento

In [5]:
from sklearn.metrics import mean_squared_error
import numpy as np

y_hat = model.predict(X)
rmse=np.sqrt(mean_squared_error(y,y_hat))

print(rmse)

68437.10704969708


# 6. Previsão para o conjunto de teste

In [6]:
housing=test_set.drop('median_house_value',axis=1) # X
housing_labels=test_set['median_house_value'].copy() # y

housing_prepared=full_pipeline.fit_transform(housing)

X=housing_prepared
y=housing_labels

y_hat = model.predict(X)
rmse=np.sqrt(mean_squared_error(y,y_hat))

print(rmse)

70111.45359120371
