# A03 - Projeto de Aprendizado de Máquina

## 0 Módulos

In [1]:
%matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,LabelBinarizer,MinMaxScaler,StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline,FeatureUnion

from sklearn.impute import SimpleImputer

# `sklearn.preprocessing.Imputer` was deprecated in scikit-learn 0.20 and removed in 0.22,
# where importing it raises ImportError and stops the whole notebook. Keep the name
# available on old releases and fall back to its replacement on current ones.
try:
    from sklearn.preprocessing import Imputer
except ImportError:
    Imputer = SimpleImputer

## 1 Base de dados

In [None]:
# Load the raw dataset from a comma-separated, UTF-8 encoded file in the working directory.
housing=pd.read_csv('housing.csv',sep=',',encoding='utf-8')

In [None]:
# Preview the first rows to check that the file was parsed as expected.
housing.head()

In [None]:
# Column dtypes and non-null counts; columns with fewer non-null entries contain missing values.
housing.info()

In [None]:
# Number of rows per distinct value of the 'ocean_proximity' column.
housing["ocean_proximity"].value_counts()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
housing.describe()

In [None]:
# One histogram per numeric column. plt.show() renders the figure and keeps the
# array-of-axes repr out of the cell output, like the other plotting cells do.
housing.hist(bins=50, figsize=(20,15))
plt.show()

## 2 Conjunto de treinamento e conjunto de teste

In [None]:
# Hold out part of the data as a test set; the fixed seed makes the split reproducible.
test_size=0.2 # Test set size: 20%
random_state=42 # Random seed

train_set,test_set=train_test_split(housing,test_size=test_size,random_state=random_state)

In [None]:
# Size, dtypes and non-null counts of the training split.
train_set.info()

In [None]:
# Size, dtypes and non-null counts of the test split.
test_set.info()

## 3 Explorando o conjunto de treinamento

### 3.1 Visualizando dados geográficos

In [None]:
# Explore a copy so that columns added during exploration do not leak into train_set.
# NOTE: this rebinds `housing`, which until now referred to the full dataset.
housing=train_set.copy()

In [None]:
# Plot every row at its geographic position (longitude on x, latitude on y).
housing.plot.scatter(x='longitude', y='latitude')
plt.show()

In [None]:
# Same geographic scatter, with mostly transparent markers so that areas where
# many points overlap show up darker.
housing.plot.scatter(x='longitude', y='latitude', alpha=0.1)
plt.show()

In [None]:
# Geographic bubble plot: marker size follows 'population', marker color follows 'median_house_value'.
s=housing['population']/100 # Marker sizes: 'population' scaled down by 100
l='population'              # Legend label for the markers
c='median_house_value'      # Column mapped to the marker color
cm=plt.get_cmap('jet')      # Colormap
cb=True                     # Show the colorbar

housing.plot(kind='scatter',x='longitude',y='latitude',alpha=0.4,s=s,label=l,c=c,cmap=cm,colorbar=cb)
plt.legend()
plt.show()

### 3.2 Procurando por correlações

In [None]:
# Pairwise correlation between the numeric attributes.
# `housing` still contains the text column 'ocean_proximity'. pandas >= 2.0 no longer
# drops non-numeric columns silently in DataFrame.corr() and raises ValueError instead,
# so the numeric columns are selected explicitly (same result on older pandas).
corr_matrix=housing.select_dtypes(include='number').corr()

In [None]:
# Show the correlation matrix as a color-coded image with a colorbar.
cm=plt.get_cmap('viridis')      # Colormap

plt.matshow(corr_matrix,cmap=cm)
plt.colorbar()
plt.show()

In [None]:
# Correlation of every attribute with the target column, strongest positive first.
corr_matrix['median_house_value'].sort_values(ascending=False)

In [None]:
# Pairwise scatter plots for the target and three selected attributes.
attributes=['median_house_value','median_income','total_rooms','housing_median_age']
scatter_matrix(housing[attributes],figsize=(12,8))
plt.show()

In [None]:
# Closer look at a single pair of columns: median income against median house value.
housing.plot.scatter(x='median_income', y='median_house_value', alpha=0.1)
plt.show()

### 3.3 Experimentando com combinação de atributos

In [None]:
# Derive ratio features from the raw totals; the new columns are added to the exploration copy in place.
housing['rooms_per_household']=housing['total_rooms']/housing['households']
housing['bedrooms_per_room']=housing['total_bedrooms']/housing['total_rooms']
housing['population_per_household']=housing['population']/housing['households']

In [None]:
# Recompute the correlations, now including the derived ratio columns.
# The text column 'ocean_proximity' is excluded explicitly: pandas >= 2.0 raises
# ValueError in DataFrame.corr() when a non-numeric column is present
# (older versions dropped it silently, so the result is unchanged there).
corr_matrix=housing.select_dtypes(include='number').corr()

In [None]:
# Show the updated correlation matrix as a color-coded image with a colorbar.
cm=plt.get_cmap('viridis')      # Colormap

plt.matshow(corr_matrix,cmap=cm)
plt.colorbar()
plt.show()

In [None]:
# Correlation of every attribute (derived ones included) with the target, strongest positive first.
corr_matrix['median_house_value'].sort_values(ascending=False)

## 4 Preparar o dado para Algoritmos de Aprendizado de Máquina

In [None]:
# Start again from the untouched training set and separate predictors from the target.
# drop() returns a new frame, so train_set itself is not modified.
housing=train_set.drop('median_house_value',axis=1)
housing_labels=train_set['median_house_value'].copy()

### 4.1 Limpeza do dado

In [None]:
# Three alternative ways of dealing with missing 'total_bedrooms' values.
# NOTE: Option 1's result is only displayed as the cell output; it is not assigned,
# so `housing` is left unchanged by this cell.
housing.dropna(subset=['total_bedrooms']) # Option 1: drop the rows where the value is missing
#housing.drop('total_bedrooms',axis=1)     # Option 2: drop the whole column

#median=housing['total_bedrooms'].median() # Option 3: replace missing values with the median
#housing['total_bedrooms'].fillna(median)

In [None]:
# from sklearn.preprocessing import Imputer

# imputer=Imputer(strategy='median')

# housing_num=housing.drop('ocean_proximity',axis=1)
# imputer.fit(housing_num)

# imputer.statistics_

# X=imputer.transform(housing_num)

# housing_tr=pd.DataFrame(X,columns=housing_num.columns)

In [None]:
# Inspect the predictors again; no cell above reassigns `housing`, so the counts are those of the training split.
housing.info()

### 4.2 Manipulação de texto e atributo categórico

In [None]:
# from sklearn.preprocessing import LabelEncoder

# encoder=LabelEncoder()
# housing_cat=housing['ocean_proximity']
# housing_cat_encoded=encoder.fit_transform(housing_cat)
# housing_cat_encoded

In [None]:
# encoder.classes_

In [None]:
# from sklearn.preprocessing import OneHotEncoder

# encoder=OneHotEncoder(categories='auto')

# housing_cat_encoded_reshaped=housing_cat_encoded.reshape(-1,1)
# housing_cat_1hot=encoder.fit_transform(housing_cat_encoded_reshaped)

# housing_cat_1hot

# housing_cat_1hot.toarray()

In [None]:
from sklearn.preprocessing import LabelBinarizer

# Encode the text category as one-hot vectors in a single step.
# sparse_output=True asks the binarizer for a sparse matrix.
encoder=LabelBinarizer(sparse_output=True)
housing_cat=housing['ocean_proximity']
housing_cat_1hot=encoder.fit_transform(housing_cat)

housing_cat_1hot  # not displayed: a cell only shows its last expression
housing_cat_1hot.toarray()

### 4.3 Transformadores personalizados

In [None]:
# from sklearn.base import BaseEstimator, TransformerMixin

# rooms_ix, bedrooms_ix, population_ix, household_ix = [3,4,5,6]

# class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
#     def __init__(self, add_bedrooms_per_room = True): # no *args or **kwargs
#         self.add_bedrooms_per_room = add_bedrooms_per_room
    
#     def fit(self, X, y=None):
#         return self  # nothing else to do
    
#     def transform(self, X, y=None):
#         rooms_per_household = X[:, rooms_ix] / X[:, household_ix]
#         population_per_household = X[:, population_ix] / X[:, household_ix]
#         if self.add_bedrooms_per_room:
#             bedrooms_per_room = X[:, bedrooms_ix] / X[:, rooms_ix]
#             return np.c_[X, rooms_per_household, population_per_household,bedrooms_per_room]
#         else:
#             return np.c_[X, rooms_per_household, population_per_household]

# attr_adder = CombinedAttributesAdder(add_bedrooms_per_room=False)
# housing_extra_attribs = attr_adder.transform(housing.values)

### 4.4 Escalonamento de características

In [None]:
from sklearn.preprocessing import MinMaxScaler
#from sklearn.preprocessing import StandardScaler

# Rescale the numeric attributes with MinMaxScaler (StandardScaler is the commented-out alternative).
scaler=MinMaxScaler()
#scaler=StandardScaler()

# Only the numeric columns are scaled, so the text column is dropped first.
housing_num=housing.drop('ocean_proximity',axis=1)
# NOTE(review): despite the "_cat_" in its name, this variable holds the scaled *numeric* attributes.
housing_cat_norm=scaler.fit_transform(housing_num)

housing_cat_norm

### 4.5 Pipelines de transformação

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Numeric preprocessing chained in one object: median imputation followed by standardization.
num_pipeline=Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('scaler', StandardScaler()),
    ])

# Fit both steps on the numeric columns and return the transformed values.
housing_num_tr=num_pipeline.fit_transform(housing_num)

housing_num_tr

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that picks a fixed set of columns and returns their values.

    It lets a pipeline whose later steps work on plain arrays start from an
    object indexable by column name (presumably a pandas DataFrame).
    """

    def __init__(self, attribute_names):
        """Store the column names to select.

        Parameters
        ----------
        attribute_names : list
            Column labels used to index the input in ``transform``.
        """
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Do nothing and return ``self``; there is nothing to learn."""
        return self

    def transform(self, X):
        """Return ``X[self.attribute_names].values``, i.e. the selected columns' values."""
        return X[self.attribute_names].values

from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import OneHotEncoder

# Column names handled by each branch of the preprocessing.
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

# Numeric branch: select the numeric columns, impute medians, standardize.
num_pipeline = Pipeline([
        ('selector', DataFrameSelector(num_attribs)),
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

# The keyword that requests dense output was renamed from `sparse` to `sparse_output`
# in scikit-learn 1.2, and the old name was removed in 1.4 (passing it raises TypeError).
# Try the current spelling first and fall back to the old one on older releases.
try:
    cat_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    cat_encoder = OneHotEncoder(sparse=False)

# Categorical branch: select the text column and one-hot encode it.
cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attribs)),
        ('cat_encoder', cat_encoder),
    ])

# Run both branches on the same input and concatenate their outputs column-wise.
full_pipeline = FeatureUnion(transformer_list=[
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ])

housing_prepared=full_pipeline.fit_transform(housing)
housing_prepared

## 5 Selecione e treine o Modelo

In [None]:
# housing_prepared is the array returned by FeatureUnion.fit_transform, not a DataFrame,
# so it has no .info() method (calling it raises AttributeError). Report its shape and
# dtype instead to check that it lines up with the labels.
print("housing_prepared:", housing_prepared.shape, housing_prepared.dtype)
housing_labels.info()

In [None]:
from sklearn.linear_model import LinearRegression

# Train a linear regression model on the prepared training features and their labels.
lin_reg=LinearRegression()

# The estimator method is `fit`; the original `git` was a typo that raised AttributeError.
lin_reg.fit(housing_prepared,housing_labels)