# A03 - Projeto de Aprendizado de Máquina

In [16]:
import numpy as np
import pandas as pd
from scipy import stats
from scipy.stats import randint
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler

In [2]:
# Load the housing dataset from a semicolon-delimited, UTF-8 CSV located in the
# working directory, then preview its first rows.
housing = pd.read_csv('housing.csv', sep=';', encoding='utf-8')
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


In [3]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
train_set, test_set = train_test_split(housing, test_size=0.20, random_state=42)

In [4]:
# Separate the training partition into the target column and the predictors.
# The labels are copied so they are independent of `train_set`.
housing_labels = train_set['median_house_value'].copy()
housing = train_set.drop(columns='median_house_value')

In [5]:
# Split the predictors by type: the single categorical column on one side,
# every remaining (numeric) column on the other.
housing_cat = housing['ocean_proximity']
housing_num = housing.drop(columns='ocean_proximity')

In [6]:
# Numeric columns: fill missing values with the column median, then rescale
# each feature with MinMaxScaler.
num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('scaler', MinMaxScaler()),
    ])

# Categorical columns: dense one-hot encoding. The keyword that requests a
# dense array was renamed from `sparse` to `sparse_output` in newer
# scikit-learn releases (and the old name was later removed), so try the new
# name first and fall back to the old one. An unsupported keyword raises
# TypeError at construction time, which makes the fallback safe.
try:
    one_hot_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    one_hot_encoder = OneHotEncoder(sparse=False)

cat_pipeline = Pipeline([
        ('encoder', one_hot_encoder),
    ])

In [7]:
# Column names routed to each sub-pipeline.
cat_attribs = ['ocean_proximity']
num_attribs = housing_num.columns.tolist()

# Apply the numeric pipeline to the numeric columns and the categorical
# pipeline to the categorical column, concatenating the results.
full_pipeline = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_attribs),
    ('cat', cat_pipeline, cat_attribs),
])

# Fit the preprocessing on the training predictors and transform them.
housing_prepared = full_pipeline.fit_transform(housing)

In [8]:
# Display the preprocessed training matrix (scaled numeric columns followed by
# the one-hot encoded category columns).
housing_prepared

array([[0.72908367, 0.01702128, 0.62745098, ..., 0.        , 1.        ,
        0.        ],
       [0.61653386, 0.12978723, 0.94117647, ..., 0.        , 1.        ,
        0.        ],
       [0.38545817, 0.22446809, 0.05882353, ..., 0.        , 1.        ,
        0.        ],
       ...,
       [0.59462151, 0.15744681, 0.68627451, ..., 0.        , 0.        ,
        1.        ],
       [0.23804781, 0.53510638, 0.2745098 , ..., 0.        , 0.        ,
        1.        ],
       [0.19223108, 0.55531915, 1.        , ..., 1.        , 0.        ,
        0.        ]])

In [None]:
# c0=list(housing.drop('ocean_proximity', axis=1))
# c1=housing['ocean_proximity'].value_counts().index.values.tolist()

# housing_prepared_df=pd.DataFrame(housing_prepared,columns=c0+c1)

# housing_prepared_df.head()

In [9]:
# model = LinearRegression()
# model.fit(housing_prepared, housing_labels)

# some_data = housing.iloc[:5]
# some_labels = housing_labels.iloc[:5]
# some_data_prepared = full_pipeline.transform(some_data)

# print("Some labels:", list(some_labels))
# print("Predictions:", model.predict(some_data_prepared))

# housing_predictions = model.predict(housing_prepared)
# lin_mse = mean_squared_error(housing_labels, housing_predictions)
# lin_rmse = np.sqrt(lin_mse)
# lin_rmse

# lin_mae = mean_absolute_error(housing_labels, housing_predictions)
# lin_mae

Some labels: [103000.0, 382100.0, 172600.0, 93400.0, 96500.0]
Predictions: [189840. 290896. 251200. 148048. 165424.]


## Treinar o modelo

In [17]:
# Integer distributions sampled by the randomized search
# (scipy's randint excludes the upper bound).
param_distribs = dict(
    n_estimators=randint(low=1, high=200),
    max_features=randint(low=1, high=8),
)

model = RandomForestRegressor(random_state=42)

# Evaluate 10 sampled hyperparameter combinations with 10-fold cross-validation,
# scoring by negative MSE; the fixed seeds keep the sampled candidates reproducible.
random_search = RandomizedSearchCV(
    model,
    param_distributions=param_distribs,
    n_iter=10,
    cv=10,
    scoring='neg_mean_squared_error',
    random_state=42,
)

random_search.fit(housing_prepared, housing_labels)

RandomizedSearchCV(cv=10, error_score='raise-deprecating',
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators='warn',
                                                   n_jobs=None, oob_score=False,
                                                   random_st...


In [18]:
# Hyperparameter combination selected by the randomized search.
random_search.best_params_

{'max_features': 7, 'n_estimators': 180}

In [20]:
# Print the cross-validated RMSE of every sampled candidate. The search scores
# with negative MSE, so the sign is flipped before taking the square root.
cvres = random_search.cv_results_
for params, mean_score in zip(cvres["params"], cvres["mean_test_score"]):
    print(np.sqrt(-mean_score), params)

48453.862027203824 {'max_features': 7, 'n_estimators': 180}
50332.599778739306 {'max_features': 5, 'n_estimators': 15}
49905.08032776047 {'max_features': 3, 'n_estimators': 72}
49779.659894266115 {'max_features': 5, 'n_estimators': 21}
48553.50190665696 {'max_features': 7, 'n_estimators': 122}
49870.725007715824 {'max_features': 3, 'n_estimators': 75}
49739.46616263201 {'max_features': 3, 'n_estimators': 88}
48688.61206529613 {'max_features': 5, 'n_estimators': 100}
49557.55537746536 {'max_features': 3, 'n_estimators': 150}
64925.92478066109 {'max_features': 5, 'n_estimators': 2}


## Conjunto de teste

In [22]:
# Evaluate the best estimator found by the search on the held-out test set.
final_model = random_search.best_estimator_

# Separate the test target from the test predictors.
y_test = test_set['median_house_value'].copy()
X_test = test_set.drop(columns='median_house_value')

# Reuse the already-fitted preprocessing: transform only, never re-fit on test data.
X_test_prepared = full_pipeline.transform(X_test)
final_predictions = final_model.predict(X_test_prepared)

final_mse = mean_squared_error(y_test, final_predictions)
final_rmse = np.sqrt(final_mse)

In [23]:
# Root mean squared error of the final model on the held-out test set.
final_rmse

48672.42820748309

In [24]:
# 95% confidence interval for the test-set RMSE: build a Student-t interval
# around the mean squared error, then take the square root of both bounds.
# (`stats` is imported in the notebook's top import cell.)
confidence = 0.95
squared_errors = (final_predictions - y_test) ** 2
mean = squared_errors.mean()
m = len(squared_errors)

np.sqrt(stats.t.interval(confidence, m - 1,
                         loc=mean,
                         scale=stats.sem(squared_errors)))

array([46464.73770689, 50784.23658041])

## Inferência para novos dados

In [31]:
# Predict for one hand-written row. Values follow the column order of the
# training predictors, so the frame reuses the columns of `housing`.
new_rows = [[-120.23, 38.88, 40.0, 850.0, 29.0, 25.0, 116.0, 7.3652, 'INLAND']]
new_data = pd.DataFrame(new_rows, columns=list(housing))

# Reuse the fitted preprocessing (transform only) before predicting.
new_data_prepared = full_pipeline.transform(new_data)

# Stored under its own name so the test-set `final_predictions` are not
# overwritten; re-running the confidence-interval cell would otherwise
# silently use this single prediction.
new_predictions = final_model.predict(new_data_prepared)

new_predictions

array([277968.33888889])