##### Bengisu Berkel-070180049
##### Şükran Han-070180128
##### Taha Galata-070190002
##### Teoman İnan-070180033

### Importing libraries and train data

In [52]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer

In [53]:
train=pd.read_csv('train.csv',index_col=0)

In [54]:
#encoding categorical data
train=pd.get_dummies(train,drop_first=True)

### Handling missing data

In [55]:
train.isna().sum()

longitude                       0
latitude                        0
housing_median_age              0
total_rooms                     0
total_bedrooms                164
population                      0
households                      0
median_income                   0
median_house_value              0
ocean_proximity_INLAND          0
ocean_proximity_ISLAND          0
ocean_proximity_NEAR BAY        0
ocean_proximity_NEAR OCEAN      0
dtype: int64

there was missing data on total_bedrooms, we will predict them using KNNImputer

In [56]:
imputer = KNNImputer(n_neighbors=5)
knn_array_train=imputer.fit_transform(train)
train=pd.DataFrame(knn_array_train,columns=train.columns)

In [57]:
train.isna().sum()

longitude                     0
latitude                      0
housing_median_age            0
total_rooms                   0
total_bedrooms                0
population                    0
households                    0
median_income                 0
median_house_value            0
ocean_proximity_INLAND        0
ocean_proximity_ISLAND        0
ocean_proximity_NEAR BAY      0
ocean_proximity_NEAR OCEAN    0
dtype: int64

We will split data

In [58]:
y_train=train["median_house_value"]

In [59]:
x_train=train.drop('median_house_value',axis=1)

### Gridsearch

In [61]:
from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV


# Create a pipeline
pipe = Pipeline([('scaler', preprocessing.StandardScaler()),('Regressor',LinearRegression())])

# Create space of candidate learning algorithms and their hyperparameters
search_space = [{'Regressor': [LinearRegression()],'Regressor__fit_intercept': [True, False]},
                {'Regressor': [Ridge()],'Regressor__alpha': [0.1, 1.0, 10.0], 'Regressor__fit_intercept': [True, False]},
                {'Regressor': [Lasso()],'Regressor__alpha': [0.1, 1.0, 10.0], 'Regressor__fit_intercept': [True, False]},
                {'Regressor': [AdaBoostRegressor()], 'Regressor__n_estimators': [50, 100, 200], 'Regressor__learning_rate': [0.1, 0.5, 1.0]},
                {'Regressor': [DecisionTreeRegressor()], 'Regressor__max_depth': [2, 4, 8, 16], 'Regressor__min_samples_split': [2, 4, 8, 16], 'Regressor__min_samples_leaf': [1, 2, 4]},
                {'Regressor': [RandomForestRegressor()],'Regressor__n_estimators': [50, 100, 200], 'Regressor__max_depth': [2, 4, 8, 16], 'Regressor__min_samples_split': [2, 4, 8, 16], 'Regressor__min_samples_leaf': [1, 2, 4]},
                {'Regressor': [KNeighborsRegressor()], 'Regressor__n_neighbors': [2, 4, 8], 'Regressor__weights': ['uniform', 'distance']},
                {'Regressor': [GradientBoostingRegressor()], 'Regressor__n_estimators': [50, 100, 200], 'Regressor__max_depth': [2, 4, 8], 'Regressor__learning_rate': [0.1, 0.5, 1.0]},
               ]
# Create grid search 
clf = GridSearchCV(pipe, search_space, cv=5, verbose=1, scoring='neg_mean_squared_error')
# Fit grid search
best_model = clf.fit(x_train, y_train.values)
# View best model
best_model.best_estimator_.get_params()['Regressor']

Fitting 5 folds for each of 248 candidates, totalling 1240 fits


GradientBoostingRegressor(max_depth=8, n_estimators=200)

Best regressor was GradientBoostingRegressor(max_depth=8, n_estimators=200) but all hyperparameters are highest values in lists so we need to do gradeint search with higher hyperparameters

In [65]:
pipe = Pipeline([('scaler', preprocessing.StandardScaler()),('Regressor',GradientBoostingRegressor())])

search_space=[{'Regressor': [GradientBoostingRegressor()], 'Regressor__n_estimators': [150, 200,250,300], 'Regressor__max_depth': [8,12,16,20], 'Regressor__learning_rate': [0.1]},
               ]

clf = GridSearchCV(pipe, search_space, cv=5, verbose=1, scoring='neg_mean_squared_error')
# Fit grid search
best_model = clf.fit(x_train, y_train.values)
# View best model
best_model.best_estimator_.get_params()['Regressor']


Fitting 5 folds for each of 16 candidates, totalling 80 fits


GradientBoostingRegressor(max_depth=8, n_estimators=300)

The highest scored model was GradientBoostingRegressor with max_depth=8 and n_estimators=300

### Training best model

In [82]:
from sklearn.model_selection import train_test_split

In [83]:
scaler = preprocessing.StandardScaler()
scaler.fit(x_train)
x_trainStandard = scaler.transform(x_train)

GradientBoosting = GradientBoostingRegressor(n_estimators=300, max_depth=8, learning_rate=0.1)
GradientBoosting.fit(x_trainStandard,y_train)

GradientBoostingRegressor(max_depth=8, n_estimators=300)

### Calculate cross validation score

In [95]:
from sklearn.model_selection import cross_val_score
# perform cross validation
scores = cross_val_score(GradientBoosting, x_train, y_train, cv=5,scoring="r2")
print("Cross validation scores: {}".format(scores))
print("Mean score: {:.2f}".format(scores.mean()))

Cross validation scores: [0.83444507 0.82455688 0.83208203 0.84571025 0.83933926]
Mean score: 0.84


Cross validation scores look good

### Predicting median_house_value for Test Data

Using model that has bigger data size

In [115]:
test=pd.read_csv("test.csv",index_col=0)

In [116]:
x_test=pd.get_dummies(test,drop_first=True)

In [118]:
#ocean_proximity_ISLAND is missing in dummies because there is no island value in test data
island=pd.Series([0]*len(x_test))
x_test.insert(9,column="ocean_proximity_ISLAND", value=island)

In [125]:
x_test.isna().sum()

longitude                      0
latitude                       0
housing_median_age             0
total_rooms                    0
total_bedrooms                43
population                     0
households                     0
median_income                  0
ocean_proximity_INLAND         0
ocean_proximity_ISLAND         0
ocean_proximity_NEAR BAY       0
ocean_proximity_NEAR OCEAN     0
dtype: int64

There is missing values in total_bedrooms, we will predict them with knn

In [127]:
imputer = KNNImputer(n_neighbors=5)
knn_array_train=imputer.fit_transform(x_test)
x_test=pd.DataFrame(knn_array_train,columns=x_test.columns)

In [128]:
x_testStandard=scaler.transform(x_test)

In [129]:
y_test_pred = GradientBoosting.predict(x_testStandard)

In [131]:
submission=pd.Series(y_test_pred,name="median_house_value")

In [133]:
submission.to_excel("y_pred.xlsx")