In [123]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVR

In [124]:
# Let's load the data and OHE our categorical column (ocean_proximity)
raw_data = pd.read_csv("../datasets/housing.csv")
raw_data = pd.get_dummies(raw_data, prefix="op", columns=["ocean_proximity"], drop_first=True)
raw_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,op_INLAND,op_ISLAND,op_NEAR BAY,op_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0,0,1,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0,0,1,0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0,0,1,0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0,0,1,0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0,0,1,0


In [125]:
# Dropping some null values for the total_bedroom column, we have enough records to keep going
raw_data = raw_data.dropna()
raw_data.count()

longitude             20433
latitude              20433
housing_median_age    20433
total_rooms           20433
total_bedrooms        20433
population            20433
households            20433
median_income         20433
median_house_value    20433
op_INLAND             20433
op_ISLAND             20433
op_NEAR BAY           20433
op_NEAR OCEAN         20433
dtype: int64

In [136]:
# Let's Scale our columns, no need to worry with the OHE columns as the values should always be either 0 (the min) or 1 (max) and should remain unchanged
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(raw_data)
final_data = pd.DataFrame(scaled_data, columns=raw_data.columns)
final_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,op_INLAND,op_ISLAND,op_NEAR BAY,op_NEAR OCEAN
0,0.211155,0.567481,0.784314,0.022331,0.019863,0.008941,0.020556,0.539668,0.902266,0.0,0.0,1.0,0.0
1,0.212151,0.565356,0.392157,0.180503,0.171477,0.06721,0.186976,0.538027,0.708247,0.0,0.0,1.0,0.0
2,0.210159,0.564293,1.0,0.03726,0.02933,0.013818,0.028943,0.466028,0.695051,0.0,0.0,1.0,0.0
3,0.209163,0.564293,1.0,0.032352,0.036313,0.015555,0.035849,0.354699,0.672783,0.0,0.0,1.0,0.0
4,0.209163,0.564293,1.0,0.04133,0.043296,0.015752,0.042427,0.230776,0.674638,0.0,0.0,1.0,0.0


In [139]:
# Let's keep 20% of our data as if it was from the future
future_data = final_data.sample(frac=.2)
given_data = final_data.drop(future_data.index, axis=0)

In [145]:
# First experiment: we will just train our model on the given data and the see how it performs on future data
naive_y = given_data.median_house_value
naive_x = given_data.drop('median_house_value', axis=1)

naive_model = SVR()
naive_model.fit(naive_x, naive_y)
# Predict values for the future data
naive_predictions = naive_model.predict(future_data.drop('median_house_value', axis=1))

#Compute the mse of the predictions
mse = sum((naive_predictions - future_data.median_house_value)**2)/len(naive_predictions)
print(mse)

0.014860083919331756


In [143]:
# First experiment: we will just train our model on the given data and the see how it performs on future data
naive_y = given_data.median_house_value
naive_x = given_data.drop('median_house_value', axis=1)

naive_model = SVR()
naive_model.fit(naive_x, naive_y)
# Predict values for the future data
naive_predictions = naive_model.predict(future_data.drop('median_house_value', axis=1))

#Compute the mse of the predictions
mse = sum((naive_predictions - future_data.median_house_value)**2)/len(naive_predictions)
print(mse)

0.014860083919331756


In [151]:
# Now we are going to use a Stratified KFold to try and mitigate over fitting
k_fold_y = given_data.median_house_value
k_fold_x = given_data.drop('median_house_value', axis=1)

folds = StratifiedKFold(n_splits=2)
print(folds.get_n_splits(k_fold_x, k_fold_x))
for k in folds.split(k_fold_x, k_fold_y):
    print(k)

2


ValueError: Supported target types are: ('binary', 'multiclass'). Got 'continuous' instead.