## Importing Dependencies

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score, mean_squared_error
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression, Ridge
import pandas as pd
import numpy as np

## Data import

In [2]:
# Load the housing dataset from a CSV file located next to the notebook.
df = pd.read_csv('housing.csv')

In [3]:
# Columns of interest in the housing dataset, one per line for easy diffing.
features = [
    'latitude',
    'longitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'median_house_value',
    'ocean_proximity',
]

# Notebook-level seed value.
custom_seed = 42

In [4]:
# Display the loaded frame for a quick visual check of its columns and values.
df

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


## Data Preparation

In [5]:
# Derived feature: ratio of total rooms to households, computed row by row.
df['rooms_per_household'] = df['total_rooms'].div(df['households'])

In [6]:
# Derived feature: ratio of total bedrooms to total rooms, computed row by row.
df['bedrooms_per_room'] = df['total_bedrooms'].div(df['total_rooms'])

In [7]:
# Derived feature: ratio of population to households, computed row by row.
df['population_per_household'] = df['population'].div(df['households'])

## Question 1

In [8]:
# Most frequent value(s) of `ocean_proximity`; `mode()` returns a Series,
# which is what gets printed below.
ocean_proximity_mode = df['ocean_proximity'].mode()
print(ocean_proximity_mode)

0    <1H OCEAN
Name: ocean_proximity, dtype: object


## Setting up Validation Framework

In [9]:
def validation_framework(data=df, val=0.25, test=0.2, seed=custom_seed, target='median_house_value'):
    """Split `data` into train / validation / test features and targets.

    The test share is carved out of `data` first; the validation share is then
    taken from what is left. With the defaults (test=0.2, then val=0.25 of the
    remaining 80%) this gives a 60% / 20% / 20% split.

    Parameters:
        data: DataFrame to split. The default is the notebook-level `df`,
            bound when this function is defined.
        val: fraction of the non-test rows used for validation.
        test: fraction of `data` held out for testing.
        seed: `random_state` passed to both `train_test_split` calls, so the
            split is reproducible. Defaults to the notebook-level `custom_seed`.
        target: name of the column that is separated out as the target.

    Returns:
        A 6-tuple `(df_train, df_val, df_test, y_train, y_val, y_test)`.
        The frames have a fresh 0..n-1 index and no `target` column; the
        `y_*` entries are the target values as arrays.
    """
    df_full_train, df_test = train_test_split(data, test_size=test, random_state=seed)
    df_train, df_val = train_test_split(df_full_train, test_size=val, random_state=seed)

    # Drop the shuffled original index so each split is indexed 0..n-1.
    frames = [part.reset_index(drop=True) for part in (df_train, df_val, df_test)]

    # Pull the target out before removing it from the feature frames.
    targets = [part[target].values for part in frames]
    frames = [part.drop(columns=target) for part in frames]

    return (*frames, *targets)


# Split the full frame with the function's default fractions
# (test=0.2 of all rows, then val=0.25 of the remainder).
df_train, df_val, df_test, y_train, y_val, y_test = validation_framework()

## Q2 - Create the correlation matrix for the numerical features of your train dataset.
### In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.

In [10]:
def get_coefs(cor_matrix):
    """Flatten a square correlation matrix into a ranked table of column pairs.

    Parameters:
        cor_matrix: square DataFrame of pairwise correlation coefficients
            (same labels on rows and columns).

    Returns:
        DataFrame with columns `level_0`, `level_1` (the two column names) and
        `coefficient`, sorted by the absolute value of the coefficient in
        descending order. Each unordered pair appears once.
    """
    coeff_column = "coefficient"

    # Keep only the strict upper triangle (k=1): drops the diagonal and the
    # mirrored lower half so every pair is listed exactly once.
    upper_triangle = np.triu(np.ones(cor_matrix.shape), k=1).astype(bool)

    return cor_matrix \
        .where(upper_triangle) \
        .stack() \
        .dropna() \
        .reset_index() \
        .rename(columns={0: coeff_column}) \
        .sort_values(ascending=False, by=coeff_column, key=abs)


# Pearson correlation is only defined for numeric columns; `ocean_proximity`
# holds strings, so restrict the frame to numeric dtypes explicitly instead of
# relying on pandas to skip it.
cor_matrix = df_train.select_dtypes(include='number').corr(method='pearson')


In [11]:
get_coefs(cor_matrix).head()  # Strongest pair by |coefficient|: total_bedrooms & households

Unnamed: 0,level_0,level_1,coefficient
35,total_bedrooms,households,0.979399
27,total_rooms,total_bedrooms,0.931546
0,longitude,latitude,-0.925005
29,total_rooms,households,0.921441
40,population,households,0.906841


In [12]:
# Mean of the target over the whole frame `df` (not only the training split);
# used below as the threshold for the binary "above average" label.
median_house_value_mean = df['median_house_value'].mean()

In [13]:
# Above Average
y_train = (y_train > median_house_value_mean).astype(int)
y_val = (y_val > median_house_value_mean).astype(int)
y_test = (y_test > median_house_value_mean).astype(int)

## Q3 - Calculate the mutual information score with the (binarized) price for the categorical variable that we have.
### Use the training set only.
### What is the value of mutual information?
### Round it to 2 decimal digits using round(score, 2)

In [14]:
# Mutual information between the categorical column and the binarized target,
# computed on the training split only.
mutual_info_score(df_train['ocean_proximity'], y_train)

0.10138385763624205

### Defining a function to one hot encode training data

In [15]:
def one_hot_encode(training_data, vectorizer=None):
    """Turn a DataFrame into a dense feature matrix with a DictVectorizer.

    Parameters:
        training_data: DataFrame whose rows are converted to dicts and encoded.
        vectorizer: optional DictVectorizer to share between datasets.
            - None (default): a new vectorizer is created and fitted on
              `training_data`, as before.
            - not yet fitted: it is fitted on `training_data` and then used to
              transform it.
            - already fitted: it is only used to transform, so the output has
              the same columns, in the same order, as the data it was fitted on.

    Returns:
        The encoded feature matrix for `training_data`.
    """
    records = training_data.to_dict(orient='records')

    if vectorizer is None:
        return DictVectorizer(sparse=False).fit_transform(records)

    # A fitted DictVectorizer exposes `vocabulary_`; its absence means the
    # caller handed us a fresh instance that still has to learn the columns.
    if not hasattr(vectorizer, 'vocabulary_'):
        return vectorizer.fit_transform(records)

    return vectorizer.transform(records)


# Fit the encoder on the training split only, then reuse it for the other
# splits so all three matrices share one column layout.
train_vectorizer = DictVectorizer(sparse=False)
X_train = one_hot_encode(df_train, train_vectorizer)
X_test = one_hot_encode(df_test, train_vectorizer)
X_val = one_hot_encode(df_val, train_vectorizer)

## Question 4
### Now let's train a logistic regression
### Remember that we have one categorical variable ocean_proximity in the data. Include it using one-hot encoding.
### Fit the model on the training dataset.
### To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
### model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
### Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

In [16]:
# Classifier for Question 4, built with the parameters prescribed above and
# fitted on the one-hot encoded training matrix. `fit` returns the estimator
# itself, so the chained call leaves the fitted model in `model`.
model = LogisticRegression(
    solver="liblinear",
    C=1.0,
    max_iter=1000,
    random_state=42,
).fit(X_train, y_train)


def accuracy(model, X_validation, y_validation):
    """Return the share of rows where the model's prediction equals the label.

    Parameters:
        model: object exposing `predict(X)`.
        X_validation: feature matrix passed to `model.predict`.
        y_validation: true labels, compared element-wise with the predictions.
    """
    predictions = model.predict(X_validation)
    is_correct = y_validation == predictions
    return is_correct.mean()


# Validation accuracy of the model trained on every feature; the feature
# elimination cell further down compares against this value.
global_accuracy = accuracy(model, X_val, y_val)
print(round(global_accuracy, 2))

0.84


## Question 5: Feature selection - the smallest difference in accuracy
### Train a model with all these features (using the same parameters as in Q4).
### Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
### For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
### Which of the following features has the smallest difference?

* total_bedrooms

In [17]:
# Candidate features to leave out, one at a time.
diff_features = ['total_rooms', 'total_bedrooms', 'population', 'households']
# feature name -> |accuracy without it - accuracy with all features|
diffs = dict()

for feature in diff_features:
    # Dedicated name so the notebook-level `features` list is not overwritten.
    kept_columns = [column for column in df_train.columns if column != feature]

    # Data Preparation: fit the encoder on the training rows only and reuse it
    # for validation, so both matrices have identical columns in the same order.
    subset_vectorizer = DictVectorizer(sparse=False)
    x_train = subset_vectorizer.fit_transform(df_train[kept_columns].to_dict(orient='records'))
    x_val = subset_vectorizer.transform(df_val[kept_columns].to_dict(orient='records'))

    # Model Training: same parameters as the full model; a separate name keeps
    # the Question 4 `model` intact.
    subset_model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
    subset_model.fit(x_train, y_train)

    # Data Validation
    acc = accuracy(subset_model, x_val, y_val)
    diffs[feature] = abs(acc - global_accuracy)

# Compute the minimum once, then keep every feature that ties for it.
smallest_diff = min(diffs.values())
min_diff = {name: diff for name, diff in diffs.items() if diff == smallest_diff}
print(min_diff)

{'total_bedrooms': 0.0004844961240310086}


## Question 6 - For this question, we'll see how to use a linear regression model from Scikit-Learn
### We'll need to use the original column 'median_house_value'. Apply the logarithmic transformation to this column.
### Fit the Ridge regression model (model = Ridge(alpha=a, solver="sag", random_state=42)) on the training data.
### This model has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10]
### Which of these alphas leads to the best RMSE on the validation set? Round your RMSE scores to 3 decimal digits.
### If there are multiple options, select the smallest alpha.

* 0

In [18]:
# Re-split to get the original continuous target back (earlier cells replaced
# y_* with binary labels), then model its natural logarithm.
df_train, df_val, df_test, y_train, y_val, y_test = validation_framework()
y_train = np.log(y_train)
y_val = np.log(y_val)
y_test = np.log(y_test)

# The encoding does not depend on alpha, so do it once outside the loop.
# Fit on the training rows only and reuse the fitted encoder for validation,
# so both matrices share the same columns in the same order.
ridge_vectorizer = DictVectorizer(sparse=False)
x_train = ridge_vectorizer.fit_transform(df_train.to_dict(orient='records'))
x_val = ridge_vectorizer.transform(df_val.to_dict(orient='records'))

alphas = [0, 0.01, 0.1, 1, 10]
# alpha -> RMSE on the validation split
root_mean_squared_errors = dict()

for a in alphas:
    model = Ridge(alpha=a, solver="sag", random_state=42)
    model.fit(x_train, y_train)
    y_pred = model.predict(x_val)
    root_mean_squared_errors[a] = np.sqrt(mean_squared_error(y_val, y_pred))

# NOTE(review): the recorded output shows the same RMSE for every alpha;
# presumably the solver is not converging on the unscaled features - verify.
print({alpha: round(rmse, 5) for alpha, rmse in root_mean_squared_errors.items()})

{0: 0.52407, 0.01: 0.52407, 0.1: 0.52407, 1: 0.52407, 10: 0.52407}
