In [256]:
import pandas as pd
import numpy as np

## dataset

In [257]:
df=pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv')

In [258]:
df.shape

(20640, 10)

In [259]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


Features already selected

## data prep

In [260]:
# Missing values
df.isnull().sum()

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
median_house_value      0
ocean_proximity         0
dtype: int64

In [261]:
# Filling missing values
df1=df.copy()
df=df.fillna(0)

In [262]:
df.isnull().sum()

longitude             0
latitude              0
housing_median_age    0
total_rooms           0
total_bedrooms        0
population            0
households            0
median_income         0
median_house_value    0
ocean_proximity       0
dtype: int64

In [263]:
df['rooms_per_household']=df.total_rooms/df.households
df['bedrooms_per_room']=df.total_bedrooms/df.total_rooms
df['population_per_household']=df.population/df.households

In [264]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity,rooms_per_household,bedrooms_per_room,population_per_household
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY,6.984127,0.146591,2.555556
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY,6.238137,0.155797,2.109842
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY,8.288136,0.129516,2.80226
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY,5.817352,0.184458,2.547945
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY,6.281853,0.172096,2.181467


Columns already in standard format, only updating field ocean_proximity

In [265]:
df.dtypes

longitude                   float64
latitude                    float64
housing_median_age          float64
total_rooms                 float64
total_bedrooms              float64
population                  float64
households                  float64
median_income               float64
median_house_value          float64
ocean_proximity              object
rooms_per_household         float64
bedrooms_per_room           float64
population_per_household    float64
dtype: object

In [266]:
df['ocean_proximity']=df['ocean_proximity'].str.lower().str.replace(' ','_')

## Question 1

What is the most frequent observation (mode) for the column ocean_proximity?

In [267]:
df.ocean_proximity.value_counts()

<1h_ocean     9136
inland        6551
near_ocean    2658
near_bay      2290
island           5
Name: ocean_proximity, dtype: int64

In [268]:
df.ocean_proximity.mode()

0    <1h_ocean
dtype: object

## Split the data

In [269]:
from sklearn.model_selection import train_test_split

In [270]:
df_full_train,df_test=train_test_split(df,test_size=0.2,random_state=42)
df_train,df_val=train_test_split(df_full_train,test_size=0.25,random_state=42)

In [271]:
len(df),len(df)*0.2,len(df_train),len(df_test),len(df_val)

(20640, 4128.0, 12384, 4128, 4128)

In [272]:
df_full_train=df_full_train.reset_index(drop=True)
df_train=df_train.reset_index(drop=True)
df_test=df_test.reset_index(drop=True)
df_val=df_val.reset_index(drop=True)

In [273]:
# splitting y
y_full_train=df_full_train['median_house_value'].values
y_train=df_train['median_house_value'].values
y_test=df_test['median_house_value'].values
y_val=df_val['median_house_value'].values

In [274]:
# dropping dependent / leaving full train for data analysis
# del df_full_train['median_house_value']
del df_train['median_house_value']
del df_test['median_house_value']
del df_val['median_house_value']

## Question 2
What are the two features that have the biggest correlation in this dataset?

In [275]:
#avoiding longitude and latitude for this
numericals=['housing_median_age','total_rooms','total_bedrooms','population','households','median_income','rooms_per_household','bedrooms_per_room','population_per_household']


In [276]:
df_full_train[numericals].corrwith(df_full_train.total_bedrooms)

housing_median_age         -0.320624
total_rooms                 0.930489
total_bedrooms              1.000000
population                  0.878932
households                  0.980255
median_income              -0.009141
rooms_per_household        -0.001659
bedrooms_per_room           0.084149
population_per_household   -0.028536
dtype: float64

In [277]:
df_full_train[numericals].corrwith(df_full_train.households)

housing_median_age         -0.302796
total_rooms                 0.920482
total_bedrooms              0.980255
population                  0.907452
households                  1.000000
median_income               0.012776
rooms_per_household        -0.083062
bedrooms_per_room           0.064185
population_per_household   -0.027656
dtype: float64

In [278]:
df_full_train[numericals].corrwith(df_full_train.total_rooms)

housing_median_age         -0.360922
total_rooms                 1.000000
total_bedrooms              0.930489
population                  0.857936
households                  0.920482
median_income               0.198268
rooms_per_household         0.136090
bedrooms_per_room          -0.189316
population_per_household   -0.024991
dtype: float64

In [279]:
df_full_train[numericals].corr()

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,bedrooms_per_room,population_per_household
housing_median_age,1.0,-0.360922,-0.320624,-0.292283,-0.302796,-0.121711,-0.160892,0.135495,0.016245
total_rooms,-0.360922,1.0,0.930489,0.857936,0.920482,0.198268,0.13609,-0.189316,-0.024991
total_bedrooms,-0.320624,0.930489,1.0,0.878932,0.980255,-0.009141,-0.001659,0.084149,-0.028536
population,-0.292283,0.857936,0.878932,1.0,0.907452,0.004122,-0.073733,0.035134,0.07233
households,-0.302796,0.920482,0.980255,0.907452,1.0,0.012776,-0.083062,0.064185,-0.027656
median_income,-0.121711,0.198268,-0.009141,0.004122,0.012776,1.0,0.336013,-0.616669,0.022061
rooms_per_household,-0.160892,0.13609,-0.001659,-0.073733,-0.083062,0.336013,1.0,-0.435169,-0.004922
bedrooms_per_room,0.135495,-0.189316,0.084149,0.035134,0.064185,-0.616669,-0.435169,1.0,0.003938
population_per_household,0.016245,-0.024991,-0.028536,0.07233,-0.027656,0.022061,-0.004922,0.003938,1.0


**Max correlation between=** households and total bedroom
**Max correlation=** 0.980255

## Make median_house_value binary

In [280]:
global_mean_house_value= y_train.mean()
global_mean_house_value

206807.7419250646

In [281]:
# Copying to avoid loss of info
y_full_train1=y_full_train.copy()
y_train1=y_train.copy()
y_test1=y_test.copy()
y_val1=y_val.copy()

In [282]:
y_full_train=(y_full_train>=global_mean_house_value).astype(int)
y_train=(y_train>=global_mean_house_value).astype(int)
y_test=(y_test>=global_mean_house_value).astype(int)
y_val=(y_val>=global_mean_house_value).astype(int)
df_full_train['bin_med_house_val']=(df_full_train.median_house_value>=global_mean_house_value).astype(int)

In [283]:
y_full_train

array([0, 1, 0, ..., 1, 1, 1])

## Question 3
Calculate the mutual information score with the (binarized) price for the categorical variable that we have. Use the training set only.

In [284]:
from sklearn.metrics import mutual_info_score

In [285]:
round(mutual_info_score(df_full_train.bin_med_house_val,df_full_train.ocean_proximity),2)

0.1

## Question 4
Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

In [286]:
from sklearn.feature_extraction import DictVectorizer

In [287]:
train_dict=df_train.to_dict(orient='records')
dv=DictVectorizer(sparse=False)
X_train=dv.fit_transform(train_dict)

In [288]:
from sklearn.linear_model import LogisticRegression

In [289]:
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)

In [290]:
model.fit(X_train,y_train)

LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')

**Calculating accuracy**

In [291]:
val_dict=df_val.to_dict(orient='records')
X_val=dv.transform(val_dict)
y_pred=model.predict(X_val)

In [292]:
total_accuracy=(y_pred==y_val).mean()
print('Total accuracy is ', round(total_accuracy,2))

Total accuracy is  0.84


## Question 5
Which of following feature has the smallest difference (in accuracy)?

    total_rooms
    total_bedrooms
    population
    households


In [293]:

for col in df_train.columns:
    
    df_train1=df_train.copy()
    df_val1=df_val.copy()
    del df_train1[col]
    del df_val1[col]
    train_dict=df_train1.to_dict(orient='records')
    dv=DictVectorizer(sparse=False)
    X_train=dv.fit_transform(train_dict)
    model.fit(X_train,y_train)
    val_dict=df_val1.to_dict(orient='records')
    X_val=dv.transform(val_dict)
    y_pred=model.predict(X_val)
    score=(y_pred==y_val).mean()
    print(col)
    print(round(score,4))
    print(round(abs(score-total_accuracy),4))
    print()


longitude
0.8328
0.0027

latitude
0.8326
0.0029

housing_median_age
0.8299
0.0056

total_rooms
0.8362
0.0007

total_bedrooms
0.8387
0.0031

population
0.8263
0.0092

households
0.8331
0.0024

median_income
0.7863
0.0492

ocean_proximity
0.82
0.0155

rooms_per_household
0.836
0.0005

bedrooms_per_room
0.8365
0.001

population_per_household
0.8365
0.001



**least difference is for :** *total_rooms*

## Question 6

Which of these alphas leads to the best RMSE on the validation set (for Ridge regression)? 

In [294]:
df_full_train,df_test=train_test_split(df,test_size=0.2,random_state=42)
df_train,df_val=train_test_split(df_full_train,test_size=0.25,random_state=42)
df_full_train=df_full_train.reset_index(drop=True)
df_train=df_train.reset_index(drop=True)
df_test=df_test.reset_index(drop=True)
df_val=df_val.reset_index(drop=True)
y_full_train=np.log1p(df_full_train['median_house_value']).values
y_train=np.log1p(df_train['median_house_value']).values
y_test=np.log1p(df_test['median_house_value']).values
y_val=np.log1p(df_val['median_house_value']).values

In [295]:
# del df_full_train['median_house_value']
del df_train['median_house_value']
del df_test['median_house_value']
del df_val['median_house_value']

In [296]:
from sklearn.linear_model import Ridge

In [297]:
a=0.01
model = Ridge(alpha=a, solver="sag", random_state=42)

In [298]:
train_dict=df_train.to_dict(orient='records')
X_train=dv.fit_transform(train_dict)

In [299]:
model.fit(X_train,y_train)

Ridge(alpha=0.01, random_state=42, solver='sag')

In [300]:
val_dict=df_val.to_dict(orient='records')
X_val=dv.transform(val_dict)
y_pred=model.predict(X_val)

In [301]:
from sklearn.metrics import mean_squared_error

In [302]:
mean_squared_error(y_pred,y_val)

0.27464262615435947

In [303]:
for a in [0, 0.01, 0.1, 1, 10]:
    model = Ridge(alpha=a, solver="sag", random_state=42)
    model.fit(X_train,y_train)
    y_pred=model.predict(X_val)
    print(a)
    print('RMSE = ',round(mean_squared_error(y_pred,y_val),4))

0
RMSE =  0.2746
0.01
RMSE =  0.2746
0.1
RMSE =  0.2746
1
RMSE =  0.2746
10
RMSE =  0.2746
