In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [3]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv

--2022-09-25 13:54:40--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/housing.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1423529 (1.4M) [text/plain]
Saving to: ‘housing.csv.4’


2022-09-25 13:54:40 (141 MB/s) - ‘housing.csv.4’ saved [1423529/1423529]



# Data preparation

In [36]:
df = pd.read_csv('housing.csv.3')
df = df[['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'median_house_value', 'ocean_proximity']]

In [37]:
df = df.fillna(0)

In [38]:
df['rooms_per_household'] = df['total_rooms'] / df['households']
df['bedrooms_per_room'] = df['total_bedrooms'] / df['total_rooms']
df['population_per_household'] = df['population'] / df['households']

# Question 1

In [39]:
df['ocean_proximity'].value_counts()

# Answer: <1H OCEAN

<1H OCEAN     9136
INLAND        6551
NEAR OCEAN    2658
NEAR BAY      2290
ISLAND           5
Name: ocean_proximity, dtype: int64

# Question 2

In [40]:
df.dtypes

longitude                   float64
latitude                    float64
housing_median_age          float64
total_rooms                 float64
total_bedrooms              float64
population                  float64
households                  float64
median_income               float64
median_house_value          float64
ocean_proximity              object
rooms_per_household         float64
bedrooms_per_room           float64
population_per_household    float64
dtype: object

In [41]:
df_numeric = df.copy()
df_numeric = df.drop(["ocean_proximity"], axis=1)

In [42]:
df_numeric.corr().unstack().sort_values(ascending = False)[:15]

longitude                 longitude                   1.000000
latitude                  latitude                    1.000000
bedrooms_per_room         bedrooms_per_room           1.000000
rooms_per_household       rooms_per_household         1.000000
median_house_value        median_house_value          1.000000
median_income             median_income               1.000000
households                households                  1.000000
population                population                  1.000000
total_bedrooms            total_bedrooms              1.000000
total_rooms               total_rooms                 1.000000
housing_median_age        housing_median_age          1.000000
population_per_household  population_per_household    1.000000
total_bedrooms            households                  0.966507
households                total_bedrooms              0.966507
total_rooms               total_bedrooms              0.920196
dtype: float64

In [43]:
#Answer: total_bedrooms and households

### Make median_house_value binary

In [44]:
mean = df.median_house_value.mean()
df_with_avg = df.copy()

In [45]:
df_with_avg['above_average'] = np.where(df_with_avg['median_house_value']>=mean,1,0)

In [46]:
df_with_avg = df_with_avg.drop('median_house_value', axis=1)

### Split the data

In [47]:
from sklearn.model_selection import train_test_split

In [48]:
df_full_train, df_test = train_test_split(df_with_avg, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)
len(df_train), len(df_val), len(df_test)

(12384, 4128, 4128)

In [49]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [52]:
y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values

del df_train['above_average']
del df_val['above_average']
del df_test['above_average']

# Question 3

In [53]:
from sklearn.metrics import mutual_info_score

In [56]:
mutual_info_score(df_full_train.ocean_proximity, df_full_train.above_average)

0.1019224615118327

In [57]:
# Answer: 0.10 (we have only 1 categorical feature)

# Question 4

In [71]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import accuracy_score

dv = DictVectorizer(sparse=False)

train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

In [72]:
df_val.shape, (len(X_val), len(X_val[0]))     #ensure the shape is correct 

((4128, 12), (4128, 16))

In [73]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

LogisticRegression(max_iter=1000, random_state=42, solver='liblinear')

In [74]:
y_pred = model.predict_proba(X_val)[:, 1]

In [75]:
decision = (y_pred >= 0.5)

In [77]:
accuracy = np.round(accuracy_score(y_val, decision),2)
print(accuracy)

0.84


In [78]:
#Answer: 0.84

# Question 5

In [82]:
features = list(df_train.columns)
features

['longitude',
 'latitude',
 'housing_median_age',
 'total_rooms',
 'total_bedrooms',
 'population',
 'households',
 'median_income',
 'ocean_proximity',
 'rooms_per_household',
 'bedrooms_per_room',
 'population_per_household']

In [86]:
for f in features:
    subset = features.copy()
    subset.remove(f)
    
    train_dict = df_train[subset].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    dv.fit(train_dict)

    X_train = dv.transform(train_dict)

    model = LogisticRegression(solver="liblinear", C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    val_dict = df_val[subset].to_dict(orient='records')
    X_val = dv.transform(val_dict)

    y_pred = model.predict(X_val)

    score = accuracy_score(y_val, y_pred)
    print(f, accuracy - score, score)

longitude 0.007151162790697607 0.8328488372093024
latitude 0.005697674418604581 0.8343023255813954
housing_median_age 0.008362403100775184 0.8316375968992248
total_rooms 0.0018217054263565124 0.8381782945736435
total_bedrooms 0.0042441860465115555 0.8357558139534884
population 0.013691860465116279 0.8263081395348837
households 0.006666666666666599 0.8333333333333334
median_income 0.053420542635658874 0.7865794573643411
ocean_proximity 0.019505813953488382 0.8204941860465116
rooms_per_household 0.0042441860465115555 0.8357558139534884
bedrooms_per_room 0.0032751937984495383 0.8367248062015504
population_per_household 0.003759689922480547 0.8362403100775194


In [87]:
#Answer: total rooms

# Question 6

In [90]:
df['median_house_value'] = np.log1p(df['median_house_value'])

df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
y_train = df_train.median_house_value.values
y_val = df_val.median_house_value.values
y_test = df_test.median_house_value.values
del df_train['median_house_value']
del df_val['median_house_value']
del df_test['median_house_value']

In [91]:
train_dict = df_train.to_dict(orient='records')
dv = DictVectorizer(sparse=False)
dv.fit(train_dict)
X_train = dv.transform(train_dict)
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

In [92]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [95]:
for i in [0, 0.01, 0.1, 1, 10]:
    model = Ridge(alpha=i, solver="sag", random_state=42)
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    
    score = np.sqrt(mean_squared_error(y_val, y_pred))
    
    print(i, round(score, 3))
    

0 0.011
0.01 0.011
0.1 0.011
1 0.011
10 0.011


In [None]:
#Answer: 0