In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
housing_data = pd.read_csv('housing.csv')
housing_data.sample(5)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
9746,-121.62,36.63,52.0,1437.0,298.0,836.0,257.0,3.6286,165500.0,<1H OCEAN
305,-122.18,37.76,43.0,2018.0,408.0,1111.0,367.0,1.8913,91200.0,NEAR BAY
3740,-118.39,34.17,26.0,3345.0,818.0,1599.0,773.0,3.3516,241500.0,<1H OCEAN
15582,-116.74,33.33,17.0,4190.0,946.0,1802.0,673.0,2.4744,158200.0,INLAND
18395,-121.86,37.27,17.0,4393.0,709.0,2292.0,692.0,5.6876,246500.0,<1H OCEAN


In [3]:
housing_data = housing_data.dropna()

In [4]:
housing_data.shape

(20433, 10)

In [5]:
housing_data.loc[housing_data['median_house_value'] == 500001].count()

longitude             958
latitude              958
housing_median_age    958
total_rooms           958
total_bedrooms        958
population            958
households            958
median_income         958
median_house_value    958
ocean_proximity       958
dtype: int64

In [6]:
housing_data = housing_data.drop(housing_data.loc[housing_data['median_house_value'] == 500001].index)

In [7]:
housing_data.shape

(19475, 10)

In [8]:
housing_data = pd.get_dummies(housing_data, columns=['ocean_proximity'])

In [9]:
housing_data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,0,0,0,1,0
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,0,0,0,1,0
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,0,0,0,1,0
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,0,0,0,1,0
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,0,0,0,1,0


In [10]:
X = housing_data.drop('median_house_value', axis=1)
Y = housing_data['median_house_value']

In [11]:
X.columns

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity_<1H OCEAN', 'ocean_proximity_INLAND',
       'ocean_proximity_ISLAND', 'ocean_proximity_NEAR BAY',
       'ocean_proximity_NEAR OCEAN'],
      dtype='object')

In [12]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [13]:
x_train.shape, x_test.shape

((15580, 13), (3895, 13))

In [14]:
from sklearn.linear_model import LinearRegression
linear_model = LinearRegression(normalize=True).fit(x_train, y_train)

In [15]:
print('Training Score: ', linear_model.score(x_train, y_train))

Training Score:  0.610487875288396


In [16]:
predictors = x_train.columns
predictors

Index(['longitude', 'latitude', 'housing_median_age', 'total_rooms',
       'total_bedrooms', 'population', 'households', 'median_income',
       'ocean_proximity_<1H OCEAN', 'ocean_proximity_INLAND',
       'ocean_proximity_ISLAND', 'ocean_proximity_NEAR BAY',
       'ocean_proximity_NEAR OCEAN'],
      dtype='object')

In [17]:
coef = pd.Series(linear_model.coef_, predictors).sort_values()
print(coef)

longitude                    -2.385889e+04
latitude                     -2.197373e+04
population                   -2.993688e+01
total_rooms                  -7.659631e+00
households                    4.432410e+01
total_bedrooms                9.206652e+01
housing_median_age            9.473677e+02
median_income                 3.870589e+04
ocean_proximity_INLAND        3.071274e+17
ocean_proximity_NEAR BAY      3.071274e+17
ocean_proximity_<1H OCEAN     3.071274e+17
ocean_proximity_NEAR OCEAN    3.071274e+17
ocean_proximity_ISLAND        3.071274e+17
dtype: float64


In [18]:
y_pred = linear_model.predict(x_test)

In [19]:
df_pred_actual = pd.DataFrame({'predicted': y_pred, 'actual': y_test})
df_pred_actual.head(10)

Unnamed: 0,predicted,actual
16504,110976.0,102200.0
14581,264064.0,192900.0
10430,249152.0,284100.0
10094,168384.0,185200.0
19237,315712.0,341500.0
5141,110528.0,86400.0
3012,154752.0,116500.0
20369,289216.0,302600.0
2788,120576.0,180400.0
11354,116096.0,160000.0


In [20]:
from sklearn.metrics import r2_score

print("Testing score: ", r2_score(y_test, y_pred))

Testing score:  0.622635767286392


In [21]:
df_pred_actual_sample = df_pred_actual.sample(100)
df_pred_actual_sample = df_pred_actual_sample.reset_index()

In [22]:
df_pred_actual_sample.head()

Unnamed: 0,index,predicted,actual
0,19824,110912.0,110300.0
1,8411,171264.0,149100.0
2,17759,197248.0,162900.0
3,6265,193664.0,163500.0
4,13184,326720.0,426900.0
