In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [2]:
# Notebook-wide plotting defaults: 11x5 figures rendered at 100 dpi.
plt.rcParams.update({
    'figure.figsize': (11, 5),
    'figure.dpi': 100,
})

In [3]:
# Load the raw housing table; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='housing.csv')

In [4]:
# Work on an independent copy so the raw frame `data` stays untouched.
data1 = data.copy(deep=True)

In [5]:
# Remove the coordinate columns. The result is derived from the raw frame
# rather than dropped in place, so re-running this cell gives the same
# `data1` instead of raising KeyError on the already-removed columns.
data1 = data.drop(columns=['longitude', 'latitude'])

In [6]:
# Preview the frame after the longitude/latitude columns were removed.
data1

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY
...,...,...,...,...,...,...,...,...
20635,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,INLAND
20636,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,INLAND
20637,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,INLAND
20638,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,INLAND


In [7]:
# Separate the regression target from the predictor columns.
target = data1['median_house_value']
features = data1.drop(columns="median_house_value")

In [8]:
# Column roles are positional: the last column is treated as the single
# categorical attribute, everything before it as numeric.
num_attr, cat_attr = features.columns[:-1], features.columns[-1]

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.impute  import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [10]:
# Numeric preprocessing: fill missing values with the column mean
# (the SimpleImputer default, spelled out here).
num_pipe = Pipeline(steps=[
    ('simple', SimpleImputer(strategy='mean')),
])
# Encoder for the categorical column; fitted separately in the manual approach.
cat = OneHotEncoder()

In [11]:
# Manual approach: transform the numeric and categorical parts separately.
# `cat_attr` is still a plain column name here, so it is wrapped in a list
# to hand the encoder a one-column frame; the sparse result is densified.
f1 = num_pipe.fit_transform(features.loc[:, num_attr])
f2 = cat.fit_transform(features.loc[:, [cat_attr]]).toarray()

In [12]:
# Glue the numeric block and the one-hot block side by side.
final = np.concatenate((f1, f2), axis=1)
# Matching header: numeric column names followed by the flattened category labels.
columns = np.concatenate((features.columns[:-1], np.ravel(cat.categories_)))

In [13]:
# Wrap the stacked array in a labelled frame for inspection.
df = pd.DataFrame(data=final, columns=columns)

In [14]:
# First five rows of the manually assembled feature table.
df.head(5)

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,41.0,880.0,129.0,322.0,126.0,8.3252,0.0,0.0,0.0,1.0,0.0
1,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,0.0,0.0,0.0,1.0,0.0
2,52.0,1467.0,190.0,496.0,177.0,7.2574,0.0,0.0,0.0,1.0,0.0
3,52.0,1274.0,235.0,558.0,219.0,5.6431,0.0,0.0,0.0,1.0,0.0
4,52.0,1627.0,280.0,565.0,259.0,3.8462,0.0,0.0,0.0,1.0,0.0


In [15]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error

In [16]:
# Hold out 25% of the rows (the train_test_split default, made explicit) with a fixed seed.
# NOTE(review): the imputer and encoder were fitted on all rows before this
# split, so test rows contributed to the preprocessing statistics — consider
# splitting first and fitting on the training rows only.
X_train, X_test, Y_train, Y_test = train_test_split(
    final,
    target,
    test_size=0.25,
    random_state=100,
)

In [17]:
# Baseline: linear regression on the manually stacked features; the cell
# displays the mean absolute error on the held-out rows.
model = LinearRegression().fit(X_train, Y_train)
pred = model.predict(X_test)
mean_absolute_error(y_true=Y_test, y_pred=pred)

49627.290861151065

In [18]:
# R^2 of the baseline predictions on the held-out rows.
r2_score(y_true=Y_test, y_pred=pred)

0.6520805114320036

### ColumnTransformer

In [19]:
from sklearn.compose import ColumnTransformer

In [20]:
# Show the numeric column names that will be routed to the numeric pipeline.
num_attr

Index(['housing_median_age', 'total_rooms', 'total_bedrooms', 'population',
       'households', 'median_income'],
      dtype='object')

In [21]:
# ColumnTransformer should receive the categorical column as a list so the
# encoder gets a 2-D, one-column selection. Only wrap while `cat_attr` is still
# a bare column name: this keeps the cell idempotent, whereas an unconditional
# `[cat_attr]` would nest the list one level deeper on every re-run.
if isinstance(cat_attr, str):
    cat_attr = [cat_attr]

In [22]:
# Same preprocessing as the manual approach, expressed as a single transformer:
# numeric columns go through `num_pipe`, the categorical column is one-hot encoded.
pipe = ColumnTransformer(transformers=[
    ('num', num_pipe, num_attr),
    ('cat', OneHotEncoder(), cat_attr),
])

In [23]:
# Fit the combined transformer and produce the full feature matrix in one call.
final_feature = pipe.fit_transform(X=features)

In [24]:
# Same seed and 25% hold-out (the default, made explicit) as the manual run,
# so the two approaches are compared on the same rows.
X_train, X_test, Y_train, Y_test = train_test_split(
    final_feature,
    target,
    test_size=0.25,
    random_state=100,
)

In [25]:
# Refit the linear model on the ColumnTransformer output; the cell displays the held-out MAE.
model = LinearRegression().fit(X_train, Y_train)
pred = model.predict(X_test)
mean_absolute_error(y_true=Y_test, y_pred=pred)

49627.290861151065

In [26]:
# R^2 for the ColumnTransformer-based run.
r2_score(y_true=Y_test, y_pred=pred)

0.6520805114320036

### Add features

In [27]:
# Start the feature-engineering variant from an independent copy of `data1`.
data2 = data1.copy(deep=True)

In [28]:
# Set the categorical column aside for now; it is re-attached after the ratio
# features are added. The result is derived from `data1` rather than dropped
# in place, so re-running this cell does not raise KeyError on the
# already-removed column.
data2 = data1.drop(columns='ocean_proximity')

In [29]:
# Inspect the numeric-only frame before the ratio features are added.
data2

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0
1,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0
2,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0
3,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0
4,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0
...,...,...,...,...,...,...,...
20635,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0
20636,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0
20637,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0
20638,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0


In [30]:
# Add three ratio features computed from the raw frame's columns.
data2 = data2.assign(
    rooms_per_household=data['total_rooms'] / data['households'],
    population_per_household=data['population'] / data['households'],
    bedrooms_per_rooms=data['total_bedrooms'] / data['total_rooms'],
)

In [31]:
# Re-attach the categorical column at the end; later cells take the
# last feature column as the categorical attribute.
data2 = data2.assign(ocean_proximity=data['ocean_proximity'])

In [32]:
# Inspect the frame with the three ratio columns and the categorical column re-attached.
data2

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,rooms_per_household,population_per_household,bedrooms_per_rooms,ocean_proximity
0,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,6.984127,2.555556,0.146591,NEAR BAY
1,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,6.238137,2.109842,0.155797,NEAR BAY
2,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,8.288136,2.802260,0.129516,NEAR BAY
3,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,5.817352,2.547945,0.184458,NEAR BAY
4,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,6.281853,2.181467,0.172096,NEAR BAY
...,...,...,...,...,...,...,...,...,...,...,...
20635,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,5.045455,2.560606,0.224625,INLAND
20636,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,6.114035,3.122807,0.215208,INLAND
20637,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,5.205543,2.325635,0.215173,INLAND
20638,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,5.329513,2.123209,0.219892,INLAND


In [33]:
# Rebuild predictors and target from the extended frame.
# NOTE(review): this rebinds `features`/`target` used by the earlier sections,
# so re-running those earlier cells afterwards would pick up the new columns.
target = data2['median_house_value']
features = data2.drop(columns="median_house_value")

In [34]:
# Positional split again: the last column is categorical, the rest numeric.
num_attr1, cat_attr1 = features.columns[:-1], features.columns[-1]

In [35]:
# Numeric preprocessing for the extended feature set: mean imputation only
# (the SimpleImputer default, spelled out here).
num_pipe1 = Pipeline(steps=[
    ('simple', SimpleImputer(strategy='mean')),
])
# Separate encoder instance for the manual approach on the extended features.
cat1 = OneHotEncoder()

In [36]:
# Impute the numeric columns, including the new ratio features.
f3 = num_pipe1.fit_transform(features.loc[:, num_attr1])

In [37]:
# One-hot encode the categorical column as a one-column frame, then densify.
f4 = cat1.fit_transform(features.loc[:, [cat_attr1]]).toarray()

In [38]:
# Numeric block and one-hot block side by side.
final1 = np.concatenate((f3, f4), axis=1)

In [39]:
# Header for the stacked array: numeric names followed by the flattened category labels.
columns = np.concatenate((features.columns[:-1], np.ravel(cat1.categories_)))

In [40]:
# Labelled view of the extended feature matrix.
df1 = pd.DataFrame(data=final1, columns=columns)

In [41]:
# Display the assembled feature table: numeric columns followed by the one-hot columns.
df1

Unnamed: 0,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,rooms_per_household,population_per_household,bedrooms_per_rooms,<1H OCEAN,INLAND,ISLAND,NEAR BAY,NEAR OCEAN
0,41.0,880.0,129.0,322.0,126.0,8.3252,6.984127,2.555556,0.146591,0.0,0.0,0.0,1.0,0.0
1,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,6.238137,2.109842,0.155797,0.0,0.0,0.0,1.0,0.0
2,52.0,1467.0,190.0,496.0,177.0,7.2574,8.288136,2.802260,0.129516,0.0,0.0,0.0,1.0,0.0
3,52.0,1274.0,235.0,558.0,219.0,5.6431,5.817352,2.547945,0.184458,0.0,0.0,0.0,1.0,0.0
4,52.0,1627.0,280.0,565.0,259.0,3.8462,6.281853,2.181467,0.172096,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,25.0,1665.0,374.0,845.0,330.0,1.5603,5.045455,2.560606,0.224625,0.0,1.0,0.0,0.0,0.0
20636,18.0,697.0,150.0,356.0,114.0,2.5568,6.114035,3.122807,0.215208,0.0,1.0,0.0,0.0,0.0
20637,17.0,2254.0,485.0,1007.0,433.0,1.7000,5.205543,2.325635,0.215173,0.0,1.0,0.0,0.0,0.0
20638,18.0,1860.0,409.0,741.0,349.0,1.8672,5.329513,2.123209,0.219892,0.0,1.0,0.0,0.0,0.0


In [42]:
# Same seed and 25% hold-out (the default, made explicit) as the earlier runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    final1,
    target,
    test_size=0.25,
    random_state=100,
)

In [43]:
# Fit the linear model on the extended features; the fitted estimator is the cell's output.
model = LinearRegression()
model.fit(X=X_train, y=Y_train)

LinearRegression()

In [44]:
# Predict house values for the held-out rows.
pred = model.predict(X=X_test)

In [45]:
# Held-out MAE with the ratio features included.
mean_absolute_error(y_true=Y_test, y_pred=pred)

48853.837528888966

In [46]:
# Held-out R^2 with the ratio features included.
r2_score(y_true=Y_test, y_pred=pred)

0.6589711609956458

In [47]:
# ColumnTransformer should receive the categorical column as a list so the
# encoder gets a 2-D, one-column selection. Only wrap while `cat_attr1` is
# still a bare column name: this keeps the cell idempotent, whereas an
# unconditional `[cat_attr1]` would nest the list deeper on every re-run.
if isinstance(cat_attr1, str):
    cat_attr1 = [cat_attr1]

In [48]:
# Combined transformer for the extended feature set: numeric columns through
# `num_pipe1`, the categorical column through a fresh one-hot encoder.
pipe1 = ColumnTransformer(transformers=[
    ('num', num_pipe1, num_attr1),
    ('cat', OneHotEncoder(), cat_attr1),
])

In [49]:
# Fit the combined transformer on the extended features and transform them in one call.
final_feature1 = pipe1.fit_transform(X=features)

In [50]:
# Same seed and 25% hold-out (the default, made explicit) as the manual run on the extended features.
X_train, X_test, Y_train, Y_test = train_test_split(
    final_feature1,
    target,
    test_size=0.25,
    random_state=100,
)

In [51]:
# Refit on the ColumnTransformer output for the extended features; the cell displays the held-out MAE.
model = LinearRegression().fit(X_train, Y_train)
pred = model.predict(X_test)
mean_absolute_error(y_true=Y_test, y_pred=pred)

48853.837528888966

In [52]:
# R^2 for the ColumnTransformer run on the extended features.
r2_score(y_true=Y_test, y_pred=pred)

0.6589711609956458

### With median imputation and scaling

In [53]:
# Numeric preprocessing for the scaled run: median imputation, then standardisation.
# NOTE(review): compared with the previous `num_pipe1` this changes two things
# at once (imputation strategy and scaling), so the score difference cannot be
# attributed to scaling alone. It also rebinds the name `num_pipe1`.
num_pipe1 = Pipeline(steps=[
    ('simple', SimpleImputer(strategy='median')),
    ('scale', StandardScaler()),
])

In [54]:
# Rebuild the combined transformer around the scaled numeric pipeline.
pipe1 = ColumnTransformer(transformers=[
    ('num', num_pipe1, num_attr1),
    ('cat', OneHotEncoder(), cat_attr1),
])

In [55]:
# Fit the scaled transformer on all rows and produce the feature matrix.
final_feature1 = pipe1.fit_transform(X=features)

In [56]:
# Same seed and 25% hold-out (the default, made explicit) as every earlier run.
# NOTE(review): `pipe1` was fitted on all rows before this split, so the
# imputer/scaler statistics include the test rows — consider splitting the raw
# features first and fitting the transformer on the training rows only.
X_train, X_test, Y_train, Y_test = train_test_split(
    final_feature1,
    target,
    test_size=0.25,
    random_state=100,
)

In [57]:
# Linear regression on the scaled features; the cell displays the held-out MAE.
model = LinearRegression().fit(X_train, Y_train)
pred = model.predict(X_test)
mean_absolute_error(y_true=Y_test, y_pred=pred)

48850.3180060167

In [58]:
# R^2 for the scaled run.
r2_score(y_true=Y_test, y_pred=pred)

0.6590061766540446