**Data Science Regression Project: Real Estate dataset**

In [1]:
import numpy as np 
import pandas as pd 


Data Load: Load Germany housing data into a dataframe

In [2]:
df1 = pd.read_csv("germany_housing_data_14.07.2020.csv")
df1.head()

Unnamed: 0.1,Unnamed: 0,Price,Type,Living_space,Lot,Usable_area,Free_of_Relation,Rooms,Bedrooms,Bathrooms,...,Energy_source,Energy_certificate,Energy_certificate_type,Energy_consumption,Energy_efficiency_class,State,City,Place,Garages,Garagetype
0,0,498000.0,Multiple dwelling,106.0,229.0,,01.10.2020,5.5,3.0,1.0,...,Gas,available,demand certificate,,D,Baden-Württemberg,Bodenseekreis,Bermatingen,2.0,Parking lot
1,1,495000.0,Mid-terrace house,140.93,517.0,20.0,01.01.2021,6.0,3.0,2.0,...,,not required by law,,,,Baden-Württemberg,Konstanz (Kreis),Engen,7.0,Parking lot
2,2,749000.0,Farmhouse,162.89,82.0,37.62,01.07.2020,5.0,3.0,2.0,...,"Fernwärme, Bioenergie",available,demand certificate,,B,Baden-Württemberg,Esslingen (Kreis),Ostfildern,1.0,Garage
3,3,259000.0,Farmhouse,140.0,814.0,,nach Vereinbarung,4.0,,2.0,...,Strom,available,demand certificate,,G,Baden-Württemberg,Waldshut (Kreis),Bonndorf im Schwarzwald,1.0,Garage
4,4,469000.0,Multiple dwelling,115.0,244.0,,sofort,4.5,2.0,1.0,...,Öl,available,demand certificate,,F,Baden-Württemberg,Esslingen (Kreis),Leinfelden-Echterdingen,1.0,Garage


In [3]:
df1.shape

(10552, 26)

In [4]:
df1.columns

Index(['Unnamed: 0', 'Price', 'Type', 'Living_space', 'Lot', 'Usable_area',
       'Free_of_Relation', 'Rooms', 'Bedrooms', 'Bathrooms', 'Floors',
       'Year_built', 'Furnishing_quality', 'Year_renovated', 'Condition',
       'Heating', 'Energy_source', 'Energy_certificate',
       'Energy_certificate_type', 'Energy_consumption',
       'Energy_efficiency_class', 'State', 'City', 'Place', 'Garages',
       'Garagetype'],
      dtype='object')

Drop features that are not required to build our model

In [5]:
df2= df1.drop('Unnamed: 0',axis='columns')
df2.head()

Unnamed: 0,Price,Type,Living_space,Lot,Usable_area,Free_of_Relation,Rooms,Bedrooms,Bathrooms,Floors,...,Energy_source,Energy_certificate,Energy_certificate_type,Energy_consumption,Energy_efficiency_class,State,City,Place,Garages,Garagetype
0,498000.0,Multiple dwelling,106.0,229.0,,01.10.2020,5.5,3.0,1.0,2.0,...,Gas,available,demand certificate,,D,Baden-Württemberg,Bodenseekreis,Bermatingen,2.0,Parking lot
1,495000.0,Mid-terrace house,140.93,517.0,20.0,01.01.2021,6.0,3.0,2.0,,...,,not required by law,,,,Baden-Württemberg,Konstanz (Kreis),Engen,7.0,Parking lot
2,749000.0,Farmhouse,162.89,82.0,37.62,01.07.2020,5.0,3.0,2.0,4.0,...,"Fernwärme, Bioenergie",available,demand certificate,,B,Baden-Württemberg,Esslingen (Kreis),Ostfildern,1.0,Garage
3,259000.0,Farmhouse,140.0,814.0,,nach Vereinbarung,4.0,,2.0,2.0,...,Strom,available,demand certificate,,G,Baden-Württemberg,Waldshut (Kreis),Bonndorf im Schwarzwald,1.0,Garage
4,469000.0,Multiple dwelling,115.0,244.0,,sofort,4.5,2.0,1.0,,...,Öl,available,demand certificate,,F,Baden-Württemberg,Esslingen (Kreis),Leinfelden-Echterdingen,1.0,Garage


Data Cleaning: Handle NA values

In [6]:
df2.isna().sum()

Price                         0
Type                        402
Living_space                  0
Lot                           0
Usable_area                4984
Free_of_Relation           3569
Rooms                         0
Bedrooms                   3674
Bathrooms                  1801
Floors                     2664
Year_built                  694
Furnishing_quality         2726
Year_renovated             5203
Condition                   323
Heating                     584
Energy_source              1227
Energy_certificate          755
Energy_certificate_type    3526
Energy_consumption         8119
Energy_efficiency_class    4819
State                         1
City                          1
Place                       290
Garages                    1960
Garagetype                 1960
dtype: int64

In [7]:
col_to_fill_zero = ['Usable_area', 'Free_of_Relation', 'Bedrooms', 'Bathrooms', 'Floors', 'Furnishing_quality', 'Year_renovated', 'Energy_source', 'Energy_certificate_type', 'Energy_consumption', 'Energy_efficiency_class']

df2[col_to_fill_zero] = df2[col_to_fill_zero].fillna(0)

In [8]:
df2.isna().sum()

Price                         0
Type                        402
Living_space                  0
Lot                           0
Usable_area                   0
Free_of_Relation              0
Rooms                         0
Bedrooms                      0
Bathrooms                     0
Floors                        0
Year_built                  694
Furnishing_quality            0
Year_renovated                0
Condition                   323
Heating                     584
Energy_source                 0
Energy_certificate          755
Energy_certificate_type       0
Energy_consumption            0
Energy_efficiency_class       0
State                         1
City                          1
Place                       290
Garages                    1960
Garagetype                 1960
dtype: int64

In [9]:
df2['Year_built'] = df2['Year_built'].fillna(df2.Year_built.mean())
df2['Garages'] = df2['Garages'].fillna(df2.Garages.mean())

In [10]:
df2.isna().sum()

Price                         0
Type                        402
Living_space                  0
Lot                           0
Usable_area                   0
Free_of_Relation              0
Rooms                         0
Bedrooms                      0
Bathrooms                     0
Floors                        0
Year_built                    0
Furnishing_quality            0
Year_renovated                0
Condition                   323
Heating                     584
Energy_source                 0
Energy_certificate          755
Energy_certificate_type       0
Energy_consumption            0
Energy_efficiency_class       0
State                         1
City                          1
Place                       290
Garages                       0
Garagetype                 1960
dtype: int64

In [11]:
df2.dropna(inplace=True)

In [12]:
df2.isna().sum()

Price                      0
Type                       0
Living_space               0
Lot                        0
Usable_area                0
Free_of_Relation           0
Rooms                      0
Bedrooms                   0
Bathrooms                  0
Floors                     0
Year_built                 0
Furnishing_quality         0
Year_renovated             0
Condition                  0
Heating                    0
Energy_source              0
Energy_certificate         0
Energy_certificate_type    0
Energy_consumption         0
Energy_efficiency_class    0
State                      0
City                       0
Place                      0
Garages                    0
Garagetype                 0
dtype: int64

Using One Hot Encoding

In [13]:
df2 = pd.get_dummies(df2, drop_first=True)

In [14]:
X = df2.drop('Price', axis = 'columns')
y = df2.Price

Building model

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [16]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train, y_train)
y_predict = model.predict(X_test)

In [17]:
model.score(X_train, y_train)

0.9448529177064652

Saving as a Pickle file

In [18]:
import pickle
with open('Germany_housing.pickle','wb') as f:
    pickle.dump(model,f)