In [110]:
import pandas as pd
import statsmodels.api as sm
import numpy as np
import matplotlib.pyplot as plt

In [121]:
house_data=pd.read_csv('https://raw.githubusercontent.com/zekelabs/data-science-complete-tutorial/master/Data/house_rental_data.csv.txt', index_col='Unnamed: 0')
house_data.head(10) #data imported

Unnamed: 0,Sqft,Floor,TotalFloor,Bedroom,Living.Room,Bathroom,Price
1,1177.698,2,7,2,2,2,62000
2,2134.8,5,7,4,2,2,78000
3,1138.56,5,7,2,2,1,58000
4,1458.78,2,7,3,2,2,45000
5,967.776,11,14,3,2,2,45000
6,1127.886,11,12,4,2,2,148000
7,1352.04,5,7,3,2,1,58000
8,757.854,5,14,1,0,1,48000
9,1152.792,10,12,3,2,2,45000
10,1423.2,4,5,4,2,2,65000


In [112]:
columns=house_data.columns.tolist() #list of columns
no_of_columns=house_data.shape[1] #no of columns
no_of_rows=house_data.shape[0] #no of rows

print('No. of rows: ',no_of_rows)
print('No. of Columns: ',no_of_columns)
print('Column Names: ',columns)

No. of rows:  645
No. of Columns:  7
Column Names:  ['Sqft', 'Floor', 'TotalFloor', 'Bedroom', 'Living.Room', 'Bathroom', 'Price']


In [122]:
house_data=house_data.rename(columns={'Living.Room':'Living Room', 'TotalFloor':'No. of Floors'}) #renaming the columns

In [114]:
house_data.describe().round(3) #summary / statistics

Unnamed: 0,Sqft,Floor,No. of Floors,Bedroom,Living Room,Bathroom,Price
count,645.0,645.0,645.0,645.0,645.0,645.0,645.0
mean,1527.656,5.94,10.856,2.837,1.814,1.811,61986.823
std,767.387,3.885,4.996,1.011,0.462,0.684,35635.091
min,359.358,1.0,1.0,1.0,0.0,0.0,6100.0
25%,925.08,3.0,7.0,2.0,2.0,1.0,39000.0
50%,1423.2,5.0,12.0,3.0,2.0,2.0,50000.0
75%,1892.856,8.0,14.0,4.0,2.0,2.0,75000.0
max,5856.468,22.0,38.0,7.0,4.0,5.0,250000.0


In [115]:
house_data.groupby('No. of Floors').sum().sort_values(by='No. of Floors',ascending=True) #grouping by no of floors
house_data.set_index(['Price','No. of Floors'], append=True) #indexing

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Sqft,Floor,Bedroom,Living Room,Bathroom
Unnamed: 0_level_1,Price,No. of Floors,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,62000,7,1177.698,2,2,2,2
2,78000,7,2134.800,5,4,2,2
3,58000,7,1138.560,5,2,2,1
4,45000,7,1458.780,2,3,2,2
5,45000,14,967.776,11,3,2,2
6,148000,12,1127.886,11,4,2,2
7,58000,7,1352.040,5,3,2,1
8,48000,14,757.854,5,1,0,1
9,45000,12,1152.792,10,3,2,2
10,65000,5,1423.200,4,4,2,2


In [123]:
variables=list(house_data.columns) #storing columns as a list
target_variable='Price' #target variable for regression
independent_variable=[var for var in variables if var not in target_variable] #all values other than target variable are independent variables

In [124]:
model_Simple=sm.OLS(house_data[target_variable], house_data[independent_variable]).fit() #creating a model without constant (intercept)

model=sm.OLS(house_data[target_variable], sm.add_constant(house_data[independent_variable])).fit() #creating a model with constant

In [125]:
print(model_Simple.summary()) #regression results
print('################################################################################')
print(model.summary()) #regression results

                            OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.926
Model:                            OLS   Adj. R-squared:                  0.925
Method:                 Least Squares   F-statistic:                     1330.
Date:                Sat, 06 Jul 2019   Prob (F-statistic):               0.00
Time:                        13:20:27   Log-Likelihood:                -7285.4
No. Observations:                 645   AIC:                         1.458e+04
Df Residuals:                     639   BIC:                         1.461e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
Sqft             37.6229      1.574     23.899

In [126]:
drop_var=['No. of Floors','Living Room','Bedroom','Bathroom'] #dropping variables with p>0.05
independent_variable_new=[var for var in variables if var not in drop_var] #creating a new independent variable having value other than the dropped values

In [127]:
model_new=sm.OLS(house_data[target_variable], sm.add_constant(house_data[independent_variable_new])).fit() #creating a predictive model with new variable
print(model_new.summary()) #regression results

                            OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 6.734e+31
Date:                Sat, 06 Jul 2019   Prob (F-statistic):               0.00
Time:                        13:20:51   Log-Likelihood:                 14230.
No. Observations:                 645   AIC:                        -2.845e+04
Df Residuals:                     641   BIC:                        -2.843e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const       3.729e-11   6.42e-12      5.810      0.0

In [138]:
house_data['Predicted Price']=model_new.predict(sm.add_constant(house_data[independent_variable_new])) #prediction price
house_data.head(10)

Unnamed: 0,Sqft,Floor,No. of Floors,Bedroom,Living Room,Bathroom,Price,Predicted Price
1,1177.698,2,7,2,2,2,62000,62000.0
2,2134.8,5,7,4,2,2,78000,78000.0
3,1138.56,5,7,2,2,1,58000,58000.0
4,1458.78,2,7,3,2,2,45000,45000.0
5,967.776,11,14,3,2,2,45000,45000.0
6,1127.886,11,12,4,2,2,148000,148000.0
7,1352.04,5,7,3,2,1,58000,58000.0
8,757.854,5,14,1,0,1,48000,48000.0
9,1152.792,10,12,3,2,2,45000,45000.0
10,1423.2,4,5,4,2,2,65000,65000.0
