In [1]:
import numpy as np
import scipy as sp
import pandas as pd
import statsmodels.formula.api as smf
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
# Load the training split from the working directory into a DataFrame.
train = pd.read_csv(filepath_or_buffer="train.csv")

In [3]:
# Basic diagnostics: show the dtype pandas inferred for each column of the
# training frame (numeric columns vs. object/string columns).
train.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
Street            object
Alley             object
LotShape          object
LandContour       object
Utilities         object
LotConfig         object
LandSlope         object
Neighborhood      object
Condition1        object
Condition2        object
BldgType          object
HouseStyle        object
OverallQual        int64
OverallCond        int64
YearBuilt          int64
YearRemodAdd       int64
RoofStyle         object
RoofMatl          object
Exterior1st       object
Exterior2nd       object
MasVnrType        object
MasVnrArea       float64
ExterQual         object
ExterCond         object
Foundation        object
                  ...   
BedroomAbvGr       int64
KitchenAbvGr       int64
KitchenQual       object
TotRmsAbvGrd       int64
Functional        object
Fireplaces         int64
FireplaceQu       object
GarageType        object
GarageYrBlt      float64


In [18]:
# Baseline model: ordinary least squares via the statsmodels formula API,
# regressing SalePrice on LotArea and OverallCond, then showing the fit report.
reg_result = smf.ols(
    "SalePrice ~ LotArea + OverallCond",
    train,
).fit()
reg_result.summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.075
Model:,OLS,Adj. R-squared:,0.074
Method:,Least Squares,F-statistic:,59.45
Date:,"Sun, 12 Nov 2017",Prob (F-statistic):,1.52e-25
Time:,16:40:57,Log-Likelihood:,-18487.0
No. Observations:,1460,AIC:,36980.0
Df Residuals:,1457,BIC:,37000.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.893e+05,1.05e+04,18.111,0.000,1.69e+05,2.1e+05
LotArea,2.0965,0.200,10.457,0.000,1.703,2.490
OverallCond,-5452.1254,1798.371,-3.032,0.002,-8979.799,-1924.452

0,1,2,3
Omnibus:,576.273,Durbin-Watson:,1.994
Prob(Omnibus):,0.0,Jarque-Bera (JB):,3329.423
Skew:,1.745,Prob(JB):,0.0
Kurtosis:,9.523,Cond. No.,76800.0


In [21]:
# Keep only the integer-typed columns of the training frame.
# np.int was merely an alias for the builtin int; it was deprecated in
# NumPy 1.20 and removed in 1.24, so the builtin is used directly.
train_ints = train.select_dtypes(include=[int])

In [22]:
# Sanity check: list the columns (and dtypes) that survived the integer filter.
train_ints.dtypes

Id               int64
MSSubClass       int64
LotArea          int64
OverallQual      int64
OverallCond      int64
YearBuilt        int64
YearRemodAdd     int64
BsmtFinSF1       int64
BsmtFinSF2       int64
BsmtUnfSF        int64
TotalBsmtSF      int64
1stFlrSF         int64
2ndFlrSF         int64
LowQualFinSF     int64
GrLivArea        int64
BsmtFullBath     int64
BsmtHalfBath     int64
FullBath         int64
HalfBath         int64
BedroomAbvGr     int64
KitchenAbvGr     int64
TotRmsAbvGrd     int64
Fireplaces       int64
GarageCars       int64
GarageArea       int64
WoodDeckSF       int64
OpenPorchSF      int64
EnclosedPorch    int64
3SsnPorch        int64
ScreenPorch      int64
PoolArea         int64
MiscVal          int64
MoSold           int64
YrSold           int64
SalePrice        int64
dtype: object

In [26]:
# Correlation of every integer column with the target: take the SalePrice
# column of the pairwise correlation matrix.
train_ints.corr()["SalePrice"]

Id              -0.021917
MSSubClass      -0.084284
LotArea          0.263843
OverallQual      0.790982
OverallCond     -0.077856
YearBuilt        0.522897
YearRemodAdd     0.507101
BsmtFinSF1       0.386420
BsmtFinSF2      -0.011378
BsmtUnfSF        0.214479
TotalBsmtSF      0.613581
1stFlrSF         0.605852
2ndFlrSF         0.319334
LowQualFinSF    -0.025606
GrLivArea        0.708624
BsmtFullBath     0.227122
BsmtHalfBath    -0.016844
FullBath         0.560664
HalfBath         0.284108
BedroomAbvGr     0.168213
KitchenAbvGr    -0.135907
TotRmsAbvGrd     0.533723
Fireplaces       0.466929
GarageCars       0.640409
GarageArea       0.623431
WoodDeckSF       0.324413
OpenPorchSF      0.315856
EnclosedPorch   -0.128578
3SsnPorch        0.044584
ScreenPorch      0.111447
PoolArea         0.092404
MiscVal         -0.021190
MoSold           0.046432
YrSold          -0.028923
SalePrice        1.000000
Name: SalePrice, dtype: float64

In [29]:
# Formula for the multi-feature regression of SalePrice on nine predictors.
# The adjacent string literals are concatenated by Python exactly as written
# (no whitespace is inserted between the pieces).
reg_formula = ("SalePrice ~ YearRemodAdd + YearBuilt"
               "+ TotRmsAbvGrd +FullBath "
               "+TotalBsmtSF +GarageArea +GarageCars +GrLivArea"
               "+OverallQual")
# Function-call form of print: valid on Python 3 and prints the same text on
# Python 2 for a single argument.
print(reg_formula)

SalePrice ~ YearRemodAdd + YearBuilt+ TotRmsAbvGrd +FullBath +TotalBsmtSF +GarageArea +GarageCars +GrLivArea+OverallQual


In [30]:
# Fit the multi-feature OLS model. smf.ols() only builds the model
# specification; .fit() estimates it and returns the results object that
# provides the .summary() shown here and the .predict() calls used further
# down in the notebook.
reg_result = smf.ols(formula=reg_formula, data=train).fit()
reg_result.summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.772
Model:,OLS,Adj. R-squared:,0.771
Method:,Least Squares,F-statistic:,546.8
Date:,"Sun, 12 Nov 2017",Prob (F-statistic):,0.0
Time:,17:01:29,Log-Likelihood:,-17463.0
No. Observations:,1460,AIC:,34950.0
Df Residuals:,1450,BIC:,35000.0
Df Model:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-1.159e+06,1.29e+05,-8.978,0.000,-1.41e+06,-9.06e+05
YearRemodAdd,294.6839,63.791,4.620,0.000,169.552,419.816
YearBuilt,258.0525,50.347,5.125,0.000,159.292,356.813
TotRmsAbvGrd,41.7519,1121.858,0.037,0.970,-2158.886,2242.390
FullBath,-6408.3114,2685.771,-2.386,0.017,-1.17e+04,-1139.899
TotalBsmtSF,28.9907,2.902,9.992,0.000,23.299,34.682
GarageArea,16.9253,10.314,1.641,0.101,-3.307,37.158
GarageCars,1.053e+04,3051.640,3.451,0.001,4543.945,1.65e+04
GrLivArea,53.7230,4.158,12.920,0.000,45.567,61.879

0,1,2,3
Omnibus:,485.052,Durbin-Watson:,1.98
Prob(Omnibus):,0.0,Jarque-Bera (JB):,60868.329
Skew:,-0.439,Prob(JB):,0.0
Kurtosis:,34.62,Cond. No.,442000.0


In [31]:
# Load the held-out test split from the working directory into a DataFrame.
test = pd.read_csv(filepath_or_buffer="test.csv")

In [39]:
# In-sample check: predict SalePrice for the training rows and report the
# mean squared error against the observed prices.
predict_train = reg_result.predict(exog=train)
mean_squared_error(y_true=train["SalePrice"], y_pred=predict_train)

1435413991.433038

In [35]:
# Inspect what kind of object predict() returned for the training rows.
type(predict_train)

pandas.core.series.Series

In [40]:
# scikit-learn releases older than 0.24 ship no MAPE metric, so one is defined
# here. (Newer releases provide sklearn.metrics.mean_absolute_percentage_error,
# which returns a fraction rather than a percentage.)
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Parameters
    ----------
    y_true : array-like
        Observed values. Entries equal to zero make the corresponding
        percentage error infinite (or NaN), which propagates to the result.
    y_pred : array-like
        Predicted values, either the same shape as ``y_true`` or a scalar.

    Returns
    -------
    float
        ``mean(|y_true - y_pred| / |y_true|) * 100``.

    Raises
    ------
    ValueError
        If both inputs are arrays and their shapes differ. Without this check
        NumPy would silently broadcast e.g. (n,) against (n, 1) into an
        (n, n) matrix and return a meaningless number.
    """
    # Convert to float arrays: this drops any pandas index (so no label
    # alignment happens) and guarantees true division even for integer
    # inputs on Python 2.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    if actual.ndim > 0 and predicted.ndim > 0 and actual.shape != predicted.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got %s and %s"
            % (actual.shape, predicted.shape)
        )
    return np.mean(np.abs((actual - predicted) / actual)) * 100
# In-sample MAPE (in percent) of the fitted model on the training rows.
mean_absolute_percentage_error(train["SalePrice"], predict_train)

14.401179306636177

In [42]:
# Score the held-out test rows with the fitted model.
# NOTE(review): confirm the predictor columns in test contain no missing
# values; rows with gaps may not receive a usable prediction.
predict_test = reg_result.predict(exog=test)

In [43]:
# Display the test-set predictions (pandas abbreviates the middle of a long Series).
predict_test

0       111559.750008
1       158322.971237
2       172431.471696
3       190360.697256
4       218893.699489
5       185398.331919
6       175189.778111
7       177618.230731
8       208819.872433
9       103567.279505
10      210299.944350
11      121755.008975
12      110369.519403
13      167088.908092
14      146024.034689
15      328315.127118
16      267161.825300
17      294291.985057
18      284771.216467
19      384362.253958
20      302712.591851
21      233745.633594
22      190006.939720
23      195105.990057
24      183656.694012
25      216924.461798
26      314858.325368
27      269330.627899
28      207058.628301
29      203576.387620
            ...      
1429     51315.093281
1430    158686.325231
1431      8295.181893
1432     97289.360558
1433     18236.053534
1434    261436.091393
1435    254908.484884
1436    203003.699234
1437    206520.815739
1438    241133.607455
1439    163276.935900
1440    198368.885768
1441    193775.102444
1442    290069.176604
1443    30

In [45]:
# Assemble the submission table: the test Ids plus the predicted SalePrice,
# matched to the Ids by row index.
submit_csv = test[["Id"]].assign(SalePrice=predict_test)

In [46]:
# Write the submission file. index=False keeps the DataFrame's row index out
# of the CSV so the file contains exactly the Id and SalePrice columns.
submit_csv.to_csv("submission.csv", index=False)