In [1]:
#Import the packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import norm, skew
%matplotlib inline

In [2]:
#Import the dataset
# Read the labelled training rows and the unlabelled test rows.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# Stack both frames under a fresh 0..n-1 index for whole-dataset inspection.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent.
full = pd.concat([train, test], ignore_index=True)

In [3]:
# Report (rows, columns) of the train, test and combined frames on one line.
print(*(frame.shape for frame in (train, test, full)))

(2051, 82) (879, 80) (2930, 82)


Preprocessing

In [4]:
# Count missing values per column of the combined frame and show only the
# columns that have at least one, most incomplete first.
full_null_counts = full.isnull().sum()
full_null_counts[full_null_counts > 0].sort_values(ascending=False)

Pool QC           2917
Misc Feature      2824
Alley             2732
Fence             2358
Fireplace Qu      1422
Sale Condition     879
SalePrice          879
Lot Frontage       490
Garage Qual        159
Garage Cond        159
Garage Finish      159
Garage Yr Blt      159
Garage Type        157
Bsmt Exposure       83
BsmtFin Type 2      81
Bsmt Qual           80
Bsmt Cond           80
BsmtFin Type 1      80
Mas Vnr Area        23
Mas Vnr Type        23
Bsmt Full Bath       2
Bsmt Half Bath       2
Total Bsmt SF        1
Bsmt Unf SF          1
BsmtFin SF 1         1
BsmtFin SF 2         1
Electrical           1
Garage Cars          1
Garage Area          1
dtype: int64

In [5]:
# Take the target as a copy instead of popping it out of `train`, so this
# cell can be re-run without a KeyError and `train` stays as loaded.
y_observed = train['SalePrice'].copy()
# Stack the predictors of both splits under 'train' / 'test' index keys so
# they receive identical preprocessing and can be separated again with .loc.
features = pd.concat([train.drop('SalePrice', axis=1), test],
                     keys=['train', 'test'])
#drop the features with more than 40 percent of missing values
# ('Pool QC', 'Misc Feature', 'Fireplace Qu', 'Fence', 'Alley'); 'PID' is
# dropped with them -- presumably an identifier column, confirm against the
# data dictionary.
features = features.drop(['Pool QC', 'Misc Feature', 'Fireplace Qu', 'Fence', 'Alley', 'PID'],
                         axis=1)
features.shape

(2930, 75)

In [6]:
#features with missing values
# Per-column null counts of the predictor frame, largest first (top 25).
feature_null_counts = features.isnull().sum()
feature_null_counts.sort_values(ascending=False).head(25)

Sale Condition    879
Lot Frontage      490
Garage Qual       159
Garage Finish     159
Garage Cond       159
Garage Yr Blt     159
Garage Type       157
Bsmt Exposure      83
BsmtFin Type 2     81
Bsmt Qual          80
BsmtFin Type 1     80
Bsmt Cond          80
Mas Vnr Type       23
Mas Vnr Area       23
Bsmt Half Bath      2
Bsmt Full Bath      2
Bsmt Unf SF         1
BsmtFin SF 2        1
Garage Cars         1
Garage Area         1
Total Bsmt SF       1
Electrical          1
BsmtFin SF 1        1
Exterior 2nd        0
Functional          0
dtype: int64

In [7]:
#fill the nas
# Imputation is grouped by strategy; every statistic is computed on the
# stacked train+test frame, exactly as each column was handled before.

# Categorical columns filled with their most frequent level.
# NOTE(review): 'Sale Condition' is missing for 879 rows, which equals the
# test row count -- presumably the column is absent from test.csv, so every
# test row receives the same level. Confirm this is intended.
for col in ('Sale Condition', 'Electrical'):
    features[col] = features[col].fillna(features[col].mode()[0])

# Continuous measurements filled with the column mean.
for col in ('Lot Frontage', 'Bsmt Unf SF'):
    features[col] = features[col].fillna(features[col].mean())

# Bathroom counts filled with the column median.
for col in ('Bsmt Full Bath', 'Bsmt Half Bath'):
    features[col] = features[col].fillna(features[col].median())

# Size / count columns where a missing value is treated as zero.
for col in ('Garage Cars', 'Garage Area', 'BsmtFin SF 1', 'BsmtFin SF 2',
            'Mas Vnr Area'):
    features[col] = features[col].fillna(0.0)

# Categorical garage / basement / veneer descriptors get an explicit
# placeholder level instead of NaN.
for col in ('Garage Type', 'Garage Finish', 'Garage Qual', 'Garage Cond'):
    features[col] = features[col].fillna('NoGRG')
for col in ('Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin Type 2', 'Bsmt Cond', 'Bsmt Qual'):
    features[col] = features[col].fillna('NoBasement')
features['Mas Vnr Type'] = features['Mas Vnr Type'].fillna('None')

# These two columns are removed rather than imputed. Reassigning with
# errors='ignore' (instead of an in-place drop) keeps the cell re-runnable:
# a second run no longer raises KeyError for the already-dropped columns.
features = features.drop(['Total Bsmt SF', 'Garage Yr Blt'], axis=1, errors='ignore')



In [8]:
#check nas
# After imputation the five largest per-column null counts should all be 0.
remaining_null_counts = features.isnull().sum()
remaining_null_counts.sort_values(ascending=False).head(5)

Yr Sold           0
Half Bath         0
Enclosed Porch    0
Exter Cond        0
Exter Qual        0
dtype: int64

In [9]:
# Confirm the frame's dimensions after imputation (two columns were dropped
# in the fill step).
features.shape

(2930, 73)

In [10]:
#take log of observed SalePrice
# The models are fit on log prices; predictions are mapped back with np.exp
# when the test-set predictions are computed.
# NOTE: y_observed is reassigned from itself, so re-running this cell applies
# the log a second time -- re-run the cell that defines y_observed first.
y_observed = np.log(y_observed)

In [11]:
#Get dummies for object features
# One-hot encode every text/categorical column in a single call instead of
# popping and re-concatenating one column at a time. Non-encoded columns keep
# their position and the indicator columns follow in original column order,
# each named '<column>_<level>'.
# dtype is pinned to uint8 so the indicators stay numeric: pandas >= 2.0
# returns bool indicators by default, which a later numeric-only column
# selection would silently drop.
features = pd.get_dummies(features, dtype=np.uint8)

In [12]:
#Drop ID
# Build the model matrices from the 'train' and 'test' slices of the stacked
# frame, leaving out 'Id' (it is only used as the submission key).
# Boolean columns are selected together with numeric ones because pandas >= 2.0
# emits bool one-hot indicators by default; selecting np.number alone would
# silently discard every encoded categorical feature. The float cast keeps
# .values a homogeneous float array rather than an object array.
model_dtypes = [np.number, 'bool']
x = (features.loc['train'].drop('Id', axis=1)
     .select_dtypes(include=model_dtypes).astype(float).values)
x_testset = (features.loc['test'].drop('Id', axis=1)
             .select_dtypes(include=model_dtypes).astype(float).values)

In [13]:
from sklearn.preprocessing import StandardScaler

In [14]:
#train test split
# Hold out 33% of the labelled rows for validation; the fixed seed makes the
# split repeatable.
split_arrays = train_test_split(x, y_observed, test_size=0.33, random_state=40)
x_train, x_test, y_train, y_test = split_arrays

RandomForestRegressor

In [15]:
from sklearn.ensemble import RandomForestRegressor

In [16]:
# Fit a 100-tree random forest on the training split and predict both splits.
# random_state is fixed (same seed as the train/test split) so bootstrap
# sampling and feature selection -- and therefore the predictions -- are
# identical on every Restart & Run All.
rf = RandomForestRegressor(n_estimators=100, random_state=40)
rf.fit(x_train, y_train)
prediction_rf_train = rf.predict(x_train)
prediction_rf_test = rf.predict(x_test)

GradientBoostingRegressor

In [17]:
from sklearn.ensemble import GradientBoostingRegressor

In [18]:
# Gradient boosting with many shallow trees, a small learning rate and the
# Huber loss. Because max_features='sqrt' samples a random feature subset at
# each split, random_state is fixed (same seed as the train/test split) so
# the fitted model and its predictions are reproducible.
gbr = GradientBoostingRegressor(n_estimators=3000,
            learning_rate=0.05, max_depth=3, max_features='sqrt',
            min_samples_leaf=15, min_samples_split=10, loss='huber',
            random_state=40)
gbr.fit(x_train, y_train)
prediction_gbr_train = gbr.predict(x_train)
prediction_gbr_test = gbr.predict(x_test)

ElasticNet

In [19]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression, RidgeCV, LassoCV, ElasticNetCV

In [20]:
# Candidate L1/L2 mixing ratios: 25 evenly spaced values from 0.01 to 1.0.
l1_ratios = np.linspace(0.01, 1.0, 25)
# Pick alpha and l1_ratio by 10-fold cross-validation over 100 alphas per
# ratio, then fit on the training split (fit returns the estimator itself).
# NOTE(review): the features are passed in unscaled even though
# StandardScaler is imported earlier in the notebook -- confirm whether
# scaling was intended for this penalised linear model.
optimal_enet = ElasticNetCV(
    l1_ratio=l1_ratios,
    n_alphas=100,
    cv=10,
    verbose=1,
).fit(x_train, y_train)
prediction_en_train, prediction_en_test = (
    optimal_enet.predict(x_train),
    optimal_enet.predict(x_test),
)

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................

Predictions table

In [21]:
# Keep the test-set Ids to key the prediction table and the submission.
test_id = test['Id']
# Each model was fit on log prices, so exponentiate its test-set output to
# get predictions back on the original price scale.
predictions_rf, predictions_gbr, predictions_enet = (
    np.exp(model.predict(x_testset)) for model in (rf, gbr, optimal_enet)
)

In [22]:
# Collect the three models' price predictions side by side, keyed by Id.
prediction_columns = {
    'Id': test_id,
    'RF': predictions_rf,
    'GBR': predictions_gbr,
    'ENET': predictions_enet,
}
prediction_df = pd.DataFrame(prediction_columns)

In [23]:
# Preview the first rows of the per-model predictions.
prediction_df.head()

Unnamed: 0,ENET,GBR,Id,RF
0,143468.052525,129860.072461,2658,129952.425089
1,224096.234284,151652.380292,2718,141906.641231
2,179570.779279,209482.079787,2414,190180.068569
3,118855.244189,119936.933994,1989,123373.003416
4,170328.192331,168872.542058,625,170457.176574


In [24]:
# Check the dtypes and non-null counts of the prediction table.
prediction_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 879 entries, 0 to 878
Data columns (total 4 columns):
ENET    879 non-null float64
GBR     879 non-null float64
Id      879 non-null int64
RF      879 non-null float64
dtypes: float64(3), int64(1)
memory usage: 27.5 KB


In [25]:
# Blend the two tree ensembles by averaging their price predictions; the
# ENET column is not part of the blend.
prediction_df['mean'] = prediction_df['RF'].add(prediction_df['GBR']).div(2)

In [26]:
# Preview again, now including the blended 'mean' column.
prediction_df.head()

Unnamed: 0,ENET,GBR,Id,RF,mean
0,143468.052525,129860.072461,2658,129952.425089,129906.248775
1,224096.234284,151652.380292,2718,141906.641231,146779.510762
2,179570.779279,209482.079787,2414,190180.068569,199831.074178
3,118855.244189,119936.933994,1989,123373.003416,121654.968705
4,170328.192331,168872.542058,625,170457.176574,169664.859316


In [27]:
# Submission frame: the test Ids paired with the blended prediction, exposed
# under the 'SalePrice' column name.
submit = prediction_df[['Id', 'mean']].rename(columns={'mean': 'SalePrice'})

In [28]:
# Write the submission to disk without the DataFrame index column.
submission_path = 'regg3.csv'
submit.to_csv(submission_path, index=False)