In [48]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import train_test_split
import houseprice_functions as hpf
from sklearn.linear_model import ElasticNetCV, ElasticNet
from sklearn import metrics

# Outlier imputation

In [64]:
cleaned_hp = pd.read_csv('data/cleaned_houseprice.csv')
HousePrices = pd.read_csv('data/train.csv')

train_outliers = hpf.outlier_selecter(cleaned_hp,cleaned_hp,num_sd = 4,min_unique = 20, drop_zeros = True)

for col, idx in train_outliers.items():
    print(col,':',idx)
    hpf.outlier_imputation(cleaned_hp, cleaned_hp,idx, col = col, method = "random", decimals = 2)
    
y = HousePrices[["Id",'SalePrice']]
x_y = cleaned_hp.merge(y,how = "inner",on = "Id")
y = np.log(x_y['SalePrice'])
X = x_y.drop(["Id","SalePrice","Ext_ImStucc", "Ext_Stone","House_2.5Fin"],axis = 1)

GarageArea : [581, 1190, 1298]
TotalBsmtSF : [332, 440, 496, 523, 1298]
GrLivArea : [523, 691, 1169, 1182, 1298]
LotArea : [249, 313, 335, 384, 451, 457, 706, 769, 1298, 1396]
EnclosedPorch : [3, 197, 328, 358, 496, 630, 747, 813, 939, 1152, 1197, 1326, 1360]
Total_PorchDeckSF : [53, 961, 1328, 1423, 1459]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


# standardization 

In [50]:
all_cols = X.columns
lis_ = []
for col in all_cols:
    if len(X[col].unique()) == 2:
        continue
    else: 
        lis_.append(col)
###############
#getting a list of all columns that are greater than 0.75 skew
pos_skewed_feats =[]
for col in lis_:
    if (X[col].skew() > 0.75):
        pos_skewed_feats.append(col)
################
#creating a new dataframe, replacing the relevant columns with the transformed 
X2 = X.copy()
for col in all_cols:
    if col in pos_skewed_feats:
        X2[col] = np.log1p(X2[col])
#################
#cheking for if it improved the skew
for col in pos_skewed_feats:
    print(col)
    print(X[col].skew())
    print(X2[col].skew())

GrLivArea
0.7998678263533677
-0.13065101569267792
LotArea
2.5052611623481176
-0.9214384390937395
LotShape
1.309985656555955
0.7831035648282987
LandSlope
4.813682424489448
4.297167061742489
EnclosedPorch
2.8508299091824068
2.216943569642373
Total_PorchDeckSF
1.008970579117751
-0.9070199627997609


# Train test split

In [51]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y, test_size=0.3, random_state=42)

In [52]:
en_model = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1], eps=1e-3, n_alphas=100, fit_intercept=True, 
                        normalize=True, precompute='auto', max_iter=2000, tol=0.0001, cv=6, 
                        copy_X=True, verbose=0, n_jobs=-1, positive=False, random_state=1)
en_model.fit(X2_train, y2_train)
print("Model Alpha:",en_model.alpha_)
print("Model Ratio:",en_model.l1_ratio_)
print("Model n iters:", en_model.n_iter_)
y_train_pred = en_model.predict(X2_train)
y_pred = en_model.predict(X2_test)
print("R square train:",metrics.r2_score(y_train_pred, y2_train))
print("R square test:",metrics.r2_score(y2_test, y_pred))
print("MRSE train:",np.sqrt(metrics.mean_squared_error(y_train_pred, y2_train)))
print("MRSE test:",np.sqrt(metrics.mean_squared_error(y_pred, y2_test)))

Model Alpha: 0.0001794392259340051
Model Ratio: 0.7
Model n iters: 46
R square train: 0.8971446685925352
R square test: 0.9205324428929217
MRSE train: 0.11647167806680636
MRSE test: 0.11610928603967066


## Full Model

In [53]:
full_model = en_model.fit(X2, y)
y_pred = full_model.predict(X2)
print("Full Model RMSE:",np.sqrt(metrics.mean_squared_error(y_pred, y)))
#MRSE train: 0.11647167806680636
#MRSE test: 0.11610928603967066
#Full Model RMSE: 0.1140172867438289

Full Model RMSE: 0.1140172867438289


In [54]:
cleaned_hp_test = pd.read_csv('data/cleaned_houseprice_test.csv')
train = list(X.columns)
test = list(cleaned_hp_test.columns)
for i in train:
    if i not in test:
        print("Missing from test:")
        print(i)
for i in test:
    if i not in train:
        print("Missing from train:")
        print(i)

Missing from train:
Id


## TEST CSV CREATION

In [55]:
cleaned_hp_test = pd.read_csv('data/cleaned_houseprice_test.csv')
cleaned_hp_test.shape

(1459, 110)

In [56]:
sum(cleaned_hp_test.isna().sum().sort_values(ascending = False))

0

# Test outliers

In [57]:
cleaned_hp_test = pd.read_csv('data/cleaned_houseprice_test.csv')

test_outliers = hpf.outlier_selecter(cleaned_hp,cleaned_hp_test,num_sd = 4,min_unique = 20, drop_zeros = True)
test_outliers

{'GrLivArea': [185, 304],
 'LotArea': [53,
  171,
  271,
  411,
  523,
  529,
  661,
  828,
  848,
  1057,
  1169,
  1184,
  1190,
  1287],
 'EnclosedPorch': [260,
  325,
  380,
  459,
  462,
  577,
  653,
  660,
  662,
  799,
  840,
  945,
  1119,
  1185,
  1248,
  1266,
  1445]}

In [58]:
cleaned_hp_test.head()

Unnamed: 0,Id,CentralAir,HeatingQC,garage_score,Heating,Electrical,GarageArea,TotalBsmtSF,finishedbsmt,2Types,...,Ext_Plywood,Ext_Stucco,Ext_WdSdng,Ext_WdShing,House_1.5Fin,House_1.5Unf,House_2.5Unf,House_2Story,House_SFoyer,House_SLvl
0,1461,1,3,6,1,5,730.0,882.0,0.693878,0,...,0,0,0,0,0,0,0,0,0,0
1,1462,1,3,6,1,5,312.0,1329.0,0.694507,0,...,0,0,1,0,0,0,0,0,0,0
2,1463,1,4,6,1,5,482.0,928.0,0.852371,0,...,0,0,0,0,0,0,0,1,0,0
3,1464,1,5,6,1,5,470.0,926.0,0.650108,0,...,0,0,0,0,0,0,0,1,0,0
4,1465,1,5,6,1,5,506.0,1280.0,0.205469,0,...,0,0,0,0,0,0,0,0,0,0


In [59]:
for col, idx in test_outliers.items():
    hpf.outlier_imputation(cleaned_hp,cleaned_hp_test,idx, col = col,method = "random",decimals = 2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [60]:
X_test = cleaned_hp_test.drop(["Id"],axis = 1)

In [61]:
all_cols = X_test.columns
lis_ = []
for col in all_cols:
    if col != "TotalBsmtSF":
        if len(X_test[col].unique()) == 2:
            continue
        else: 
            lis_.append(col)
###############
#getting a list of all columns that are greater than 0.75 skew
pos_skewed_feats =[]
for col in lis_:
    if (X_test[col].skew() > 0.75):
        pos_skewed_feats.append(col)
################
#creating a new dataframe, replacing the relevant columns with the transformed 
Xtest2 = X_test.copy()
for col in all_cols:
    if col in pos_skewed_feats:
        Xtest2[col] = np.log1p(Xtest2[col])
#################
#cheking for if it improved the skew
for col in pos_skewed_feats:
    print(col)
    print(X_test[col].skew())
    print(Xtest2[col].skew())

GrLivArea
1.1276647840362568
0.025649359092058246
Fireplaces
0.8198582704555165
0.29412900401924885
LotArea
3.12599856693982
-0.9206716350923922
LotShape
1.2042262704496145
0.7679560899873284
LandSlope
4.968389880056036
4.64800252535215
EnclosedPorch
4.784481964509475
1.8536469435613678
Total_PorchDeckSF
1.558989626117663
-0.9388049100673139


In [62]:
y_log_values = full_model.predict(Xtest2)
y = np.exp(y_log_values)
results = pd.DataFrame(zip(cleaned_hp_test.Id,y),columns = ["Id","SalePrice"])
results.head()

Unnamed: 0,Id,SalePrice
0,1461,124501.006289
1,1462,159296.000192
2,1463,177838.42627
3,1464,196819.019208
4,1465,203142.770219


In [17]:
test_submission = results.to_csv('data/submission.csv',index = False)