In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
from sklearn.model_selection import KFold, GridSearchCV

%matplotlib inline

In [61]:
def decade(year):
    return str(year)[0:3] + '0'

In [244]:
def clean_data(csv):
    df = pd.read_csv(csv, index_col='Id')
    df.drop(['PoolQC'], axis=1, inplace=True)
    for i in range(len(df.columns)):
        if df.dtypes[i] == 'object':
            df[df.columns[i]].fillna('NA',inplace=True)
    for i in range(len(df.columns)):
        if df.dtypes[i] == 'float64' or df.dtypes[i] == 'int64':
            df[df.columns[i]].fillna(0, inplace=True)
    for i in range(len(df.columns)):
        if df.dtypes[i] == 'float64':
            df[df.columns[i]] = df[df.columns[i]].astype(int)
    obj_cols = []
    for i in range(len(df.columns)):
        if df.dtypes[i] == 'object':
            obj_cols.append(df.columns[i])
    years = df[['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold']]
    for column in years.columns:
        df[column] = years[column].apply(decade)   
    areas = ['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','1stFlrSF','2ndFlrSF',
         'LowQualFinSF', 'GrLivArea','GarageArea','WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch','PoolArea','MiscVal', 'ScreenPorch']
    drop_cats = ['BsmtUnfSF', 'MSSubClass_30', 'MSSubClass_50', 'MSZoning_RL', 'Street_Pave',
                 'Alley_Pave', 'LotShape_Reg', 'LandContour_Lvl', 'LotConfig_Inside', 'LandSlope_Gtl',
                 'Condition1_PosN', 'Condition2_Norm', 'BldgType_1Fam', 'BldgType_Twnhs', 'OverallCond_6',
                 'YearBuilt_1950', 'YearRemodAdd_2000', 'RoofStyle_Gable', 'Exterior2nd_Plywood',
                 'MasVnrType_None', 'ExterQual_Gd', 'ExterCond_TA', 'Foundation_CBlock', 'BsmtQual_TA',
                 'BsmtCond_TA', 'BsmtExposure_Mn', 'BsmtFinType2_Rec', 'HeatingQC_Gd', 'BsmtFullBath_0',
                 'BsmtHalfBath_0', 'FullBath_1', 'BedroomAbvGr_2', 'KitchenAbvGr_0', 'KitchenQual_Gd',
                 'TotRmsAbvGrd_4', 'Fireplaces_1', 'FireplaceQu_Gd', 'GarageType_Attchd', 'GarageYrBlt_00',
                 'GarageYrBlt_1960', 'GarageFinish_Fin', 'GarageCars_2', 'PavedDrive_N', 'Fence_NA',
                 'MoSold_12', 'YrSold_2010', 'SaleType_COD', 'SaleCondition_Family']
    cat_and_rms = [check for check in df.columns if (check not in obj_cols and check not in years and check not in areas and check != 'SalePrice') ]
    df_dum = pd.get_dummies(df,columns = [cols for cols in df.columns if (cols not in areas and cols != 'SalePrice')],drop_first=True)
    for cat in drop_cats:
        if cat in df_dum:
            df_dum = df_dum.drop(cat, axis=1)
    return df_dum
    


In [245]:
train_df = clean_data('train.csv')
train_df.shape

(1460, 316)

In [246]:
test_df = clean_data('test.csv')
print test_df.shape

(1459, 308)


In [1]:
Lasso()

NameError: name 'Lasso' is not defined

In [247]:
train_df = train_df.drop([c for c in train_df.columns if c not in test_df.columns][1:], axis = 1)
train_df.shape

(1460, 294)

In [248]:
test_df = test_df.drop([c for c in test_df.columns if c not in train_df.columns],axis = 1)
test_df.shape

(1459, 293)

In [249]:
from sklearn.preprocessing import StandardScaler
target = 'SalePrice'
features = [c for c in train_df.columns if c != target]

y_train = train_df[target]

ss= StandardScaler()
X_train = ss.fit_transform(train_df[features])
X_train.shape

(1460L, 293L)

In [250]:
kf = KFold(n_splits=20, shuffle=True, random_state=2008)
ridge = Ridge()
params = {'alpha': np.logspace(-1, 5, 20)}
rgs = GridSearchCV(ridge, param_grid = params, scoring='r2', cv=kf)
rgs.fit(X_train,y_train)
print rgs.best_score_
print rgs.best_params_

0.820563449805
{'alpha': 1096.985797892384}


In [135]:
kf = KFold(n_splits=10, shuffle=True, random_state=2003)
lasso = Lasso()
params = {'alpha': np.linspace(0,5,20)}
lgs = GridSearchCV(lasso, param_grid = params, scoring='r2', cv=kf)
lgs.fit(X_train,y_train)
print lgs.best_score_
print lgs.best_params_

0.768963462487
{'alpha': 5.0}


In [162]:
kf = KFold(n_splits=20, shuffle=True, random_state=2008)
enet = ElasticNet()
params = {'alpha': np.linspace(0,5,20),
          'l1_ratio' : np.linspace(0.01,1.0,25)}
engs = GridSearchCV(enet, param_grid = params, scoring='r2', cv=kf)
engs.fit(X_train,y_train)
print engs.best_score_
print engs.best_params_

0.833200196245
{'alpha': 4.4736842105263159, 'l1_ratio': 0.79375000000000007}


In [251]:
X_test = ss.transform(test_df[features])
X_test.shape

(1459L, 293L)

In [252]:
x = rgs.predict(X_test)
test_df['SalePrice'] = x
x.shape

(1459L,)

In [255]:
test_df[['SalePrice']].to_csv('sub8.csv')

In [253]:
from sklearn.metrics import mean_squared_error
from math import sqrt

rms_s2 = sqrt(mean_squared_error(y_train[:-1], rgs.predict(X_test)))
rms_s2

104488.3766651606

In [215]:
lasso = Lasso(lgs.best_params_['alpha'])

In [223]:
lasso.fit(X_train,y_train)
lasso_coefs = pd.DataFrame({'predictor' : features,
                            'coefs' : lasso.coef_,})
drop_cats = lasso_coefs.loc[lasso_coefs['coefs'] == 0,'predictor']

In [226]:
drop_cats = [x for x in drop_cats]

In [228]:
print drop_cats

['BsmtUnfSF', 'MSSubClass_30', 'MSSubClass_50', 'MSZoning_RL', 'Street_Pave', 'Alley_Pave', 'LotShape_Reg', 'LandContour_Lvl', 'LotConfig_Inside', 'LandSlope_Gtl', 'Condition1_PosN', 'Condition2_Norm', 'BldgType_1Fam', 'BldgType_Twnhs', 'OverallCond_6', 'YearBuilt_1950', 'YearRemodAdd_2000', 'RoofStyle_Gable', 'Exterior2nd_Plywood', 'MasVnrType_None', 'ExterQual_Gd', 'ExterCond_TA', 'Foundation_CBlock', 'BsmtQual_TA', 'BsmtCond_TA', 'BsmtExposure_Mn', 'BsmtFinType2_Rec', 'HeatingQC_Gd', 'BsmtFullBath_0', 'BsmtHalfBath_0', 'FullBath_1', 'BedroomAbvGr_2', 'KitchenAbvGr_0', 'KitchenQual_Gd', 'TotRmsAbvGrd_4', 'Fireplaces_1', 'FireplaceQu_Gd', 'GarageType_Attchd', 'GarageYrBlt_00', 'GarageYrBlt_1960', 'GarageFinish_Fin', 'GarageCars_2', 'PavedDrive_N', 'Fence_NA', 'MoSold_12', 'YrSold_2010', 'SaleType_COD', 'SaleCondition_Family']


In [232]:
'LotFrontage' in test_df

True