In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV,Lasso
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import r2_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import time

sns.set_style('darkgrid')
%config InlineBackend.figure_format = 'retina'
%matplotlib inline


In [2]:
train=pd.read_csv('datasets/train_agged.csv')
test=pd.read_csv('datasets/test_agged.csv')

In [3]:
train=train.drop('Unnamed: 0',axis=1)
test=test.drop('Unnamed: 0',axis=1)
test=test.drop('SalePrice',axis=1)

### High level look at what I did:

    1. Drop collinear variables -> 0.6-0.9
    2. Scale variables
    3. Lasso down to 50 variables
    6. Lasso down again to 25 or so


### This sheet continues from the train and test set exported from aggregation_only sheet

In [4]:
#get the dataset
target = 'SalePrice'
nc = [x for x in train.columns if x != target]

X_train,X_test,y_train,y_test=train_test_split(train[nc],train[target])

ss = StandardScaler()
ss.fit(X_train)
X_train = ss.transform(X_train)
X_test= ss.transform(X_test)

test_sc=ss.transform(test)
X=ss.transform(train[nc])

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(1530, 167) (1530,)
(511, 167) (511,)


In [5]:
l_alphas = np.logspace(2, 6, 100)
lasso_model = Lasso(alpha=1000,max_iter=10000000)
#lasso_model = LassoCV(alphas=l_alphas, cv=10,max_iter=100000, fit_intercept=True)
lasso_model = lasso_model.fit(X_train, y_train)

In [6]:
new_columns=(test.columns[[(item>0)for column,item in zip(test.columns,lasso_model.coef_)]])

In [7]:
len(new_columns)

43

In [8]:
cross_val_score(lasso_model,X_train,y_train,cv=15).mean()

0.8569417658501843

In [9]:
new_columns

Index(['Lot Frontage', 'Lot Area', 'Overall Qual', 'Overall Cond',
       'Year Built', 'Year Remod/Add', 'Mas Vnr Area', 'Exter Qual',
       'Bsmt Qual', 'Bsmt Exposure', 'BsmtFin SF 1', 'Total Bsmt SF',
       'Heating QC', '1st Flr SF', 'Gr Liv Area', 'Kitchen Qual',
       'TotRms AbvGrd', 'Functional', 'Fireplaces', 'Fireplace Qu',
       'Garage Yr Blt', 'Garage Area', 'Land Contour_HLS',
       'Neighborhood_BrkSide', 'Neighborhood_Crawfor', 'Neighborhood_GrnHill',
       'Neighborhood_NoRidge', 'Neighborhood_NridgHt', 'Neighborhood_Somerst',
       'Neighborhood_StoneBr', 'Condition 1_Norm', 'Condition 1_PosA',
       'Condition 1_PosN', 'Roof Style_Hip', 'Exterior 1st_BrkFace',
       'Exterior 1st_CemntBd', 'Exterior 1st_VinylSd', 'Mas Vnr Type_Stone',
       'Misc Feature_Othr', 'Sale Type_Con', 'Sale Type_New', 'Sale Type_Oth',
       'bath_agg'],
      dtype='object')

In [10]:
train2=pd.DataFrame(train[new_columns])
train2['SalePrice']=train['SalePrice'].values
test2=pd.DataFrame(test[new_columns])

In [11]:
train2.head()

Unnamed: 0,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,Exter Qual,Bsmt Qual,Bsmt Exposure,...,Exterior 1st_BrkFace,Exterior 1st_CemntBd,Exterior 1st_VinylSd,Mas Vnr Type_Stone,Misc Feature_Othr,Sale Type_Con,Sale Type_New,Sale Type_Oth,bath_agg,SalePrice
0,0.0,13517,6,8,1976,2005,289.0,2,2,1,...,0,0,0,0,0,0,0,0,3.0,130500.0
1,43.0,11492,7,5,1996,1997,132.0,2,3,1,...,0,0,1,0,0,0,0,0,4.0,220000.0
2,68.0,7922,5,7,1953,2007,0.0,1,2,1,...,0,0,1,0,0,0,0,0,2.0,109000.0
3,73.0,9802,5,5,2006,2007,0.0,1,3,1,...,0,0,1,0,0,0,0,0,3.0,174000.0
4,82.0,14235,6,8,1900,1993,0.0,1,1,1,...,0,0,0,0,0,0,0,0,2.0,138500.0


In [12]:
train2.corr()['SalePrice'].sort_values(ascending=False)

SalePrice               1.000000
Overall Qual            0.800090
Exter Qual              0.712105
Gr Liv Area             0.699355
Kitchen Qual            0.692797
Bsmt Qual               0.652096
Garage Area             0.650319
Total Bsmt SF           0.630076
1st Flr SF              0.618345
bath_agg                0.610640
Year Built              0.571153
Garage Yr Blt           0.552273
Year Remod/Add          0.549522
Fireplace Qu            0.543061
TotRms AbvGrd           0.505493
Mas Vnr Area            0.505214
Fireplaces              0.471123
Heating QC              0.458016
Neighborhood_NridgHt    0.449408
BsmtFin SF 1            0.423152
Bsmt Exposure           0.417098
Sale Type_New           0.356484
Exterior 1st_VinylSd    0.342740
Mas Vnr Type_Stone      0.310744
Lot Area                0.297661
Roof Style_Hip          0.266052
Neighborhood_NoRidge    0.264384
Neighborhood_StoneBr    0.251550
Land Contour_HLS        0.207823
Lot Frontage            0.182080
Exterior 1

### Encode an ordinal out of some remaining dummy variables

In [13]:
def ordinalize(name,listy,df1, df2):
    aggname=name+'ordinal'
    df1[aggname]=np.zeros(shape=df1.iloc[:,0].shape)
    df2[aggname]=np.zeros(shape=df2.iloc[:,0].shape)
    
    for item in listy:
        itemmean=df1[df1[item]>0]['SalePrice'].mean()
        print(itemmean)
        df1[aggname]=df1[aggname]+df1[item]*itemmean
        df2[aggname]=df2[aggname]+df2[item]*itemmean
        
    nnbavg=df1[df1[aggname]==0]['SalePrice'].mean()
    df1[aggname]=df1[aggname].replace(0,nnbavg)
    df2[aggname]=df2[aggname].replace(0,nnbavg)
    
    df1=df1.drop(columns=listy,inplace=True)
    df2=df2.drop(columns=listy,inplace=True)
    return 

In [14]:
nbors=train2.columns[[('Neighborhood' in x) for x in train2.columns]]

In [15]:
nbors

Index(['Neighborhood_BrkSide', 'Neighborhood_Crawfor', 'Neighborhood_GrnHill',
       'Neighborhood_NoRidge', 'Neighborhood_NridgHt', 'Neighborhood_Somerst',
       'Neighborhood_StoneBr'],
      dtype='object')

In [16]:
train2.shape

(2041, 44)

In [17]:
ordinalize('nb',nbors,train2,test2)

127022.69736842105
205901.21126760563
280000.0
316294.125
323152.2727272727
227019.99224806202
327991.2972972973


In [18]:
train2.shape

(2041, 38)

In [19]:
#get the polynomial features
target = 'SalePrice'
nc = [x for x in train2.columns if x != target]

X_train,X_test,y_train,y_test=train_test_split(train2[nc],train2[target])

ss = StandardScaler()
ss.fit(X_train)
X_train = ss.transform(X_train)
X_test= ss.transform(X_test)
#test2=test2.drop('SalePrice',axis=1)
test2_sc=ss.transform(test2)
X=ss.transform(train2[nc])



pf = PolynomialFeatures(degree=2,interaction_only=True,include_bias=False)
X_train_poly=pf.fit_transform(X_train)
X_test_poly=pf.fit_transform(X_test)
X_train_poly.shape
X_poly=pf.fit_transform(X)
test2_sc_poly=pf.fit_transform(test2_sc)
test_poly_cols=pf.get_feature_names(test2.columns)

In [20]:
l_alphas = np.logspace(2, 6, 100)
lasso_model = Lasso(alpha=9000,max_iter=10000000)
#lasso_model = LassoCV(alphas=l_alphas, cv=10,max_iter=100000, fit_intercept=True)
lasso_model = lasso_model.fit(X_train_poly, y_train)

In [21]:
new_columns=[column for column,item in zip(test_poly_cols,lasso_model.coef_) if (item>0)]

In [22]:
len(new_columns)

21

In [23]:
cross_val_score(lasso_model,X_train,y_train,cv=15).mean()

0.7923132303559649

In [24]:
new_columns

['Overall Qual',
 'Exter Qual',
 'Bsmt Qual',
 'Bsmt Exposure',
 'BsmtFin SF 1',
 'Total Bsmt SF',
 '1st Flr SF',
 'Gr Liv Area',
 'Kitchen Qual',
 'Fireplaces',
 'Fireplace Qu',
 'Garage Area',
 'bath_agg',
 'Overall Qual Exter Qual',
 'Overall Qual Bsmt Exposure',
 'Overall Qual nbordinal',
 'Mas Vnr Area nbordinal',
 'Total Bsmt SF nbordinal',
 'Gr Liv Area nbordinal',
 'Land Contour_HLS Exterior 1st_CemntBd',
 'Sale Type_New nbordinal']

In [25]:
predictions_test=lasso_model.predict(test2_sc_poly)

In [26]:
submit=pd.DataFrame()

In [27]:
submit['Id']=list(test['Id'])
submit['SalePrice']=predictions_test
submit=submit.set_index('Id')
submit.to_csv('submissions/dllordinals.csv')

### The kaggle score for this was the b