In [122]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor as rfr
from sklearn.metrics import auc, roc_curve, roc_auc_score, mean_squared_error
from sklearn.model_selection import ShuffleSplit

In [123]:
df = pd.read_csv('train.csv')

In [124]:
splitter = ShuffleSplit(n_splits=1, test_size=0.2, random_state=4242)

for train_index, test_index in splitter.split(df, df['SalePrice']):
    d_train = df.iloc[train_index]
    d_test = df.iloc[test_index]
    
    y_train = df['SalePrice'].iloc[train_index]
    y_test = df['SalePrice'].iloc[test_index]

In [125]:
categs = list(df.dtypes[df.dtypes == 'object'].index)
df[categs] = df[categs].fillna('NaN')
numers = [f for f in df if f not in (categs + ['Id', 'SalePrice'])]

dummy_train = pd.get_dummies(d_train[categs], columns=categs)
dummy_test = pd.get_dummies(d_test[categs], columns=categs)
dummy_cols = list(set(dummy_train) & set(dummy_test))
dummy_train = dummy_train[dummy_cols]
dummy_test = dummy_test[dummy_cols]

X_train = pd.concat([d_train[numers].fillna(-999),
                     dummy_train], axis=1)
X_test = pd.concat([d_test[numers].fillna(-999),
                     dummy_test], axis=1)

### Обучим лес на 10-fold cv и выведем качество

In [126]:
rgr_rf = rfr(n_estimators=10, max_depth=4, n_jobs=-1)

from sklearn.model_selection import KFold as kf
spltr = kf(n_splits=10, random_state=42)

for train_ind, test_ind in spltr.split(X_train, y_train):
    rgr_rf.fit(X_train.iloc[train_ind], y_train.iloc[train_ind])
    y_pred_test = rgr_rf.predict(X_train.iloc[test_ind])
    y_pred_train = rgr_rf.predict(X_train.iloc[train_ind])
    print('Train:')
    print(mean_squared_error(y_train.iloc[train_ind], y_pred_train))
    print('Test:')
    print(mean_squared_error(y_train.iloc[test_ind], y_pred_test))
    print(' ')
    

Train:
795777590.0366564
Test:
2794522730.1487966
 
Train:
802704650.9691269
Test:
1423094345.3884983
 
Train:
822471829.818606
Test:
1683450927.3922665
 
Train:
880066226.476135
Test:
1571751059.832118
 
Train:
873028747.5464164
Test:
1811029620.9352725
 
Train:
956465639.12236
Test:
780538637.3558258
 
Train:
924463379.5564485
Test:
1158340129.0475397
 
Train:
948470811.9926093
Test:
726406832.0658991
 
Train:
876057151.899705
Test:
1178834825.252115
 
Train:
880538612.677504
Test:
652182235.309959
 


### Построим лес на всем тесте и покажем значимость переменных

In [127]:
rgr_rf = rfr(n_estimators=10, max_depth=4, n_jobs=-1)
rgr_rf.fit(X_train, y_train)
imp = pd.Series(rgr_rf.feature_importances_)
imp.sort_values(ascending=False)

3      0.702647
15     0.100709
13     0.046122
12     0.028760
11     0.025801
         ...   
159    0.000000
158    0.000000
157    0.000000
156    0.000000
0      0.000000
Length: 247, dtype: float64

In [128]:
y_pred_rf_test = rgr_rf.predict(X_test)
y_pred_rf_train = rgr_rf.predict(X_train)

print('Train:')
print(mean_squared_error(y_train, y_pred_rf_train))
print('Test:')
print(mean_squared_error(y_test, y_pred_rf_test))


Train:
935236081.8063563
Test:
1339677817.2060168


### Обучим лог регрессию

In [129]:
from sklearn.preprocessing import StandardScaler

train_median = d_train[numers].median()

X_train_lin = pd.concat([d_train[numers].fillna(train_median),
                     d_train[numers + categs].isnull().astype(np.int8).add_suffix('_NaN'),
                     dummy_train], axis=1)

X_test_lin = pd.concat([d_test[numers].fillna(train_median),
                     d_test[numers + categs].isnull().astype(np.int8).add_suffix('_NaN'),
                     dummy_test], axis=1)

scaler = StandardScaler()
scaler.fit(X_train_lin[numers])

X_train_lin[numers] = scaler.transform(X_train_lin[numers])
X_test_lin[numers] = scaler.transform(X_test_lin[numers])

from sklearn.linear_model import LogisticRegression as lr
rgr_lr = lr(C=0.1)
rgr_lr.fit(X_train_lin, y_train)



LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [130]:
y_pred_lin_test = rgr_lr.predict(X_test_lin)
y_pred_lin_train = rgr_lr.predict(X_train_lin)

print('Train:')
print(mean_squared_error(y_train, y_pred_lin_train))
print('Test:')
print(mean_squared_error(y_test, y_pred_lin_test))


Train:
481665230.86558217
Test:
3184376277.4589043


### Для интереса можно обучить дерево

In [131]:
from sklearn.tree import DecisionTreeRegressor as dtr
rgr_dt = dtr()
rgr_dt.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [132]:
y_pred_dt_test = rgr_dt.predict(X_test)
y_pred_dt_train = rgr_dt.predict(X_train)

print('Train:')
print(mean_squared_error(y_train, y_pred_dt_train))
print('Test:')
print(mean_squared_error(y_test, y_pred_dt_test))

print('\nimportances:')
imp = pd.Series(rgr_dt.feature_importances_)
print(imp.sort_values(ascending=False))

Train:
0.0
Test:
1715785940.8321917

importances:
3      0.617027
15     0.092161
13     0.071131
11     0.034140
12     0.022274
         ...   
104    0.000000
187    0.000000
107    0.000000
186    0.000000
123    0.000000
Length: 247, dtype: float64


In [133]:
def get_meta_features(rgr, X_train, y_train, X_test, stack_cv):
    meta_train = np.zeros_like(y_train, dtype=float)
    meta_test = np.zeros_like(y_test, dtype=float)
    
    for i, (train_ind, test_ind) in enumerate(stack_cv.split(X_train, y_train)):
        
        rgr.fit(X_train.iloc[train_ind], y_train.iloc[train_ind])
        meta_train[test_ind] =rgr.predict(X_train.iloc[test_ind])
        meta_test += rgr.predict(X_test)
    
    return meta_train, meta_test / stack_cv.n_splits

In [134]:
stack_cv = kf(n_splits=10, random_state=555)

meta_train = []
meta_test = []
col_names = []

print('LR features...')
meta_tr, meta_te = get_meta_features(rgr_lr, X_train_lin, y_train, X_test_lin, stack_cv)

meta_train.append(meta_tr)
meta_test.append(meta_te)
col_names.append('lr_pred')

print('RF features...')
meta_tr, meta_te = get_meta_features(rgr_rf, X_train, y_train, X_test, stack_cv)

meta_train.append(meta_tr)
meta_test.append(meta_te)
col_names.append('rf_pred')

print('DT features...')
meta_tr, meta_te = get_meta_features(rgr_dt, X_train, y_train, X_test, stack_cv)

meta_train.append(meta_tr)
meta_test.append(meta_te)
col_names.append('dt_pred')

LR features...




RF features...
DT features...


In [135]:
X_meta_train = pd.DataFrame(np.stack(meta_train, axis=1), columns=col_names)
X_meta_test = pd.DataFrame(np.stack(meta_test, axis=1), columns=col_names)

In [149]:
rgr_lr_meta = lr(penalty='l2', C=0.1)
rgr_lr_meta.fit(X_meta_train, y_train)



LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [150]:
y_pred_meta_train = rgr_lr_meta.predict(X_meta_train)
y_pred_meta_test = rgr_lr_meta.predict(X_meta_test)

print('Train:')
print(mean_squared_error(y_train, y_pred_meta_train))
print('Test:')
print(mean_squared_error(y_test, y_pred_meta_test))


Train:
6720881779.799658
Test:
5102578369.133562
