In [21]:
import pandas as pd
import numpy as np

from sklearn.linear_model import Ridge

import sys
sys.path.append('../')

from exp.features import Features
from exp.models.cat import CatBoostModel
from exp.models.lgbm import LGBMModel
from exp.models.rf import RandomForestModel
from exp.models.rgf import RGFModel
from exp.models.xgb import XGBModel

import warnings
warnings.filterwarnings("ignore")

In [7]:
# Load the raw train/test tables from the input directory (paths are relative
# to the notebook's working directory).
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")

In [39]:
# Build the model-ready table from the raw train/test frames and give it a
# fresh 0..n-1 index so later column-wise concatenation can align on it.
features = Features(train, test)
df = features.create_features()
df = df.reset_index(drop=True)

# One-column frame holding the target; the base-model predictions are
# concatenated next to it further down to form the stacking table.
predictions = df["price"].to_frame()

In [41]:
# Preview the first rows of the engineered feature table.
df.head()

Unnamed: 0,id,year,condition,cylinders,odometer,size,price,age,odometer/age,odometer/cylinders,...,state_sd,state_tn,state_tx,state_ut,state_va,state_vt,state_wa,state_wi,state_wv,state_wy
0,0,1949,3,6,115148.0,2,27587.0,74,1556.054054,19191.333333,...,0,0,0,0,0,0,0,0,0,0
1,1,2013,0,8,172038.0,3,4724.0,10,17203.8,21504.75,...,0,0,0,0,0,0,0,0,0,0
2,2,1998,2,6,152492.0,3,10931.0,25,6099.68,25415.333333,...,0,0,0,0,0,0,0,0,0,0
3,3,2014,3,4,104118.0,2,16553.0,9,11568.666667,26029.5,...,0,0,0,0,0,0,0,0,0,0
4,4,2005,3,6,144554.0,2,5158.0,18,8030.777778,24092.333333,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# LightGBM base model with hard-coded hyperparameters.
# The tuning call `lgbm.objective(20)` is intentionally left disabled here;
# presumably the values below came from an earlier tuning run -- TODO confirm.
#
# NOTE(review): per the LightGBM parameter docs, `subsample` is an alias of
# `bagging_fraction` and `colsample_bytree` is an alias of `feature_fraction`.
# Each pair is set to two different values below, so only one value per pair
# can take effect -- confirm which one LGBMModel / LightGBM actually honours.
lgbm = LGBMModel(df)
lgbm.best_params = {
    'num_leaves': 48,
    'max_depth': 6,
    'min_child_samples': 91,
    'subsample': 0.5578230915019112,
    'colsample_bytree': 0.5933052522026404,
    'reg_alpha': 2.4725566626090776e-05,
    'reg_lambda': 1.0114136512530978e-08,
    'feature_fraction': 0.7523757350552451,
    'bagging_fraction': 0.9199865329355417,
    'bagging_freq': 5,
}
# In the recorded run, predict() printed one "Fold_lgbm i: <score>" line for
# each of 5 folds; the metric is defined inside LGBMModel, not in this notebook.
lgbm_predictions = lgbm.predict()

Fold_lgbm 0: 0.44012078238639385
Fold_lgbm 1: 0.44411108892613665
Fold_lgbm 2: 0.44381660826421043
Fold_lgbm 3: 0.4458463855556004
Fold_lgbm 4: 0.4460808387371563


In [10]:
# XGBoost base model with hard-coded hyperparameters.
# The tuning call `xgb.objective(20)` is intentionally left disabled here;
# presumably the values below came from an earlier tuning run -- TODO confirm.
xgb = XGBModel(df)
xgb.best_params = {
    'n_estimators': 767,
    'max_depth': 8,
    'lambda': 1.2306916748991704e-06,
    'alpha': 0.018078104089246788,
    'colsample_bytree': 0.42319770953022684,
    'subsample': 0.2810517802368746,
    'min_child_weight': 218,
    'gamma': 6.031109467976734e-08,
    'eta': 0.018889170085640027,
}
# In the recorded run, predict() printed one "Fold_xgb i: <score>" line for
# each of 5 folds; the metric is defined inside XGBModel, not in this notebook.
xgb_predictions = xgb.predict()

Fold_xgb 0: 0.47281271964744787
Fold_xgb 1: 0.47635503350243086
Fold_xgb 2: 0.4845484130674397
Fold_xgb 3: 0.4810216583343765
Fold_xgb 4: 0.4810724848695508


In [23]:
# Random-forest base model with hard-coded hyperparameters.
# The tuning call `rf.objective(5)` is intentionally left disabled here;
# presumably the values below came from an earlier tuning run -- TODO confirm.
rf = RandomForestModel(df)
rf.best_params = {
    'max_depth': 9,
    'min_samples_split': 11,
    'min_samples_leaf': 14,
    'max_features': 0.6306125661502896,
    'max_leaf_nodes': 18,
    'n_estimators': 8762,
    'bootstrap': True,
}
# In the recorded run, predict() printed one "Fold_rf i: <score>" line for
# each of 5 folds; the metric is defined inside RandomForestModel.
rf_predictions = rf.predict()

Fold_rf 0: 0.7415434758701457
Fold_rf 1: 0.7464606387454564
Fold_rf 2: 0.7561488533109301
Fold_rf 3: 0.7423480772406141
Fold_rf 4: 0.7488634326531528


In [12]:
# Regularized Greedy Forest base model with hard-coded hyperparameters.
# The tuning call `rgf.objective(5)` is intentionally left disabled here;
# presumably the values below came from an earlier tuning run -- TODO confirm.
rgf = RGFModel(df)
rgf.best_params = {
    'max_leaf': 8072,
    'algorithm': 'RGF_Opt',
    'test_interval': 142,
    'min_samples_leaf': 11,
    'reg_depth': 9,
    'l2': 0.0002082492344277923,
    'sl2': 4.2919223241162815e-07,
    'normalize': False,
}
# In the recorded run, predict() printed one "Fold_rgf i: <score>" line for
# each of 5 folds; the metric is defined inside RGFModel.
rgf_predictions = rgf.predict()

Fold_rgf 0: 0.6524549943211644
Fold_rgf 1: 0.6676265410237926
Fold_rgf 2: 0.675549999716749
Fold_rgf 3: 0.6593138620207522
Fold_rgf 4: 0.6653622709063274


In [13]:
# CatBoost base model; only the tree depth is overridden.
# The tuning call `cat.objective(5)` is intentionally left disabled here.
cat = CatBoostModel(df)
cat.best_params = {
    "depth": 6,
}
# In the recorded run, predict() printed one "Fold_cat i: <score>" line for
# each of 5 folds; the metric is defined inside CatBoostModel.
cat_predictions = cat.predict()

Fold_cat 0: 0.6433606651463853
Fold_cat 1: 0.6605377946284188
Fold_cat 2: 0.6689926166420962
Fold_cat 3: 0.658574178889984
Fold_cat 4: 0.6631411280467598


In [42]:
# Inspect the target-only frame before stacking; in the recorded output the
# trailing rows have an empty `price`.
predictions

Unnamed: 0,price
0,27587.0
1,4724.0
2,10931.0
3,16553.0
4,5158.0
...,...
55064,
55065,
55066,
55067,


In [43]:
# Assemble the stacking table: the target column followed by each base
# model's predictions, aligned on the row index.
#
# The table is rebuilt from `df` rather than from the previous value of
# `predictions`, so re-running this cell always yields the same columns
# instead of appending another copy of every prediction (and of `price`)
# on each execution.
predictions = pd.concat(
    [
        df[["price"]],
        lgbm_predictions,
        xgb_predictions,
        rf_predictions,
        rgf_predictions,
        cat_predictions,
    ],
    axis=1,
)

In [44]:
# Second-level (stacking) model: a ridge regression fitted on the base-model
# prediction columns to predict `price`.
model = Ridge(random_state=0)

# Rows with a known price are used for fitting; rows whose price is missing
# (presumably the test rows -- verify against Features) are the ones to predict.
# Dedicated names are used so the raw `train` / `test` frames loaded at the
# top of the notebook are not overwritten; otherwise re-running the feature
# cell afterwards would feed these prediction slices into Features().
has_price = predictions["price"].notnull()
stack_train = predictions[has_price]
stack_test = predictions[~has_price]

X_train = stack_train.drop(columns=["price"])
y_train = stack_train["price"]
X_test = stack_test.drop(columns=["price"])

# NOTE(review): this is only leakage-free if the base-model columns for the
# training rows are out-of-fold predictions -- confirm in exp.models.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [49]:
# Fill the sample submission with the stacked predictions and write it out.
# The sample file is read without a header row, so its columns are addressed
# by position; the second column receives the predictions.
sub = pd.read_csv("../input/submit_sample.csv", header=None)
pred_col = sub.columns[1]
sub[pred_col] = y_pred
sub.to_csv(
    "../output/submit0728_fullstacking.csv",
    header=None,
    index=False,
)