In [4]:
import gc
import sys
import warnings

import pandas as pd
import polars as pl
from sklearn.linear_model import Ridge

# The project package `exp` lives one directory up; extend the path before importing it.
sys.path.append('../')

from exp.features import Features
from exp.models.cat import CatBoostModel
from exp.models.lgbm import LGBMModel
from exp.models.rf import RandomForestModel
from exp.models.rgf import RGFModel
from exp.models.xgb import XGBModel

warnings.filterwarnings("ignore")
gc.enable()

In [5]:
# Read the raw train and test tables (in that order) into polars DataFrames.
train, test = (
    pl.read_csv(csv_path)
    for csv_path in ("../input/train.csv", "../input/test.csv")
)

In [6]:
# Run the project's feature pipeline (exp.features.Features) on the raw tables.
# NOTE(review): `train`/`test` are rebound to the processed frames here, so
# re-running this cell would pass already-processed frames back into Features --
# presumably the load cell must be re-run first; confirm.
features = Features(train, test)
train, test = features.create_features()

# The stacking frame `predictions` is intentionally NOT built in this cell: an
# identical expression has its own cell directly before the model cells, and
# nothing reads `predictions` in between, so the copy that used to live here
# was dead code.

In [7]:
# Preview the first rows of the processed training frame (includes the `price` target).
train.head()

year,condition,cylinders,odometer,size,price,age,odometer/age,odometer/cylinders,manufacturer_odometer_mean,manufacturer_odometer_std,manufacturer_odometer_max,manufacturer_odometer_min,manufacturer_odometer_diff,lat,lng,region_count,manufacturer_count,condition_count,fuel_count,title_status_count,transmission_count,drive_count,size_count,type_count,paint_color_count,state_count,manufacturer_acura,manufacturer_alfa-romeo,manufacturer_aston-martin,manufacturer_audi,manufacturer_buick,manufacturer_cadillac,manufacturer_chevrolet,manufacturer_chrysler,manufacturer_dodge,manufacturer_fiat,…,state_il,state_in,state_ks,state_ky,state_la,state_ma,state_md,state_me,state_mi,state_mn,state_mo,state_ms,state_mt,state_nan,state_nc,state_nd,state_ne,state_nh,state_nj,state_nm,state_nv,state_ny,state_oh,state_ok,state_or,state_pa,state_ri,state_sc,state_sd,state_tx,state_ut,state_va,state_vt,state_wa,state_wi,state_wv,state_wy
i64,i64,i8,f64,i64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,…,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8
1949,3,6,115148.0,2,27587,74,1556.054054,19191.333333,99634.628778,54369.725486,1484310.0,85.0,1484225.0,36.162277,-86.774298,229,3044,15219,24785,22365,6156,4834,9411,418,144,689,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2013,0,8,172038.0,3,4724,10,17203.8,21504.75,149848.5,63701.285145,386817.0,877.0,385940.0,40.79445,-77.861639,12,1782,2404,24785,22365,20497,4834,14636,9259,4300,1107,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1998,2,6,152492.0,3,10931,25,6099.68,25415.333333,117840.777762,62743.660556,1187840.0,22.0,1187818.0,37.692236,-97.337545,233,6943,6009,24785,22365,20497,14602,14636,7311,4300,293,0,0,0,0,0,0,0,0,0,0,…,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2014,3,4,104118.0,2,16553,9,11568.666667,26029.5,117840.777762,62743.660556,1187840.0,22.0,1187818.0,42.651167,-73.754968,504,6943,15219,24785,22365,6156,14602,9411,7311,3765,2112,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2005,3,6,144554.0,2,5158,18,8030.777778,24092.333333,117840.777762,62743.660556,1187840.0,22.0,1187818.0,40.586356,-122.391675,153,6943,15219,24785,22365,6156,14602,9411,9259,1419,3050,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [8]:
# Preview the first rows of the processed test frame (no `price` column).
test.head()

year,condition,cylinders,odometer,size,age,odometer/age,odometer/cylinders,manufacturer_odometer_mean,manufacturer_odometer_std,manufacturer_odometer_max,manufacturer_odometer_min,manufacturer_odometer_diff,lat,lng,region_count,manufacturer_count,condition_count,fuel_count,title_status_count,transmission_count,drive_count,size_count,type_count,paint_color_count,state_count,manufacturer_acura,manufacturer_alfa-romeo,manufacturer_aston-martin,manufacturer_audi,manufacturer_buick,manufacturer_cadillac,manufacturer_chevrolet,manufacturer_chrysler,manufacturer_dodge,manufacturer_fiat,manufacturer_ford,…,state_il,state_in,state_ks,state_ky,state_la,state_ma,state_md,state_me,state_mi,state_mn,state_mo,state_ms,state_mt,state_nan,state_nc,state_nd,state_ne,state_nh,state_nj,state_nm,state_nv,state_ny,state_oh,state_ok,state_or,state_pa,state_ri,state_sc,state_sd,state_tx,state_ut,state_va,state_vt,state_wa,state_wi,state_wv,state_wy
i64,i64,i8,f64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,…,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8
2015,3,4,92553.0,3,8,11569.125,23138.25,106793.071841,72910.573408,2946000.0,120.0,2945880.0,39.0,-108.0,138,3807,15219,24785,22365,20497,14602,14636,7311,1419,804,0,0,0,0,0,0,1,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2013,4,4,134385.0,2,10,13438.5,33596.25,122265.837,58250.681702,396512.0,491.0,396021.0,43.216505,-123.341738,21,1000,3810,24785,985,20497,14602,9411,9259,10143,645,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2011,2,4,102489.0,3,12,8540.75,25622.25,111336.231218,62332.777216,1127500.0,1247.0,1126253.0,41.083064,-81.518485,102,772,6009,24785,22365,20497,14602,14636,9259,10143,961,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2016,3,6,64310.0,2,7,9187.142857,10718.333333,124055.680416,54971.511041,377569.0,1437.0,376132.0,39.739236,-104.984862,246,1297,15219,2639,22365,20497,8096,9411,7311,1419,804,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1999,3,8,180839.0,2,24,7534.958333,22604.875,122259.717853,55577.993427,346965.0,2267.0,344698.0,37.959055,-93.333475,66,1742,15219,24785,3004,20497,8096,9411,7311,4300,725,0,0,0,0,0,0,0,0,0,0,0,…,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [9]:
# Start the stacking frame: the known training prices on top, followed by one
# null Int64 `price` row per test row. The model cells append their prediction
# columns to this frame horizontally.
predictions = pl.concat(
    [
        train.select("price"),
        pl.DataFrame([None] * test.height, schema={"price": pl.Int64}),
    ]
)

In [12]:
# LightGBM base model (project wrapper from exp.models.lgbm).
lgbm = LGBMModel(train, test)

# The tuning call is left commented out; `best_params` is assigned directly instead.
#lgbm.objective(20)
lgbm.best_params = {
    'num_leaves': 48,
    'max_depth': 6,
    'min_child_samples': 91,
    'subsample': 0.5578230915019112,
    'colsample_bytree': 0.5933052522026404,
    'reg_alpha': 2.4725566626090776e-05,
    'reg_lambda': 1.0114136512530978e-08,
    'feature_fraction': 0.7523757350552451,
    'bagging_fraction': 0.9199865329355417,
    'bagging_freq': 5,
}

# In the recorded run predict() printed one "Fold_lgbm k" score per fold and its
# result shows up as the lgbm_pred_0..lgbm_pred_4 columns of `predictions`.
lgbm_predictions = lgbm.predict()

# Append the new columns side by side. Re-running this cell without first
# re-creating `predictions` would try to add the same column names again.
predictions = pl.concat(
    [predictions, lgbm_predictions],
    how="horizontal",
)

Fold_lgbm 0: 0.438997748685258
Fold_lgbm 1: 0.44367061958235204
Fold_lgbm 2: 0.4438502345358935
Fold_lgbm 3: 0.4453008427470675
Fold_lgbm 4: 0.4453268344511112


In [13]:
# XGBoost base model (project wrapper from exp.models.xgb).
xgb = XGBModel(train, test)

# The tuning call is left commented out; `best_params` is assigned directly instead.
#xgb.objective(20)
xgb.best_params = {
    'n_estimators': 767,
    'max_depth': 8,
    'lambda': 1.2306916748991704e-06,
    'alpha': 0.018078104089246788,
    'colsample_bytree': 0.42319770953022684,
    'subsample': 0.2810517802368746,
    'min_child_weight': 218,
    'gamma': 6.031109467976734e-08,
    'eta': 0.018889170085640027,
}

# In the recorded run predict() printed one "Fold_xgb k" score per fold and its
# result shows up as the xgb_pred_0..xgb_pred_4 columns of `predictions`.
xgb_predictions = xgb.predict()

# Append the new columns side by side. Re-running this cell without first
# re-creating `predictions` would try to add the same column names again.
predictions = pl.concat(
    [predictions, xgb_predictions],
    how="horizontal",
)

Fold_xgb 0: 0.4745561476623629
Fold_xgb 1: 0.4784323061866384
Fold_xgb 2: 0.4884161029488602
Fold_xgb 3: 0.48400381786286995
Fold_xgb 4: 0.48454445464054335


In [None]:
# DISABLED: random-forest base model. This cell is stale -- it passes `df`, which
# is not defined in the visible notebook (the active model cells pass
# `train, test`), and it joins with pandas-style `pd.concat(..., axis=1)` while
# `predictions` is a polars frame (the active cells use
# `pl.concat(..., how="horizontal")`). Port it before re-enabling.
#rf = RandomForestModel(df)
##rf.objective(5)
#rf.best_params = {'max_depth': 9, 'min_samples_split': 11, 'min_samples_leaf': 14, 'max_features': 0.6306125661502896, 'max_leaf_nodes': 18, 'n_estimators': 8762, 'bootstrap': True}
#rf_predictions = rf.predict()
#predictions = pd.concat([predictions, rf_predictions], axis=1)

In [None]:
# DISABLED: RGF base model. This cell is stale -- it passes `df`, which is not
# defined in the visible notebook (the active model cells pass `train, test`),
# and it joins with pandas-style `pd.concat(..., axis=1)` while `predictions` is
# a polars frame (the active cells use `pl.concat(..., how="horizontal")`).
# Port it before re-enabling.
#rgf = RGFModel(df)
##rgf.objective(5)
#rgf.best_params = {'max_leaf': 8072, 'algorithm': 'RGF_Opt', 'test_interval': 142, 'min_samples_leaf': 11, 'reg_depth': 9, 'l2': 0.0002082492344277923, 'sl2': 4.2919223241162815e-07, 'normalize': False}
#rgf_predictions = rgf.predict()
#predictions = pd.concat([predictions, rgf_predictions], axis=1)

In [None]:
# DISABLED: CatBoost base model. This cell is stale -- it passes `df`, which is
# not defined in the visible notebook (the active model cells pass
# `train, test`), and it joins with pandas-style `pd.concat(..., axis=1)` while
# `predictions` is a polars frame (the active cells use
# `pl.concat(..., how="horizontal")`). Port it before re-enabling.
#cat = CatBoostModel(df)
##cat.objective(5)
#cat.best_params = {"depth": 6}
#cat_predictions = cat.predict()
#predictions = pd.concat([predictions, cat_predictions], axis=1)

In [14]:
# Inspect the stacking frame: `price` plus one column per model and fold.
predictions.head()

price,lgbm_pred_0,lgbm_pred_1,lgbm_pred_2,lgbm_pred_3,lgbm_pred_4,xgb_pred_0,xgb_pred_1,xgb_pred_2,xgb_pred_3,xgb_pred_4
i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
27587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4724,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10931,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16553,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5158,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Second-stage (stacking) model: fit a Ridge regression on the base-model
# prediction columns, using rows with a known price, then predict the rows
# whose price is null (the test placeholder rows).
#
# `predictions` is a polars DataFrame, so rows are selected with
# filter()/is_not_null()/is_null() and columns are removed with drop("price").
# The pandas-style calls used previously (Series.notnull/isnull, boolean-mask
# indexing, drop(..., axis=1)) do not exist in polars and failed on the first line.
#
# The split is stored under its own names so the feature frames `train`/`test`
# used by the model cells are not overwritten.
#
# NOTE(review): the recorded `predictions.head()` output shows 0.0 in every
# *_pred_* column for the first training rows. If the model wrappers do not
# write out-of-fold predictions for training rows, this fit sees constant
# features and learns only an intercept -- confirm what predict() returns.
stack_train = predictions.filter(pl.col("price").is_not_null())
stack_test = predictions.filter(pl.col("price").is_null())

X_train = stack_train.drop("price").to_numpy()
y_train = stack_train["price"].to_numpy()
X_test = stack_test.drop("price").to_numpy()

model = Ridge(random_state=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Write the submission file: load the header-less sample submission, overwrite
# its second column with the stacked predictions, and save it without index or
# header. Requires pandas (`pd`), imported in the notebook's import cell.
sub = pd.read_csv("../input/submit_sample.csv", header=None)

# Fail with a clear message if the prediction count does not match the template.
if len(sub) != len(y_pred):
    raise ValueError(
        f"submission template has {len(sub)} rows but y_pred has {len(y_pred)} values"
    )

sub[sub.columns[1]] = y_pred
sub.to_csv("../output/submit0728_add_geo.csv", index=False, header=False)