In [1]:
import pandas as pd
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

from sklearn.linear_model import Ridge

import sys
sys.path.append('../')

from exp.features import Features
from exp.models.cat import CatBoostModel
from exp.models.lgbm import LGBMModel
from exp.models.rf import RandomForestModel
from exp.models.rgf import RGFModel
from exp.models.xgb import XGBModel

import warnings
warnings.filterwarnings("ignore")

import gc
gc.enable()

In [2]:
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")

In [3]:
features = Features(train, test)
df = features.create_features().reset_index(drop=True)

predictions = pd.DataFrame(df["price"])

In [4]:
df.head()

Unnamed: 0,id,region,year,condition,cylinders,odometer,size,price,age,odometer/age,odometer/cylinders,manufacturer_odometer_mean,manufacturer_odometer_std,manufacturer_odometer_max,manufacturer_odometer_min,manufacturer_odometer_diff,region_lat,region_lng,region_state,region_count,manufacturer_count,condition_count,fuel_count,title_status_count,transmission_count,drive_count,size_count,type_count,paint_color_count,state_count,manufacturer_alfa-romeo,manufacturer_aston-martin,manufacturer_audi,manufacturer_bmw,manufacturer_buick,manufacturer_cadillac,manufacturer_chevrolet,manufacturer_chrysler,manufacturer_dodge,manufacturer_fiat,manufacturer_ford,manufacturer_gmc,manufacturer_honda,manufacturer_hyundai,manufacturer_infiniti,manufacturer_jaguar,manufacturer_jeep,manufacturer_kia,manufacturer_land_rover,manufacturer_lexus,...,state_ar,state_az,state_ca,state_co,state_ct,state_dc,state_de,state_fl,state_ga,state_hi,state_ia,state_id,state_il,state_in,state_ks,state_ky,state_la,state_ma,state_md,state_me,state_mi,state_mn,state_mo,state_ms,state_mt,state_nan,state_nc,state_nd,state_ne,state_nh,state_nj,state_nm,state_nv,state_ny,state_oh,state_ok,state_or,state_pa,state_ri,state_sc,state_sd,state_tn,state_tx,state_ut,state_va,state_vt,state_wa,state_wi,state_wv,state_wy
0,0,nashville,1949,3,6,115148.0,2,27587.0,74,1556.054054,19191.333333,98963.759601,50487.072804,1484310.0,85.0,1484225.0,36.162277,-86.774298,Tennessee,425,6223,30440,49707,45081,12332,9754,18782,879,269,6333,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,state college,2013,0,8,172038.0,3,4724.0,10,17203.8,21504.75,150290.470805,63290.940575,386817.0,877.0,385940.0,40.79445,-77.861639,Pennsylvania,24,3528,4773,49707,45081,40990,9754,29356,18735,8669,1975,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2,2,wichita,1998,2,6,152492.0,3,10931.0,25,6099.68,25415.333333,118407.079338,61351.920244,1187840.0,10.0,1187830.0,37.692236,-97.337545,Kansas,470,13890,12020,49707,45081,40990,29213,29356,14566,8669,527,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,3,albany,2014,3,4,104118.0,2,16553.0,9,11568.666667,26029.5,118407.079338,61351.920244,1187840.0,10.0,1187830.0,42.651167,-73.754968,New York,1034,13890,30440,49707,45081,12332,29213,18782,14566,7623,3801,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,4,redding,2005,3,6,144554.0,2,5158.0,18,8030.777778,24092.333333,118407.079338,61351.920244,1187840.0,10.0,1187830.0,40.586356,-122.391675,California,327,13890,30440,49707,45081,12332,29213,18782,18735,2832,5341,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [5]:
lgbm = LGBMModel(df)
#lgbm.objective(20)
lgbm.best_params = {'num_leaves': 48, 'max_depth': 6, 'min_child_samples': 91, 'subsample': 0.5578230915019112, 'colsample_bytree': 0.5933052522026404, 'reg_alpha': 2.4725566626090776e-05, 'reg_lambda': 1.0114136512530978e-08, 'feature_fraction': 0.7523757350552451, 'bagging_fraction': 0.9199865329355417, 'bagging_freq': 5}
lgbm_predictions = lgbm.predict()
predictions = pd.concat([predictions, lgbm_predictions], axis=1)

ValueError: pandas dtypes must be int, float or bool.
Fields with bad pandas dtypes: region: object, region_state: object

In [None]:
xgb = XGBModel(df)
#xgb.objective(20)
xgb.best_params = {'n_estimators': 767, 'max_depth': 8, 'lambda': 1.2306916748991704e-06, 'alpha': 0.018078104089246788, 'colsample_bytree': 0.42319770953022684, 'subsample': 0.2810517802368746, 'min_child_weight': 218, 'gamma': 6.031109467976734e-08, 'eta': 0.018889170085640027}
xgb_predictions = xgb.predict()
predictions = pd.concat([predictions, xgb_predictions], axis=1)

In [None]:
#rf = RandomForestModel(df)
##rf.objective(5)
#rf.best_params = {'max_depth': 9, 'min_samples_split': 11, 'min_samples_leaf': 14, 'max_features': 0.6306125661502896, 'max_leaf_nodes': 18, 'n_estimators': 8762, 'bootstrap': True}
#rf_predictions = rf.predict()
#predictions = pd.concat([predictions, rf_predictions], axis=1)

In [None]:
#rgf = RGFModel(df)
##rgf.objective(5)
#rgf.best_params = {'max_leaf': 8072, 'algorithm': 'RGF_Opt', 'test_interval': 142, 'min_samples_leaf': 11, 'reg_depth': 9, 'l2': 0.0002082492344277923, 'sl2': 4.2919223241162815e-07, 'normalize': False}
#rgf_predictions = rgf.predict()
#predictions = pd.concat([predictions, rgf_predictions], axis=1)

In [None]:
#cat = CatBoostModel(df)
##cat.objective(5)
#cat.best_params = {"depth": 6}
#cat_predictions = cat.predict()
#predictions = pd.concat([predictions, cat_predictions], axis=1)

In [None]:
predictions

In [None]:
model = Ridge(random_state=0)
train = predictions[predictions["price"].notnull()]
test = predictions[predictions["price"].isnull()]
X_train = train.drop(["price"], axis=1)
y_train = train["price"]
X_test = test.drop(["price"], axis=1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
sub = pd.read_csv("../input/submit_sample.csv", header=None)
sub[sub.columns[1]] = y_pred
sub.to_csv("../output/submit0728_add_geo.csv", index=False, header=None)