# Import modules

In [52]:
import math
import pprint
import re
from copy import deepcopy
from functools import reduce
from typing import *

import catboost as cb
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import root_mean_squared_error, r2_score
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import TargetEncoder
from xgboost import XGBClassifier, XGBRegressor

from ktools.fitting.cross_validate_then_test_sklearn_model import CrossValidateTestSklearnModel
from ktools.fitting.lgbm_model import LGBMModel
from ktools.metrics.fast_matthew_correlation_coefficient import fast_matthews_corr_coeff
from ktools.preprocessing.categorical_denoiser_prepreprocesser import CategoricalDenoiserPreprocessor
from ktools.preprocessing.categorical_features_embedder import SortMainCategories
from ktools.preprocessing.categorical_string_label_error_imputator import CategoricalLabelErrorImputator
from ktools.preprocessing.kaggle_dataset_manager import KaggleDatasetManager
from ktools.utils.data_science_pipeline_settings import DataSciencePipelineSettings

# Main

In [2]:
# def update(df):
    
#     t = 100
    
#     cat_c = ['brand','model','fuel_type','engine','transmission','ext_col','int_col','accident','clean_title']
#     re_ = ['model','engine','transmission','ext_col','int_col']
    
#     for col in re_:
#         df.loc[df[col].value_counts(dropna=False)[df[col]].values < t, col] = "noise"
        
#     for col in cat_c:
#         df[col] = df[col].fillna('missing')
#         df[col] = df[col].astype('category')
        
#     return df

# train  = update(train)
# # test   = update(test)

In [53]:
class ConvertToLower:
    """Pipeline step that lower-cases the text of every categorical column."""

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        """Return a deep copy of the settings with categorical columns lower-cased.

        Every column listed in ``categorical_col_names`` is replaced in
        ``combined_df`` by its ``.str.lower()`` version. The settings object
        passed in is left untouched.
        """
        lowered = deepcopy(original_settings)
        frame = lowered.combined_df
        for name in lowered.categorical_col_names:
            frame[name] = frame[name].str.lower()
        return lowered

In [54]:
class UsedCarSpecificConverter:
    """Feature-engineering step for the used-car price data.

    ``transform`` works on a deep copy of the settings object and adds these
    columns to ``combined_df``:

    * ``horsepower``, ``liters``, ``cylinders`` -- numbers parsed from ``engine``
    * ``speed`` -- number parsed from ``transmission``
    * ``injection``, ``camshaft``, ``transmission_clean`` -- coarse text labels
    * ``expensive_ext_col``, ``expensive_int_col`` -- 0/1 colour-list flags
    * ``twin_turbo``, ``turbo`` -- 0/1 flags from ``engine``
    * ``length_model``, ``length_ext_col``, ``length_int_col`` -- string lengths
    * ``basic_ext_color``, ``basic_int_color`` -- result of
      ``CategoricalLabelErrorImputator.impute`` on the two colour columns

    It also overwrites ``fuel_type`` with ``'electric'`` for matching model
    names and finally collapses rare values of the raw text columns into
    ``"noise"``.

    Every pattern and colour name below is lower-case, so they only match
    lower-cased text (``ConvertToLower`` produces that).

    NOTE: the derived columns are added to ``combined_df`` only; this step does
    not append them to ``training_col_names`` or ``categorical_col_names``.
    """

    # Values seen fewer than this many times are replaced by "noise".
    _RARE_VALUE_THRESHOLD = 100
    _RARE_VALUE_COLUMNS = ('model', 'engine', 'transmission', 'ext_col', 'int_col')

    _EXPENSIVE_EXT_COLORS = [
        'blue caelum', 'dark sapphire', 'bianco monocerus', 'c / c',
        'ice', 'tempest', 'beluga black', 'bianco icarus metallic', 'blu eleos',
        'shadow black', 'nero noctis', 'sandstone metallic', 'lizard green', 'balloon white', 'onyx',
        'donington grey metallic', 'china blue', 'diamond white', 'rosso corsa',
        'granite', 'rosso mars metallic',
        'carpathian grey', 'kemora gray metallic', 'grigio nimbus', 'dash', 'bianco isis', 'python green',
        'fountain blue', 'custom color', 'vega blue', 'designo magno matte',
        'brands hatch gray metallic',
        'rift metallic', 'gentian blue metallic',
        'arancio borealis', 'blue',
        'aventurine green metallic',
        'apex blue', 'daytona gray pearl effect',
        'daytona gray pearl effect w/ black roof', 'matte white',
        'carpathian grey premium metallic', 'blue metallic', 'santorini black metallic',
        'quartzite grey metallic', 'carrara white metallic', 'black',
        'kinetic blue',
        'nero daytona',
    ]

    _EXPENSIVE_INT_COLORS = [
        'dark auburn',
        'hotspur',
        'cobalt blue',
        'beluga hide',
        'linen',
        'beluga',
        'black / brown',
        'nero ade',
        'sahara tan',
        'portland',
    ]

    # (label, substrings that select it); the first matching row wins.
    _CAMSHAFT_KEYWORDS = (
        ('dohc', ('dohc',)),          # double overhead camshaft
        ('sohc', ('sohc',)),          # single overhead camshaft
        ('ohv', ('ohv',)),            # overhead valve
        ('vtec', ('vtec',)),          # variable valve timing and lift electronic control
    )
    _INJECTION_KEYWORDS = (
        ('ddi', ('ddi',)),            # direct diesel injection
        ('gdi', ('gdi',)),            # gasoline direct injection
        ('mpfi', ('mpfi',)),          # multi-point fuel injection
        ('pdi', ('pdi',)),            # port fuel injection
        ('tfsi', ('tfsi', 'tsi')),    # turbo stratified injection
        ('gtdi', ('gtdi',)),          # gasoline turbocharged direct injection
        ('sidi', ('sidi',)),          # spark ignition direct injection
    )

    @staticmethod
    def _find_pattern(pattern, text):
        """Return capture group 1 of the first match of ``pattern`` in ``text``.

        Returns ``None`` when there is no match or when ``text`` is not a string
        (missing cells arrive as NaN/None, which ``re.search`` rejects).
        """
        if not isinstance(text, str):
            return None
        match = re.search(pattern, text)
        return match.group(1) if match else None

    @staticmethod
    def _first_label(text, keyword_table):
        """Return the label of the first row of ``keyword_table`` found in ``text``, else 'other'."""
        if isinstance(text, str):
            for label, keywords in keyword_table:
                if any(keyword in text for keyword in keywords):
                    return label
        return 'other'

    @staticmethod
    def _transmission(text):
        """Map a raw transmission description onto one of four coarse labels.

        CVT is tested first so that values such as "automatic cvt" are not
        swallowed by the automatic branch. The abbreviations "a/t", "at",
        "m/t" and "mt" are matched as whole tokens; a plain substring test for
        "at" would match inside unrelated words.
        """
        if not isinstance(text, str):
            return 'other'
        if 'cvt' in text:
            return 'continuously variable transmission'
        if 'automatic' in text or re.search(r'\ba/?t\b', text):
            return 'automatic transmission'
        if 'manual' in text or re.search(r'\bm/?t\b', text):
            return 'manual transmission'
        return 'other'

    @staticmethod
    def _camshafts(text):
        """Label the valve-train keyword found in an engine description."""
        return UsedCarSpecificConverter._first_label(text, UsedCarSpecificConverter._CAMSHAFT_KEYWORDS)

    @staticmethod
    def _injection(text):
        """Label the fuel-injection keyword found in an engine description."""
        return UsedCarSpecificConverter._first_label(text, UsedCarSpecificConverter._INJECTION_KEYWORDS)

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        """Return a deep copy of the settings with the engineered columns added.

        The settings object passed in is not modified.
        """
        helpers = UsedCarSpecificConverter
        settings = deepcopy(original_settings)
        df = settings.combined_df

        # (new column, source column, regex whose group 1 is the number)
        numeric_specs = (
            ('horsepower', 'engine', r'(\d*\.?\d+)\s*hp'),
            ('liters', 'engine', r'(\d*\.?\d+)\s*(l|liter)'),
            ('cylinders', 'engine', r'(\d*\.?\d+)\s*cylinder'),
            ('speed', 'transmission', r'(\d*\.?\d+)\s*(-speed|speed)'),
        )
        for feature, source, pattern in numeric_specs:
            extracted = df[source].apply(lambda text, p=pattern: helpers._find_pattern(p, text))
            df[feature] = extracted.astype('float64')

        df['injection'] = df['engine'].apply(helpers._injection).astype('object')
        df['camshaft'] = df['engine'].apply(helpers._camshafts).astype('object')
        df['transmission_clean'] = df['transmission'].apply(helpers._transmission).astype('object')

        # na=False keeps the masks strictly boolean when a cell is missing.
        electric_by_name = df['model'].str.contains('model y|model x|model s|model 3', regex=True, na=False)
        df.loc[electric_by_name, 'fuel_type'] = 'electric'
        df.loc[df['model'].str.contains('electric', na=False), 'fuel_type'] = 'electric'

        df['expensive_ext_col'] = df['ext_col'].isin(helpers._EXPENSIVE_EXT_COLORS).astype(int)
        df['expensive_int_col'] = df['int_col'].isin(helpers._EXPENSIVE_INT_COLORS).astype(int)
        df['twin_turbo'] = df['engine'].str.contains('twin turbo', na=False).astype(int)
        df['turbo'] = df['engine'].str.contains('turbo', na=False).astype(int)
        # .str.len() yields NaN for a missing cell where len(x) would raise.
        df['length_model'] = df['model'].str.len()
        df['length_ext_col'] = df['ext_col'].str.len()
        df['length_int_col'] = df['int_col'].str.len()

        clean_colors = ['ext_col', 'int_col']
        string_imputator = CategoricalLabelErrorImputator(verbose=True)
        df[['basic_ext_color', 'basic_int_color']] = string_imputator.impute(df[clean_colors],
                                                                              clean_colors,
                                                                              1500)
        df['basic_ext_color'] = df['basic_ext_color'].astype('object')
        df['basic_int_color'] = df['basic_int_color'].astype('object')

        # Collapse rare raw values last, so the features above were derived
        # from the original text.
        for col in helpers._RARE_VALUE_COLUMNS:
            counts = df[col].value_counts(dropna=False)
            df.loc[counts[df[col]].values < helpers._RARE_VALUE_THRESHOLD, col] = "noise"

        return settings

In [55]:
class BasicUsedCarSpecificConverter:
    """Pipeline step that parses four numeric features out of the text columns.

    Adds ``horsepower``, ``liters`` and ``cylinders`` (from ``engine``) and
    ``speed`` (from ``transmission``) to ``combined_df`` as float64 columns and
    appends their names to ``training_col_names``. The patterns are lower-case,
    so they only match lower-cased text.
    """

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        """Return a deep copy of the settings with the numeric features added.

        A cell that does not match its pattern, or that is missing, becomes NaN.
        The settings object passed in is not modified.
        """
        settings = deepcopy(original_settings)

        def find_pattern(pattern, text):
            # Missing cells arrive as NaN/None; re.search would raise TypeError on them.
            if not isinstance(text, str):
                return None
            match = re.search(pattern, text)
            return match.group(1) if match else None

        # (new column, source column, regex whose group 1 is the number)
        feature_specs = (
            ('horsepower', 'engine', r'(\d*\.?\d+)\s*hp'),
            ('liters', 'engine', r'(\d*\.?\d+)\s*(l|liter)'),
            ('cylinders', 'engine', r'(\d*\.?\d+)\s*cylinder'),
            ('speed', 'transmission', r'(\d*\.?\d+)\s*(-speed|speed)'),
        )
        df = settings.combined_df
        for feature, source, pattern in feature_specs:
            extracted = df[source].apply(lambda text, p=pattern: find_pattern(p, text))
            df[feature] = extracted.astype('float64')

        settings.training_col_names += [feature for feature, _, _ in feature_specs]
        return settings

In [56]:
class BackPackerMinimalProcessing:
    """Pipeline step that collapses rare text values into the label "noise".

    Value counts are taken separately on the train and the test frame, so the
    same value can be kept in one of them and replaced in the other.
    """

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings,
                  threshold=100,
                  columns=('model', 'engine', 'transmission', 'ext_col', 'int_col')):
        """Return a deep copy of the settings with rare values replaced.

        Parameters
        ----------
        original_settings : settings object; it is not modified.
        threshold : values occurring fewer than this many times within a frame
            are replaced by "noise".
        columns : names of the columns to process.

        ``combined_df`` is rebuilt from the processed train and test frames
        with the outer index keys 'train' and 'test'.
        """
        settings = deepcopy(original_settings)
        # The step reads settings.train_df / settings.test_df right after this
        # call, so update() is relied on to populate them.
        settings.update()

        def replace_rare(frame):
            # Work on a copy: the incoming frame may be a slice of combined_df.
            frame = frame.copy()
            for col in columns:
                counts = frame[col].value_counts(dropna=False)
                frame.loc[counts[frame[col]].values < threshold, col] = "noise"
            return frame

        settings.train_df = replace_rare(settings.train_df)
        settings.test_df = replace_rare(settings.test_df)
        settings.combined_df = pd.concat([settings.train_df, settings.test_df], keys=['train', 'test'])
        return settings

In [7]:
class SergeyConverter:
    """Pipeline step that adds integer encodings and 0/1 flags.

    Unlike the lower-case converters in this notebook, the lookup values and
    patterns here use the original capitalisation of the raw data.
    """

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        """Return a deep copy of the settings with the encoded columns added.

        Eight columns are added to ``combined_df``; only the two colour flags
        are appended to ``training_col_names``.
        """
        settings = deepcopy(original_settings)

        # (source column, new column, value -> code); unmapped values become NaN.
        ordinal_encodings = (
            ('fuel_type', 'fuel_type_encoded', {
                'Gasoline': 0,
                'Hybrid': 1,
                'E85 Flex Fuel': 2,
                'uknown': 3,
                'Diesel': 4,
                'dash': 5,
                'Plug-In Hybrid': 6,
                'not supported': 7,
            }),
            ('accident', 'accident_encoded', {
                'None reported': 0,
                'At least 1 accident or damage reported': 1,
                'uknown': 2,
            }),
            ('clean_title', 'clean_title_encoded', {
                'Yes': 0,
                'uknown': 1,
            }),
        )

        premium_exterior = [
            'Blue Caelum', 'Dark Sapphire', 'Bianco Monocerus', 'C / C', 'Ice',
            'Tempest', 'Beluga Black', 'Bianco Icarus Metallic',
            'BLU ELEOS', 'Shadow Black', 'Nero Noctis', 'Sandstone Metallic',
            'Lizard Green', 'Balloon White', 'Onyx', 'Donington Grey Metallic',
            'China Blue', 'Diamond White', 'Rosso Corsa', 'Granite',
            'Rosso Mars Metallic', 'Carpathian Grey', 'Kemora Gray Metallic',
            'Grigio Nimbus', 'dash', 'Bianco Isis', 'Python Green', 'Fountain Blue',
            'Custom Color', 'Vega Blue', 'Designo Magno Matte',
            'Brands Hatch Gray Metallic', 'Rift Metallic', 'Gentian Blue Metallic',
            'Arancio Borealis', 'BLUE', 'Aventurine Green Metallic', 'Apex Blue',
            'Daytona Gray Pearl Effect', 'Daytona Gray Pearl Effect w/ Black Roof',
            'Matte White', 'Carpathian Grey Premium Metallic', 'Blue Metallic',
            'Santorini Black Metallic', 'Quartzite Grey Metallic',
            'Carrara White Metallic', 'BLACK', 'Kinetic Blue', 'Nero Daytona',
        ]

        premium_interior = [
            'Dark Auburn', 'Hotspur', 'Cobalt Blue', 'Beluga Hide', 'Linen',
            'Beluga', 'Black / Brown', 'Nero Ade', 'Sahara Tan', 'Portland',
        ]

        premium_horsepower = [
            443.0, 473.0, 493.0, 502.0, 521.0, 542.0, 543.0, 571.0, 572.0, 573.0, 580.0,
            591.0, 602.0, 611.0, 616.0, 620.0, 624.0, 640.0, 641.0, 651.0, 710.0, 715.0,
            760.0, 788.0, 797.0,
        ]

        frame = settings.combined_df
        for source_col, encoded_col, codes in ordinal_encodings:
            frame[encoded_col] = frame[source_col].map(codes)

        frame['expensive_color_ext_encoded'] = frame['ext_col'].isin(premium_exterior).astype(int)
        frame['expensive_color_int_encoded'] = frame['int_col'].isin(premium_interior).astype(int)

        horsepower = frame['engine'].str.extract(r'(\d+\.?\d*)HP').astype(float)
        frame['expensive_hp'] = horsepower.isin(premium_horsepower).astype(int)
        frame['cylinder'] = frame['engine'].str.extract(r'(\d+\.?\d*) Cylinder').astype(float)
        # 1 when the model name contains a number followed by " V".
        v_suffix = frame['model'].str.extract(r'(\d+\.?\d*) V')
        frame['got_V'] = v_suffix.notna().astype(int)

        settings.combined_df = frame
        settings.training_col_names += ['expensive_color_ext_encoded',
                                        'expensive_color_int_encoded']
        return settings

In [57]:
class FillNullValues:
    """Pipeline step that fills missing values in the training columns."""

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings, numeric_fill=-1, category_fill='missing'):
        """Return a deep copy of the settings with nulls filled.

        Each column named in ``training_col_names`` is filled with
        ``numeric_fill`` when its dtype is numeric and with ``category_fill``
        otherwise. Columns outside that list are left alone.
        """
        filled = deepcopy(original_settings)
        frame = filled.combined_df
        for name in filled.training_col_names:
            column = frame[name]
            replacement = numeric_fill if pd.api.types.is_numeric_dtype(column) else category_fill
            frame[name] = column.fillna(replacement)
        return filled

In [58]:
class ConvertObjectToCategorical:
    """Pipeline step that casts the categorical columns to pandas 'category' dtype."""

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        """Return a deep copy of the settings with the categorical columns cast.

        The columns are those listed in ``categorical_col_names``; the settings
        object passed in is not modified.
        """
        converted = deepcopy(original_settings)
        categorical_names = converted.categorical_col_names
        as_category = converted.combined_df[categorical_names].astype('category')
        converted.combined_df[categorical_names] = as_category
        return converted

In [59]:
class LogTransformTarget:
    """Pipeline step that adds a log-transformed copy of the target column."""

    @staticmethod
    def transform(original_settings : DataSciencePipelineSettings):
        """Return a deep copy of the settings whose target is ``log(target + 1)``.

        A new column named ``'log_' + target_col_name`` is added to
        ``combined_df``, ``target_col_name`` is switched to that new name and
        ``logged`` is set to True. The original target column is kept.
        """
        logged_settings = deepcopy(original_settings)
        raw_name = logged_settings.target_col_name
        log_name = 'log_' + raw_name
        shifted = logged_settings.combined_df[raw_name] + 1
        logged_settings.combined_df[log_name] = np.log(shifted)
        logged_settings.target_col_name = log_name
        logged_settings.logged = True
        return logged_settings

In [60]:
# Input files and the name of the regression target.
train_csv_path = "data/used_car_prices/train_combined.csv"
test_csv_path = "data/used_car_prices/test.csv"
target_col_name = "price"

settings = DataSciencePipelineSettings(train_csv_path, test_csv_path, target_col_name)

# Preprocessing steps, applied in this order. Each step takes a settings
# object and returns a modified deep copy, so `settings` itself stays untouched.
full_transforms = [
    ConvertToLower.transform,
    BackPackerMinimalProcessing.transform,
    FillNullValues.transform,
    ConvertObjectToCategorical.transform,
    LogTransformTarget.transform,
]
# basic_transforms = [ConvertToLower.transform, FillNullValues.transform, ConvertObjectToCategorical.transform, LogTransformTarget.transform]

# Feed the output of every step into the next one.
full_settings = settings
for apply_step in full_transforms:
    full_settings = apply_step(full_settings)
# basic_settings = reduce(lambda acc, func: func(acc), basic_transforms, settings)

In [183]:
# train_df, test_df = basic_settings.update()

# data_manager = KaggleDatasetManager(train_df,
#                                     basic_settings.training_col_names,
#                                     basic_settings.target_col_name,
#                                     0.9,
#                                     0.1,
#                                     0)

# (X_train, 
# X_valid, 
# X_test, 
# y_train, 
# y_valid,
# y_test) = data_manager.dataset_partition()


# # model = XGBRegressor(**{}, 
# #                     verbosity=0,
# #                     eval_metric='logloss',
# #                     tree_method='hist',
# #                     enable_categorical=True)

# model = ()

# num_splits = 10
# eval_metrics = {"r2" : r2_score, "rmse" : lambda y, yhat : root_mean_squared_error(np.exp(y), np.exp(yhat))}
# skf = KFold(n_splits=num_splits)

# model, cv_scores, test_scores = CrossValidateTestSklearnModel(model,
#                               eval_metrics,
#                               skf,
#                               num_splits).evaluate(X_train,
#                                                    y_train,
#                                                    X_test,
#                                                    y_test)

Final Model r2: 0.638607

10-fold cross validation r2:  0.6376903986890982

Final Model rmse: 70167.345097

10-fold cross validation rmse:  73703.32201325778

In [186]:
# Gradient-boosting hyperparameter set. In this notebook it is only referenced
# by the commented-out lgb.train call further down.
model_params = dict(
    learning_rate=0.017521301504983752,
    max_depth=42,
    reg_alpha=0.06876635751774487,
    reg_lambda=9.738899198284985,
    num_leaves=131,
    subsample=0.2683765421728044,
    colsample_bytree=0.44346036599709887,
    n_estimators=1000,
    random_state=42,
)

In [187]:
# from typing import Dict
# import numpy as np
# import pandas as pd
# from copy import deepcopy
# from sklearn.model_selection import StratifiedKFold
# from ktools.fitting.i_sklearn_model import ISklearnModel
# import lightgbm as lgb
# from lightgbm import log_evaluation, early_stopping


# class CrossValidateTestSklearnModel:

#     def __init__(self,
#                  sklearn_model_instance : ISklearnModel,
#                  evaluation_metrics : Dict[str, callable],
#                  kfold_object = None,
#                  num_splits : int = 5) -> None:
#         self.model = sklearn_model_instance
#         self._evaluation_metrics = evaluation_metrics
#         self._metric_names = list(evaluation_metrics.keys())
#         self._kf = kfold_object
#         self._num_metrics = len(self._metric_names)
#         self._num_splits = num_splits
#         self._model_list = []

#     def _fit_then_predict(self, X, y, X_test, y_test):
#         # model = deepcopy(self.model).fit(X, y)
        
#         train_data = lgb.Dataset(X, label=y)
#         val_data = lgb.Dataset(X_test, label=y_test, reference=train_data)
#         model = lgb.train(model_params,
#                     train_data,
#                     valid_sets=[train_data, val_data],
#                     valid_names=['train', 'valid'],
#                     callbacks=callbacks    
#                     )
#         y_pred = model.predict(X_test)
#         return y_pred, model

#     def evaluate(self,
#                  X_train, y_train,
#                  X_test, y_test):

#         cv_results = np.zeros((self._num_splits, self._num_metrics))
#         cv_scores = None

#         if self._kf is not None:
#             for i, (train_index, val_index) in enumerate(self._kf.split(X_train, y_train)):
#                 X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
#                 y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]

#                 y_pred, model = self._fit_then_predict(X_train_fold,
#                                                        y_train_fold,
#                                                        X_val_fold,
#                                                        y_val_fold)
#                 self._model_list += [model]

#                 for j, metric in enumerate(self._metric_names):
#                     score = self._evaluation_metrics[metric](np.array(y_val_fold), np.array(y_pred))
#                     cv_results[i][j] = score
            
#             cv_scores = pd.DataFrame(columns=self._metric_names, data=cv_results)
#             cv_scores.describe()

#         y_pred, self.model = self._fit_then_predict(X_train,
#                                                      y_train,
#                                                      X_test,
#                                                      y_test)
#         test_scores = {}
#         for j, metric in enumerate(self._metric_names):
#             score = self._evaluation_metrics[metric](np.array(y_test), np.array(y_pred))
#             test_scores[metric] = score
#             print(f"Final Model {metric}: {score:.6f}")
#             print(f"{self._num_splits}-fold cross validation {metric}: ", cv_results[:,j].mean())
        
#         return self._model_list, cv_scores, test_scores

In [68]:
# Split the processed data back into its train and test parts.
train_df, test_df = full_settings.update()

# Select the features by name so that training uses exactly the columns, in
# exactly the order, that are passed to predict() for the test set later on
# (test_df[full_settings.training_col_names]).
feature_cols = list(full_settings.training_col_names)
assert not {'price', 'log_price'} & set(feature_cols), "target column listed as a feature"
X = train_df[feature_cols]

# The model is fit on the raw price, not on the 'log_price' column produced by
# LogTransformTarget, so predictions are already on the price scale.
y = train_df['price']

In [82]:
# LightGBM hyperparameters for the regressor (an earlier parameter set is kept
# in `model_params` above).
# NOTE(review): 'subsample' / 'colsample_bytree' are documented LightGBM aliases
# of 'bagging_fraction' / 'feature_fraction', so both settings appear twice with
# different values -- confirm which value LGBMModel ends up using and drop the
# other one.
lgbm_params = {
    'num_leaves': 426,
    'max_depth': 20,
    'learning_rate': 0.011353178352988012,
    'n_estimators': 884,
    'subsample': 0.5772552201954328,
    'colsample_bytree': 0.9164865430101521,
    'reg_alpha': 1.48699088003429e-06,
    'reg_lambda': 0.41539458543414265,
    'min_data_in_leaf': 73,
    'feature_fraction': 0.751673655170548,
    'bagging_fraction': 0.5120415391590843,
    'bagging_freq': 2,
    'min_child_weight': 0.017236362383443497,
    'cat_smooth': 54.81317407769262,
    'verbose' : -1,
}
model = LGBMModel(lgbm_params)

# Shuffled 10-fold cross validation with a fixed seed; both metrics are
# computed on the raw price scale because y is the raw price.
num_splits = 10
eval_metrics = {"r2" : r2_score, "rmse" : root_mean_squared_error}
skf = KFold(n_splits=num_splits, shuffle=True, random_state=42)

cvt = CrossValidateTestSklearnModel(model,
                                    eval_metrics,
                                    skf,
                                    num_splits)

# NOTE(review): X / y are passed both as the training data and as the "test"
# data, so `test_scores` presumably describes data the model has already seen;
# rely on `cv_scores` for an out-of-fold estimate.
modellist, cv_scores, test_scores = cvt.evaluate(X,
                                                 y,
                                                 X,
                                                 y)



Training until validation scores don't improve for 200 rounds
[150]	train's l2: 5.05793e+09	valid's l2: 6.90248e+09
[300]	train's l2: 4.83999e+09	valid's l2: 6.87395e+09
[450]	train's l2: 4.69867e+09	valid's l2: 6.86895e+09
[600]	train's l2: 4.57539e+09	valid's l2: 6.86855e+09
Early stopping, best iteration is:
[506]	train's l2: 4.65092e+09	valid's l2: 6.86412e+09




Training until validation scores don't improve for 200 rounds
[150]	train's l2: 5.09179e+09	valid's l2: 4.05412e+09
[300]	train's l2: 4.87031e+09	valid's l2: 4.01722e+09
[450]	train's l2: 4.72347e+09	valid's l2: 4.0292e+09
Early stopping, best iteration is:
[294]	train's l2: 4.87767e+09	valid's l2: 4.01674e+09




Training until validation scores don't improve for 200 rounds
[150]	train's l2: 5.21987e+09	valid's l2: 3.06837e+09
[300]	train's l2: 5.00229e+09	valid's l2: 3.06955e+09
Early stopping, best iteration is:
[211]	train's l2: 5.11259e+09	valid's l2: 3.05561e+09




Training until validation scores don't improve for 200 rounds
[150]	train's l2: 5.17415e+09	valid's l2: 7.74212e+09
[300]	train's l2: 4.9534e+09	valid's l2: 7.71269e+09
[450]	train's l2: 4.81124e+09	valid's l2: 7.71342e+09
Early stopping, best iteration is:
[269]	train's l2: 4.98812e+09	valid's l2: 7.70838e+09




Training until validation scores don't improve for 200 rounds
[150]	train's l2: 5.07989e+09	valid's l2: 6.97472e+09
[300]	train's l2: 4.8594e+09	valid's l2: 6.92943e+09
[450]	train's l2: 4.7136e+09	valid's l2: 6.95073e+09
Early stopping, best iteration is:
[261]	train's l2: 4.90501e+09	valid's l2: 6.9266e+09




Training until validation scores don't improve for 200 rounds
[150]	train's l2: 5.35802e+09	valid's l2: 2.04103e+09
[300]	train's l2: 5.13168e+09	valid's l2: 1.99151e+09
[450]	train's l2: 4.98074e+09	valid's l2: 2.00218e+09
Early stopping, best iteration is:
[342]	train's l2: 5.08433e+09	valid's l2: 1.9886e+09




Training until validation scores don't improve for 200 rounds
[150]	train's l2: 5.02718e+09	valid's l2: 5.33621e+09
[300]	train's l2: 4.81341e+09	valid's l2: 5.30043e+09
[450]	train's l2: 4.67121e+09	valid's l2: 5.30896e+09
Early stopping, best iteration is:
[314]	train's l2: 4.79857e+09	valid's l2: 5.29754e+09




Training until validation scores don't improve for 200 rounds
[150]	train's l2: 5.22515e+09	valid's l2: 3.96986e+09
[300]	train's l2: 5.00129e+09	valid's l2: 3.9326e+09
[450]	train's l2: 4.85167e+09	valid's l2: 3.94089e+09
Early stopping, best iteration is:
[256]	train's l2: 5.05257e+09	valid's l2: 3.92941e+09




Training until validation scores don't improve for 200 rounds
[150]	train's l2: 5.15185e+09	valid's l2: 5.60105e+09
[300]	train's l2: 4.93268e+09	valid's l2: 5.5506e+09
[450]	train's l2: 4.79175e+09	valid's l2: 5.54449e+09
Early stopping, best iteration is:
[388]	train's l2: 4.84842e+09	valid's l2: 5.54318e+09




Training until validation scores don't improve for 200 rounds
[150]	train's l2: 5.01427e+09	valid's l2: 5.2856e+09
[300]	train's l2: 4.79999e+09	valid's l2: 5.24873e+09
[450]	train's l2: 4.65762e+09	valid's l2: 5.26385e+09
Early stopping, best iteration is:
[293]	train's l2: 4.80704e+09	valid's l2: 5.24728e+09




Training until validation scores don't improve for 200 rounds
[150]	train's l2: 5.11284e+09	valid's l2: 5.60351e+09
[300]	train's l2: 4.89986e+09	valid's l2: 5.51753e+09
[450]	train's l2: 4.75628e+09	valid's l2: 5.51483e+09


KeyboardInterrupt: 

In [74]:
# Show the cross-validation scores returned by cvt.evaluate.
cv_scores

Unnamed: 0,r2,rmse
0,0.149905,71865.903981
1,0.129084,78258.335918
2,0.141249,73935.313046
3,0.199779,61444.646637
4,0.162315,70246.510424
5,0.16356,68580.791613
6,0.134637,78478.903372
7,0.163526,70860.18938
8,0.172488,70498.970832
9,0.133301,79482.435314


In [75]:
# Load the submission template, indexed by its first column. The path is
# relative to the project root, like the train/test CSV paths used above, so the
# notebook does not depend on one user's home directory.
sub = pd.read_csv('data/used_car_prices/sample_submission.csv', index_col=0)
# sub['price'] = np.exp(cvt.model.predict(test_df[full_settings.training_col_names])) - 1
# sub

In [77]:
# One prediction column per fitted model in `modellist`: price_0, price_1, ...
for i, mdl in enumerate(modellist):
    test_features = test_df[full_settings.training_col_names]
    fold_prediction = mdl.predict(test_features)
    sub[f'price_{i}'] = fold_prediction

In [81]:
# Inspect the submission frame together with the per-model prediction columns.
sub

Unnamed: 0_level_0,price,price_0,price_1,price_2,price_3,price_4,price_5,price_6,price_7,price_8,price_9
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
188533,18345.165615,17430.847781,18253.846302,20071.999728,18525.575144,18385.070618,18211.265040,17764.622467,19093.842606,17626.489251,18088.097213
188534,79120.161055,75521.639139,81660.250486,76325.226222,74766.842719,83339.456723,82790.000427,76794.423556,77001.147962,81801.094759,81201.528554
188535,54667.929227,60505.932659,52311.905126,53426.997306,56908.938123,54086.369324,55524.911828,54578.046401,51453.646276,54941.687299,52940.857934
188536,31480.856848,29528.303174,31440.834176,32948.076766,35249.955166,32076.795088,30490.405012,30985.736485,32980.863207,29646.546113,29461.053299
188537,29701.401953,29731.452448,29072.467456,30774.457125,29941.858085,30248.735942,29387.729177,29262.812778,30177.839242,28644.281951,29772.385328
...,...,...,...,...,...,...,...,...,...,...,...
314218,30720.193571,32004.250406,28135.660743,32641.403654,31469.502690,34898.698723,31554.652620,30125.286082,31352.918466,28801.817045,26217.745279
314219,53239.669271,55130.557371,49380.177069,52588.636045,51068.424637,53134.141745,54035.126300,54901.437714,51267.601381,57594.238215,53296.352234
314220,20085.596915,19169.563608,20056.591920,21303.632521,20252.431235,19976.080156,20311.538289,19658.523726,20204.352044,20016.881978,19906.373673
314221,15467.033823,14443.093903,15878.179644,17433.046985,15757.256253,15860.185645,15103.931419,15259.776670,15673.812722,14060.305593,15200.749397


In [78]:
# Final prediction: row-wise mean over every column except 'price' itself.
per_model_predictions = sub.drop(columns=['price'])
sub['price'] = per_model_predictions.mean(axis=1)

In [79]:
# Write only the averaged 'price' column (plus the index) to the submission file.
submission_path = 'submissions/used_cars/used_car_submission_v14.csv'
final_price = sub['price']
final_price.to_csv(submission_path)

In [80]:
# Show the averaged price column that is written to the submission file.
sub['price']

id
188533    18345.165615
188534    79120.161055
188535    54667.929227
188536    31480.856848
188537    29701.401953
              ...     
314218    30720.193571
314219    53239.669271
314220    20085.596915
314221    15467.033823
314222    42003.546320
Name: price, Length: 125690, dtype: float64