# Libraries

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
from sklearn.model_selection import KFold

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import sys
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from numpy import argsort
import joblib
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
import lightgbm as lgb
import re
from xgboost import XGBRegressor
from scipy.stats import skew
from sklearn.preprocessing import PowerTransformer
from catboost import CatBoostRegressor, cv, Pool
import warnings
from joblib import Parallel, delayed

warnings.filterwarnings('ignore')
from catboost import CatBoostRegressor, Pool
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import numpy as np
pd.set_option('display.max_columns', 1000)
import optuna

# Feature Engineering Functions

In [2]:
# Segment ext_col (exterior colour) into coarse colour families.
# Keywords are matched as lower-case substrings of the colour name, so both
# spellings of grey/gray are listed explicitly.
neutral_colors = ['black', 'white', 'grey', 'gray', 'silver']
bright_colors = ['red', 'blue', 'yellow', 'green', 'orange']

def categorize_color(color):
    """Map a free-text colour name to 'Neutral', 'Bright' or 'Special'.

    The name is lower-cased and checked for any keyword from
    ``neutral_colors`` first, then ``bright_colors``; the neutral family wins
    when both match (e.g. "Blue Grey").

    Parameters
    ----------
    color : str or missing value
        Colour description. Non-string input (NaN, None, ...) is accepted.

    Returns
    -------
    str
        'Neutral', 'Bright', or 'Special' when no keyword matches or the
        value is not a string.
    """
    if not isinstance(color, str):
        # Missing values arrive as float NaN / None when this is used with
        # Series.apply; they have no .lower(), so use the catch-all bucket.
        return 'Special'

    color = color.lower()
    if any(neutral in color for neutral in neutral_colors):
        return 'Neutral'
    if any(bright in color for bright in bright_colors):
        return 'Bright'
    return 'Special'
    
# Brand -> market segment lookup table (segment name -> list of brand names).
segment_mapping = {
    'Luxury & Premium': [
        'Mercedes-Benz', 'BMW', 'Audi', 'Porsche', 'Lexus', 'Cadillac',
        'Jaguar', 'Land', 'Tesla', 'INFINITI', 'Lincoln', 'Acura',
        'Genesis', 'Volvo', 'Alfa', 'Maserati', 'Lucid',
    ],
    'Ultra-Luxury & Exotic': [
        'Rolls-Royce', 'Bentley', 'Ferrari', 'Lamborghini', 'McLaren',
        'Aston', 'Bugatti', 'Maybach',
    ],
    'Mainstream': [
        'Toyota', 'Honda', 'Nissan', 'Ford', 'Chevrolet', 'Hyundai',
        'Kia', 'Mazda', 'Subaru', 'Volkswagen', 'Buick', 'Chrysler',
        'Dodge', 'GMC', 'RAM', 'Jeep',
    ],
    'Budget & Economy': [
        'Mitsubishi', 'FIAT', 'smart', 'Suzuki', 'MINI',
    ],
    'Defunct, Niche & Special': [
        'Pontiac', 'Hummer', 'Saturn', 'Scion', 'Mercury', 'Saab',
        'Plymouth', 'Karma', 'Lotus', 'Polestar', 'Rivian',
    ],
}


def segment_brand(brand):
    """Return the market segment that lists ``brand``, or 'Unknown'.

    The comparison is an exact, case-sensitive membership test against the
    brand lists in ``segment_mapping``; the first segment (in dictionary
    order) containing the brand is returned.
    """
    matching_segments = (
        segment_name
        for segment_name, members in segment_mapping.items()
        if brand in members
    )
    return next(matching_segments, 'Unknown')

In [3]:
def segment_mileage(mileage):
    """Bucket a mileage value into a 20k-wide band label.

    Each band includes its upper bound (20000 -> '0-20k'). Anything that is
    not ``<=`` 100000 - including NaN, for which every comparison is False -
    is labelled '100k+'.
    """
    bands = (
        (20000, '0-20k'),
        (40000, '20k-40k'),
        (60000, '40k-60k'),
        (80000, '60k-80k'),
        (100000, '80k-100k'),
    )
    for upper_bound, label in bands:
        if mileage <= upper_bound:
            return label
    return '100k+'

def segment_age(age):
    """Label a vehicle age as 'New', 'Moderately New', 'Old' or 'Very Old'.

    Upper bounds are inclusive (3 -> 'New', 7 -> 'Moderately New',
    12 -> 'Old'). Values that satisfy none of the ``<=`` checks, NaN
    included, are labelled 'Very Old'.
    """
    tiers = ((3, 'New'), (7, 'Moderately New'), (12, 'Old'))
    for max_age, label in tiers:
        if age <= max_age:
            return label
    return 'Very Old'

In [4]:
def fe_engine(df, reference_year=2024):
    """Add the engineered features to a raw listings frame.

    The frame is modified in place and also returned, so both
    ``df = fe_engine(df)`` and a bare ``fe_engine(df)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``model_year``, ``milage``, ``transmission``,
        ``engine``, ``fuel_type``, ``accident``, ``clean_title``, ``brand``
        and ``ext_col``.
    reference_year : int, default 2024
        Year that vehicle age is measured against.

    Returns
    -------
    pandas.DataFrame
        The same object as ``df`` with missing values filled and the new
        feature columns appended.

    Notes
    -----
    The imputation means and per-age group means are computed from the frame
    that is passed in, so separate train and test calls each use their own
    statistics.
    """
    # Vehicle age, floored at 1: an age of 0 would divide by zero in
    # milage_age_ratio, and a model year after reference_year would otherwise
    # produce negative ages and ratios.
    df['age_car'] = (reference_year - df['model_year']).clip(lower=1)

    # --- Transmission type -------------------------------------------------
    # Manual is assigned last, so strings that mention both kinds
    # (e.g. "AT/MT") end up as 'Manual'.
    is_automatic = df['transmission'].str.contains(
        'A/T|Automatic|Dual Shift Mode|CVT|AT', na=False, case=False)
    is_manual = df['transmission'].str.contains(
        'M/T|Manual|MT', na=False, case=False)
    df['transmission_type'] = 'Unknown'
    df.loc[is_automatic, 'transmission_type'] = 'Automatic'
    df.loc[is_manual, 'transmission_type'] = 'Manual'

    # --- Fuel type, inferred from the engine description ------------------
    def engine_matches(pattern, case=True):
        # na=False: rows with a missing engine description never match.
        return df['engine'].str.contains(pattern, na=False, case=case)

    # This rule overwrites any existing fuel_type.
    # NOTE(review): 'charge' is matched case-insensitively as a substring, so
    # it also hits "Supercharged"/"Turbocharged" engines, and 'Electric' hits
    # hybrid descriptions - confirm that relabelling those rows as Electric
    # is intended.
    df.loc[engine_matches('Electric|Battery|kW|charge|kw', case=False), 'fuel_type'] = 'Electric'

    # The remaining rules only fill rows whose fuel_type is still missing;
    # they are applied in order, so earlier rules take precedence.
    fallback_rules = (
        (r'\bElectric\b|\bV\b', 'Electric'),
        ('Gasoline|Standard|Turbo|Liter|GDI|MPFI|PDI', 'Gasoline'),
        ('Hybrid', 'Hybrid'),
        ('Diesel|diesel', 'Diesel'),
        ('flex|Flex', 'E85 Flex Fuel'),
    )
    for pattern, fuel in fallback_rules:
        df.loc[df['fuel_type'].isna() & engine_matches(pattern), 'fuel_type'] = fuel

    # Assign the result back instead of calling fillna(inplace=True) on an
    # attribute access: the chained in-place form does not update the frame
    # under pandas Copy-on-Write.
    df['fuel_type'] = df['fuel_type'].fillna('Gasoline')

    # --- Engine specs parsed from the description -------------------------
    # column -> (regex with one capture group, decimals kept after imputing)
    engine_specs = (
        ('horsepower', r'(\d+\.?\d*)HP', 1),
        ('displacement', r'(\d+\.?\d*)L', 1),
        ('cylinders', r'(\d+) Cylinder', 0),
    )
    for column, pattern, decimals in engine_specs:
        parsed = df['engine'].str.extract(pattern, expand=False).astype(float)
        # Rows where the pattern is absent get the mean of the parsed values.
        df[column] = parsed.fillna(parsed.mean()).round(decimals)

    # --- Accident & clean_title imputation --------------------------------
    # Missing accident info: treated as no accident below 50,000 on the
    # odometer, as an accident otherwise.
    df.loc[df['accident'].isna() & (df['milage'] < 50000), 'accident'] = 'None reported'
    df['accident'] = df['accident'].fillna('At least 1 accident or damage reported')
    # No reported accident forces clean_title to 'Yes' (overwriting existing values).
    df.loc[df['accident'] == 'None reported', 'clean_title'] = 'Yes'
    df.loc[
        df['clean_title'].isna() & (df['accident'] == 'At least 1 accident or damage reported'),
        'clean_title',
    ] = 'No'
    df['clean_title'] = df['clean_title'].fillna("Yes")

    # --- Mileage / age interactions ---------------------------------------
    df['milage_age'] = df['milage'] * df['age_car']
    df['milage_age_ratio'] = df['milage'] / df['age_car']
    df['mean_milage_with_age'] = df.groupby(['age_car'])['milage'].transform('mean')
    df['mean_milage_age_ratio_with_age'] = df.groupby(['age_car'])['milage_age_ratio'].transform('mean')

    # --- Categorical segments ---------------------------------------------
    # Apply the brand segmentation and the colour / mileage / age bucketing.
    df['brand_segment'] = df['brand'].apply(segment_brand)
    df['ext_col_category'] = df['ext_col'].apply(categorize_color)
    df['mileage_segment'] = df['milage'].apply(segment_mileage)
    df['age_segment'] = df['age_car'].apply(segment_age)

    # --- Engine efficiency ratios -----------------------------------------
    df['engine_efficiency_1'] = df['horsepower'] / df['displacement']
    df['engine_efficiency_2'] = df['horsepower'] / df['cylinders']
    df['displacement_per_cylinder'] = df['displacement'] / df['cylinders']

    # --- Polynomial terms --------------------------------------------------
    df['age_car_squared'] = df['age_car'] ** 2
    df['horsepower_squared'] = df['horsepower'] ** 2

    return df

# Load Train data

In [5]:
# Read the training data from the working directory, using the `id` column as the index.
df = pd.read_csv("./train.csv", index_col= 'id')

In [6]:
# Run the feature pipeline on the training frame, then separate the
# predictors (everything except `price`) from the target.
df = fe_engine(df)

X = df.drop(columns=['price'])
y = df['price']

# Column groups by dtype: object/category columns are treated as categorical,
# every other predictor as numerical.
categorical_kinds = ['object', 'category']
numerical_features = X.select_dtypes(exclude=categorical_kinds).columns
categorical_features = X.select_dtypes(include=categorical_kinds).columns

In [7]:
# Display the non-object, non-category columns (raw and engineered, including `price`) as a sanity check.
df.select_dtypes(exclude = ['object', 'category'])

Unnamed: 0_level_0,model_year,milage,price,age_car,horsepower,displacement,cylinders,milage_age,milage_age_ratio,mean_milage_with_age,mean_milage_age_ratio_with_age,engine_efficiency_1,engine_efficiency_2,displacement_per_cylinder,age_car_squared,horsepower_squared
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,2007,213000,4200,17,172.0,1.6,4.0,3621000,12529.411765,118515.705906,6971.512112,107.500000,43.000000,0.4000,289,29584.00
1,2002,143250,4999,22,252.0,3.9,8.0,3151500,6511.363636,118470.721529,5385.032797,64.615385,31.500000,0.4875,484,63504.00
2,2002,136731,13900,22,320.0,5.3,8.0,3008082,6215.045455,118470.721529,5385.032797,60.377358,40.000000,0.6625,484,102400.00
3,2017,19500,45000,7,420.0,5.0,8.0,136500,2785.714286,67940.690480,9705.812926,84.000000,52.500000,0.6250,49,176400.00
4,2021,7388,97500,3,208.0,2.0,4.0,22164,2462.666667,29346.041378,9782.013793,104.000000,52.000000,0.5000,9,43264.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188528,2017,49000,27500,7,420.0,6.2,8.0,343000,7000.000000,67940.690480,9705.812926,67.741935,52.500000,0.7750,49,176400.00
188529,2018,28600,30000,6,385.0,3.0,6.0,171600,4766.666667,51728.104423,8621.350737,128.333333,64.166667,0.5000,36,148225.00
188530,2021,13650,86900,3,469.0,4.0,8.0,40950,4550.000000,29346.041378,9782.013793,117.250000,58.625000,0.5000,9,219961.00
188531,2022,13895,84900,2,343.3,3.0,6.0,27790,6947.500000,17824.175440,8912.087720,114.433333,57.216667,0.5000,4,117854.89


# Load Test data

In [8]:
# Read the test set (indexed by `id`) and push it through the same
# feature-engineering function that was applied to the training frame.
df_test = fe_engine(pd.read_csv("./test.csv", index_col='id'))

# Train, Val & Test (Ensemble XGBoost & CatBoost)

In [9]:
# K-fold cross-validated training of a CatBoost + XGBoost blend.
# Relies on X, y, categorical_features and df_test defined in earlier cells.
n_splits_ = 3
# Plain shuffled K-fold (the target is continuous, so no stratification).
skf = KFold(n_splits=n_splits_, shuffle=True, random_state=42)

rmse_scores = []

# Blend weights for the two models; used for both validation and test predictions.
cb_weight = 0.3
xgb_weight = 0.7

catboost_params = {
    'random_seed': 42,
    'early_stopping_rounds': 200,

    'learning_rate': 0.032089785965271685,
    'model_size_reg': 1.1498478100664318,
    'colsample_bylevel': 0.7398749059852404,
    'reg_lambda': 13.481452874196997,
    'n_estimators': 986,
    'max_depth': 10,
    'subsample': 0.5977235262240771,
}

xgb_params = {
    'lambda': 0.03880258557285165,
    'alpha': 0.02129832295514386,
    'colsample_bytree': 0.4,
    'subsample': 0.7,
    'learning_rate': 0.014,
    'max_depth': 17,
    'random_state': 2020,
    'min_child_weight': 85,
    'n_estimators': 10000,
    'early_stopping_rounds': 200,
}

# LightGBM settings; not used by the training loop in this cell.
lgb_params = {
    'num_leaves': 426,
    'max_depth': 20,
    'learning_rate': 0.011353178352988012,
    'n_estimators': 10000,
    'metric': 'rmse',
    'subsample': 0.5772552201954328,
    'colsample_bytree': 0.9164865430101521,
    'reg_alpha': 1.48699088003429e-06,
    'reg_lambda': 0.41539458543414265,
    'min_data_in_leaf': 73,
    'feature_fraction': 0.751673655170548,
    'bagging_fraction': 0.5120415391590843,
    'bagging_freq': 2,
    'random_state': 42,
    'min_child_weight': 0.017236362383443497,
    'cat_smooth': 54.81317407769262,
    'verbose': -1,
    'early_stopping_rounds': 200,
}

cat_cols = list(categorical_features)

# One column of test predictions per fold and per model.
test_preds_1 = np.zeros((len(df_test), n_splits_), dtype=np.float32)  # CatBoost
test_preds_2 = np.zeros((len(df_test), n_splits_), dtype=np.float32)  # XGBoost
test_preds_3 = np.zeros((len(df_test), n_splits_), dtype=np.float32)  # not filled in this cell

cv_scores = []

# CatBoost receives the raw (un-encoded) categorical columns via cat_features.
df_test_no_encoded = df_test.copy()

for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(f"Fold {fold+1}")
    # Explicit copies: the categorical columns are overwritten with their
    # encoded values below, which must not write through to X.
    X_train, X_val = X.iloc[train_index].copy(), X.iloc[test_index].copy()
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]

    # Keep un-encoded versions of the fold for CatBoost.
    X_train_no_encoded = X_train.copy()
    X_val_no_encoded = X_val.copy()

    df_test_encoded = df_test.copy()

    # Ordinal-encode the categoricals for XGBoost. The encoder is fitted on
    # the training fold only; categories unseen there are encoded as -1.
    encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    X_train[cat_cols] = encoder.fit_transform(X_train[cat_cols])
    X_val[cat_cols] = encoder.transform(X_val[cat_cols])
    df_test_encoded[cat_cols] = encoder.transform(df_test_encoded[cat_cols])

    print("CB training...")
    # Train CatBoost
    model_1 = CatBoostRegressor(**catboost_params, cat_features=cat_cols)
    model_1.fit(X_train_no_encoded, y_train, eval_set=(X_val_no_encoded, y_val), verbose=200)
    val_preds_1 = model_1.predict(X_val_no_encoded)

    print("-"*10)
    print("XGB training...")
    # Train XGBoost
    model_2 = XGBRegressor(**xgb_params)
    model_2.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=200)
    val_preds_2 = model_2.predict(X_val)

    # Ensemble predictions
    val_preds_ensemble = (val_preds_1*cb_weight + val_preds_2*xgb_weight)
    # RMSE as sqrt(MSE): mean_squared_error's `squared` argument is
    # deprecated/removed in recent scikit-learn releases.
    score = np.sqrt(mean_squared_error(y_val, val_preds_ensemble))
    cv_scores.append(score)

    # Store predictions for test set
    test_preds_1[:, fold] = model_1.predict(df_test_no_encoded)
    test_preds_2[:, fold] = model_2.predict(df_test_encoded)
    print(f"Fold {fold+1} RMSE: {score}")
    print('-'*50)

# Blend the two models per fold, then average the blended predictions across folds.
test_preds_mean = (test_preds_1*cb_weight + test_preds_2*xgb_weight)
test_preds = test_preds_mean.mean(axis=1)

print(f"Mean RMSE on Eval set: {np.mean(cv_scores)}")

Fold 1


# Hyperparameter Tuning with Optuna

In [10]:
# Set to True to run the hyperparameter search in this cell.
run_optuna=False

def objective(trial, model_name='catboost'):
    """Optuna objective: mean 3-fold RMSE of one model on the training data.

    Reads the notebook-level ``X``, ``y`` and ``cat_cols``. Categorical
    columns are ordinal-encoded inside every fold, with the encoder fitted on
    the training part only.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.
    model_name : {'catboost', 'xgboost'}, default 'catboost'
        Which model to tune. Only that model's search space is sampled, so
        the trial parameters contain nothing that was not actually used.
        To tune XGBoost, pass e.g. ``lambda t: objective(t, 'xgboost')`` to
        ``study.optimize``.

    Returns
    -------
    float
        Average validation RMSE over the folds (lower is better).

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names.
    """
    if model_name == 'catboost':
        model_cls = CatBoostRegressor
        params = {
            'iterations': 10000,
            'random_seed': 42,
            'loss_function': 'RMSE',
            'eval_metric': 'RMSE',
            'early_stopping_rounds': 500,
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
            'depth': trial.suggest_int('depth', 4, 16),
            # suggest_float(log=True) replaces the deprecated suggest_loguniform.
            'model_size_reg': trial.suggest_float('model_size_reg', 0.1, 10, log=True),
            'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.7, 1.0),
            'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 20.0),
        }
    elif model_name == 'xgboost':
        model_cls = XGBRegressor
        params = {
            'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
            'n_estimators': trial.suggest_int('n_estimators', 100, 1500),
            'max_depth': trial.suggest_int('max_depth', 3, 20),
            'min_child_weight': trial.suggest_int('min_child_weight', 50, 150),
            'subsample': trial.suggest_float('subsample', 0.5, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
            'gamma': trial.suggest_float('gamma', 0, 10),
            'lambda': trial.suggest_float('lambda', 0.0001, 10, log=True),
            'alpha': trial.suggest_float('alpha', 0.0001, 10, log=True),
            'random_state': 42,
            'early_stopping_rounds': 200,
        }
    else:
        raise ValueError(f"Unsupported model_name: {model_name!r}")

    skf = KFold(n_splits=3, shuffle=True, random_state=42)

    rmse_scores = []

    for train_index, test_index in skf.split(X, y):
        # Explicit copies so the encoded columns never write through to X.
        X_train, X_val = X.iloc[train_index].copy(), X.iloc[test_index].copy()
        y_train, y_val = y.iloc[train_index], y.iloc[test_index]

        encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
        X_train[cat_cols] = encoder.fit_transform(X_train[cat_cols])
        # transform only: re-fitting on the validation fold would give the
        # same category different codes in train and validation.
        X_val[cat_cols] = encoder.transform(X_val[cat_cols])

        # The model is fitted on the ordinal-encoded columns (no cat_features).
        model_fold = model_cls(**params)
        model_fold.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

        preds = model_fold.predict(X_val)
        # sqrt(MSE) instead of squared=False, which newer scikit-learn rejects.
        rmse_scores.append(np.sqrt(mean_squared_error(y_val, preds)))

    avg_rmse = np.mean(rmse_scores)
    return avg_rmse

if run_optuna:
    # Search for the hyperparameters that minimise the value returned by
    # `objective`, running 50 trials with n_jobs=-1.
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=50, n_jobs=-1)

    best_trial = study.best_trial
    best_params = best_trial.params

    print('Best trial:', best_trial)
    print('Best params:', best_params)

# Predict & Write Submission file

In [11]:
# Build the submission table: one `price` prediction per test `id`,
# written with `id` as the index column of the CSV.
test_preds_final = test_preds.copy()
submission_file = (
    df_test.reset_index()[['id']]
    .assign(price=test_preds_final)
    .set_index("id")
)
submission_file.to_csv("/kaggle/working/submission.csv")