S4:E9 Car🚘Price Prediction|Regression

https://www.kaggle.com/code/rv1922/s4-e9-car-price-prediction-regression/notebook

In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error
import warnings 
warnings.filterwarnings('ignore')
import datetime
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from catboost import CatBoostRegressor
import lightgbm as lgb
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import RidgeCV 

In [26]:
import os

os.chdir('E:\Python code\汽车价格')

train1 = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train2 = pd.read_excel('used_cars.xlsx')

In [27]:
train2['milage'] = train2['milage'].str.replace(r'\D', '', regex=True).astype(int)
#train2['price'] = train2['price'].str.replace(r'\D', '', regex=True).astype(int)

In [28]:
train2.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,Ford,Utility Police Interceptor Base,2013,51000,E85 Flex Fuel,300.0HP 3.7L V6 Cylinder Engine Flex Fuel Capa...,6-Speed A/T,Black,Black,At least 1 accident or damage reported,Yes,10300
1,Hyundai,Palisade SEL,2021,34742,Gasoline,3.8L V6 24V GDI DOHC,8-Speed Automatic,Moonlight Cloud,Gray,At least 1 accident or damage reported,Yes,38005
2,Lexus,RX 350 RX 350,2022,22372,Gasoline,3.5 Liter DOHC,Automatic,Blue,Black,None reported,,54598
3,INFINITI,Q50 Hybrid Sport,2015,88900,Hybrid,354.0HP 3.5L V6 Cylinder Engine Gas/Electric H...,7-Speed A/T,Black,Black,None reported,Yes,15500
4,Audi,Q3 45 S line Premium Plus,2021,9835,Gasoline,2.0L I4 16V GDI DOHC Turbo,8-Speed Automatic,Glacier White Metallic,Black,None reported,,34999


In [29]:
train = pd.concat([train1, train2], axis=0, ignore_index=True)

In [30]:
train = train.drop("id", axis=1)
test = test.drop("id", axis=1)

train['clean_title'] = train['clean_title'].fillna('No')
test['clean_title'] = test['clean_title'].fillna('No')

train['clean_title']=train['clean_title'].map({'Yes': 0,'No': 1})
test['clean_title']=test['clean_title'].map({'Yes': 0,'No': 1})

In [31]:
train['accident'] = train['accident'].fillna('None reported')
test['accident'] = test['accident'].fillna('None reported')

train['accident']=train['accident'].map({'None reported': 0,'At least 1 accident or damage reported': 1})
test['accident']=test['accident'].map({'None reported': 0,'At least 1 accident or damage reported': 1})

def fill_clean_title(row):
    if pd.isna(row['clean_title']):
        return 'missing'
    return row['clean_title']

train['clean_title'] = train.apply(fill_clean_title, axis=1)
test['clean_title'] = test.apply(fill_clean_title, axis=1)


def extract_data_from_engine(df):
    df['horsepower'] = df['engine'].str.extract(r'(\d+\.\d+)(?=HP)').astype(float)
    df['engine_size'] = df['engine'].str.extract(r'(\d+\.\d+)(?=L)').astype(float)
    df['cylinders'] = df['engine'].str.extract(r'(\d+)\s(Cylinder|V\d|Straight)')[0].astype(float)
    return df


train = extract_data_from_engine(train)
test = extract_data_from_engine(test)



def extract_fuel_type(engine_info):
    if pd.isna(engine_info):
        return np.nan
    if 'Gasoline' in engine_info:
        return 'Gasoline'
    elif 'Hybrid' in engine_info:
        return 'Hybrid'
    elif 'Flex Fuel' in engine_info or 'E85' in engine_info:
        return 'Flex Fuel'
    elif 'Diesel' in engine_info:
        return 'Diesel'
    elif 'Electric' in engine_info:
        return 'Electric'
    else:
        return np.nan


train['extracted_fuel_type'] = train['engine'].apply(extract_fuel_type)
train['fuel_type'].fillna(train['extracted_fuel_type'], inplace=True)
train.drop(columns=['extracted_fuel_type'], inplace=True)

test['extracted_fuel_type'] = test['engine'].apply(extract_fuel_type)
test['fuel_type'].fillna(test['extracted_fuel_type'], inplace=True)
test.drop(columns=['extracted_fuel_type'], inplace=True)

train = train.drop('engine', axis=1)
test = test.drop('engine', axis=1)


In [32]:
train['transmission'] = train['transmission'].astype('str')

In [33]:
#categorical_columns = ['brand', 'model', 'fuel_type', 'transmission', 'ext_col', 'int_col']
categorical_columns = ['brand', 'model', 'fuel_type','transmission','ext_col', 'int_col']
label_encoder = LabelEncoder()

In [34]:
for column in categorical_columns:
    train[column] = label_encoder.fit_transform(train[column])

In [35]:
train.isnull().sum()

brand               0
model               0
model_year          0
milage              0
fuel_type           0
transmission        0
ext_col             0
int_col             0
accident            0
clean_title         0
price               0
horsepower      34069
engine_size     14610
cylinders       38800
dtype: int64

In [None]:
#train['horsepower'].fillna(0,inplace=True)

In [36]:
print("Average:",train['horsepower'].mean().round(2))
print("Max:",train['horsepower'].max())
print("Min:",train['horsepower'].min())

Average: 343.04
Max: 1020.0
Min: 70.0


In [38]:
print("Average:",train['engine_size'].mean().round(2))
print("Max:",train['engine_size'].max())
print("Min:",train['engine_size'].min())

Average: 3.8
Max: 8.4
Min: 0.65


In [40]:
print("Average:",train['cylinders'].mean().round(2))
print("Max:",train['cylinders'].max())
print("Min:",train['cylinders'].min())

Average: 6.37
Max: 12.0
Min: 3.0


In [41]:
train['horsepower'].fillna(train['horsepower'].mean(), inplace=True)
train['engine_size'].fillna(train['engine_size'].mean(), inplace=True)
train['cylinders'].fillna(train['cylinders'].mean(), inplace=True)

# Train Model

In [42]:
X = train.drop('price', axis=1)  
y = train['price']

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [44]:
xgb_model = xgb.XGBRegressor(
    n_estimators=1075,
    eta=0.006322676490697651,
    max_depth=4,
    min_child_weight=0.00395187090807294,
    subsample=0.9313710525668728,
    colsample_bytree=0.5484624100206377,
    reg_lambda=7.740721556692922,  
    alpha=0.3158771161140435,
    use_label_encoder=False,
    eval_metric='mlogloss',
    random_state=42
)

In [45]:
xgb_model.fit(X_train, y_train)

In [46]:
xgb_model.fit(X_train, y_train)

y_pred1 = xgb_model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred1, squared=False)
print(f"Stacking model RMSE: {rmse}")

Stacking model RMSE: 72106.41484091553


CatBoost

In [47]:
params3 = {
    'iterations': 396,
    'depth': 3,
    'learning_rate': 0.205197734145011,
    'random_strength': 0.00013766770363584716,
    'bagging_temperature': 0.8685572414400111,
    'border_count': 241,
    'l2_leaf_reg': 61.120073566627156
}

In [48]:
catboost_model = CatBoostRegressor(**params3)

catboost_model.fit(X_train, y_train)

0:	learn: 77434.7004785	total: 150ms	remaining: 59.3s
1:	learn: 76419.4174409	total: 160ms	remaining: 31.5s
2:	learn: 75729.1490807	total: 169ms	remaining: 22.2s
3:	learn: 75246.8002719	total: 178ms	remaining: 17.5s
4:	learn: 74878.3283347	total: 186ms	remaining: 14.5s
5:	learn: 74612.5681468	total: 195ms	remaining: 12.7s
6:	learn: 74418.0671224	total: 203ms	remaining: 11.3s
7:	learn: 74273.7964000	total: 210ms	remaining: 10.2s
8:	learn: 74165.5342737	total: 217ms	remaining: 9.35s
9:	learn: 74082.7232200	total: 224ms	remaining: 8.65s
10:	learn: 74000.5261239	total: 231ms	remaining: 8.1s
11:	learn: 73915.5535189	total: 239ms	remaining: 7.66s
12:	learn: 73863.8006967	total: 246ms	remaining: 7.26s
13:	learn: 73820.0964757	total: 254ms	remaining: 6.94s
14:	learn: 73771.7966259	total: 262ms	remaining: 6.66s
15:	learn: 73706.1462688	total: 269ms	remaining: 6.39s
16:	learn: 73678.0040198	total: 278ms	remaining: 6.2s
17:	learn: 73610.6435184	total: 286ms	remaining: 6.01s
18:	learn: 73556.92771

<catboost.core.CatBoostRegressor at 0x24a1b8e9f90>

In [49]:
best_params = {
    'n_estimators': 1225,
    'num_leaves': 137,
    'max_depth': 14,
    'cat_smooth': 96,
    'learning_rate': 0.0023941644225363256,
    'subsample': 0.9082095260228584,
    'colsample_bytree': 0.6165900236226695,
    'min_split_gain': 0.0308677316309982,
    'min_child_weight': 68,
    'lambda_l2': 1.7319600391087514e-07,
    'lambda_l1': 8.761594422544116e-07,
    'max_bin': 749,
    'objective': 'regression',
    'metric': 'rmse',
    'random_state': 42,
    'boosting_type': 'gbdt',
}

In [50]:
lgb_model = lgb.LGBMRegressor(**best_params)

callbacks = [
    lgb.early_stopping(stopping_rounds=100),
    lgb.log_evaluation(100)
]

lgb_model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='rmse',
    callbacks=callbacks
)

Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 75941.4
[200]	valid_0's rmse: 74571.7
[300]	valid_0's rmse: 73672.8
[400]	valid_0's rmse: 73063
[500]	valid_0's rmse: 72657.2
[600]	valid_0's rmse: 72388.6
[700]	valid_0's rmse: 72218
[800]	valid_0's rmse: 72107.7
[900]	valid_0's rmse: 72031.1
[1000]	valid_0's rmse: 71990
[1100]	valid_0's rmse: 71968.8
[1200]	valid_0's rmse: 71954.4
Did not meet early stopping. Best iteration is:
[1225]	valid_0's rmse: 71953.4


In [52]:
voting_model = VotingRegressor(
    estimators=[
        ('xgb', xgb_model),
        ('catboost', catboost_model),
        ('lgb', lgb_model)
    ]
)

In [53]:
voting_model.fit(X_train, y_train)

y_pred = voting_model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f"Stacking model RMSE: {rmse}")

0:	learn: 77434.7004785	total: 10.6ms	remaining: 4.21s
1:	learn: 76419.4174409	total: 20.6ms	remaining: 4.05s
2:	learn: 75729.1490807	total: 30.4ms	remaining: 3.98s
3:	learn: 75246.8002719	total: 39.3ms	remaining: 3.85s
4:	learn: 74878.3283347	total: 46.4ms	remaining: 3.63s
5:	learn: 74612.5681468	total: 55.3ms	remaining: 3.6s
6:	learn: 74418.0671224	total: 62.1ms	remaining: 3.45s
7:	learn: 74273.7964000	total: 70ms	remaining: 3.4s
8:	learn: 74165.5342737	total: 77.3ms	remaining: 3.33s
9:	learn: 74082.7232200	total: 84.1ms	remaining: 3.25s
10:	learn: 74000.5261239	total: 91ms	remaining: 3.19s
11:	learn: 73915.5535189	total: 98.9ms	remaining: 3.17s
12:	learn: 73863.8006967	total: 105ms	remaining: 3.1s
13:	learn: 73820.0964757	total: 114ms	remaining: 3.1s
14:	learn: 73771.7966259	total: 121ms	remaining: 3.07s
15:	learn: 73706.1462688	total: 129ms	remaining: 3.05s
16:	learn: 73678.0040198	total: 137ms	remaining: 3.04s
17:	learn: 73610.6435184	total: 144ms	remaining: 3.03s
18:	learn: 73556

# Prediction

In [54]:
ltest = LabelEncoder()

for column in categorical_columns:
    test[column] = ltest.fit_transform(test[column])

In [55]:
test.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,transmission,ext_col,int_col,accident,clean_title,horsepower,engine_size,cylinders
0,26,1388,2015,98000,3,16,302,10,0,0,240.0,2.0,4.0
1,26,1375,2020,9142,4,31,261,14,0,0,395.0,3.0,6.0
2,14,636,2022,28121,3,3,302,57,0,1,,3.5,
3,3,182,2016,61258,3,39,259,14,0,1,,,
4,3,181,2018,59000,3,38,127,14,0,0,252.0,2.0,4.0


In [56]:
submission = pd.read_csv('sample_submission.csv')

In [57]:
submission['price'] = voting_model.predict(test).astype(np.float32)
submission.head()

Unnamed: 0,id,price
0,188533,17922.125
1,188534,78292.882812
2,188535,68809.367188
3,188536,45140.875
4,188537,30650.039062
