In [1]:

import numpy as np
import os
import gc
import pandas as pd
import polars as pl
from glob import glob
from pathlib import Path
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import optuna
from sklearn.model_selection import cross_validate
from lightgbm import LGBMRegressor
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold

In [2]:
# Read the three competition CSVs in one pass: the training set, the test set,
# and the sample submission that is later filled in and written back out.
train_data, test_data, sub_data = map(
    pd.read_csv,
    (
        '/kaggle/input/x2024-assessment/train.csv',
        '/kaggle/input/x2024-assessment/test.csv',
        '/kaggle/input/x2024-assessment/sample_submission.csv',
    ),
)

In [3]:
# Quick schema check: column names, non-null counts and dtypes of the raw
# training frame (this decides which columns are treated as numeric vs. categorical).
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54273 entries, 0 to 54272
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            54273 non-null  int64 
 1   brand         54273 non-null  object
 2   model         54273 non-null  object
 3   model_year    54273 non-null  int64 
 4   milage        54273 non-null  int64 
 5   fuel_type     54273 non-null  object
 6   engine        54273 non-null  object
 7   transmission  54273 non-null  object
 8   ext_col       54273 non-null  object
 9   int_col       54273 non-null  object
 10  accident      54273 non-null  object
 11  clean_title   54273 non-null  object
 12  price         54273 non-null  int64 
dtypes: int64(4), object(9)
memory usage: 5.4+ MB


In [4]:
# Feature matrix: everything except the target and the `id` column
# (presumably a row identifier with no predictive value — confirm).
train_df = train_data.drop(columns=['id', 'price'])
# Regression target. Note this is a Series despite the `_df` suffix.
target_df = train_data.loc[:, 'price']
# The test set has no `price`; only `id` is removed so its columns match `train_df`.
test_df = test_data.drop(columns=['id'])

In [5]:

# Column groups for the two preprocessing branches. `select_dtypes` picks up
# every numeric width, whereas comparing dtypes against the string 'int' only
# matches the platform's default integer size; anything it misses would be
# silently discarded by `remainder='drop'` below.
n_cols = train_df.select_dtypes(include='number').columns
c_cols = train_df.select_dtypes(include='object').columns


def _dense_one_hot_encoder():
    """Return a dense-output OneHotEncoder that ignores unseen categories.

    scikit-learn 1.2 renamed the ``sparse`` argument to ``sparse_output`` and
    1.4 removed the old spelling, so the new name is tried first and the old
    one is used only when the installed version rejects it.
    """
    try:
        return OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        return OneHotEncoder(handle_unknown='ignore', sparse=False)


# Numeric branch: mean-impute missing values, then rescale each column to [0, 1].
num_pipeline = Pipeline(
    steps=[
        ('impute', SimpleImputer(strategy='mean')),
        ('scale', MinMaxScaler()),
    ]
)

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. Categories that appear only at predict time become all-zero rows.
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', _dense_one_hot_encoder()),
    ],
    verbose=True,  # print the elapsed time of each step while fitting
)

# Route each column group through its branch; columns in neither group are dropped.
ct = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, list(n_cols)),
        ('cat_pipeline', cat_pipeline, list(c_cols)),
    ],
    remainder='drop',
    n_jobs=-1,
)

# NOTE(review): 70000 boosting rounds with no validation set or early stopping
# makes the fit very slow and likely to overfit; the hyper-parameters are left
# unchanged here so results stay comparable — consider tuning with a hold-out.
clf = LGBMRegressor(objective='regression',
                    num_leaves=512,
                    max_depth=12,
                    n_estimators=70000,
                    learning_rate=0.05,
                    verbosity=-1,
                    random_state=1234)

# End-to-end estimator: preprocessing followed by the gradient-boosted regressor.
parallel_pipeline = Pipeline(
    steps=[
        ('col_trans', ct),
        ('model', clf),
    ]
)

# Last expression of the cell: render the pipeline diagram.
parallel_pipeline

In [6]:
# Display the feature frame and the target series together as a tuple.
(train_df, target_df)

(         brand                             model  model_year  milage  \
 0         Ford                      F-150 Lariat        2018   74349   
 1          BMW                             335 i        2007   80000   
 2       Jaguar                         XF Luxury        2009   91491   
 3          BMW                      X7 xDrive40i        2022    2437   
 4      Pontiac                     Firebird Base        2001  111000   
 ...        ...                               ...         ...     ...   
 54268      BMW                      X6 xDrive50i        2017   29000   
 54269     Audi                   A4 2.0T Premium        2015   94634   
 54270  Porsche                         Cayenne S        2013   40989   
 54271  Porsche                 911 Carrera 4 GTS        2023    1518   
 54272     Audi  A5 Sportback S line Premium Plus        2021   35000   
 
            fuel_type                                             engine  \
 0           Gasoline      375.0HP 3.5L V6 Cyl

In [None]:
# Fit preprocessing and the regressor on the full training set; the fitted
# pipeline is returned and therefore displayed as the cell output.
parallel_pipeline.fit(X=train_df, y=target_df)



In [None]:
# Run the fitted pipeline on the test features to get the predicted prices.
test_pred = parallel_pipeline.predict(X=test_df)

In [None]:

# Predict on the training frame once and reuse the result for both the score
# and the plot; calling `.score` and `.predict` separately would run two full
# prediction passes through the model.
train_pred = parallel_pipeline.predict(train_df)

print('='*30)
print('Prediction Result：', test_pred)
print('='*30)
# R^2 on the data the model was fitted to — the same quantity a regressor's
# `.score` reports. This is an in-sample figure, not a generalisation estimate.
print('Scores by Train Data：', r2_score(target_df, train_pred))

# Predicted vs. actual price on the training data; points on the diagonal are
# perfect predictions.
fig, ax = plt.subplots()
ax.scatter(train_pred, target_df)
ax.set_xlabel('Predicted price')
ax.set_ylabel('Actual price')
ax.set_title('Training data: predicted vs. actual')

# Last expression of the cell: show the regressor's hyper-parameters.
parallel_pipeline['model'].get_params()

In [None]:
# Put the predictions into the `price` column of the sample-submission frame
# (assumes its rows are in the same order as test.csv — TODO confirm).
sub_data['price'] = test_pred
# Write the submission file to the working directory without the index column.
sub_data.to_csv(path_or_buf='./submission.csv', index=False)