# Car price prediction

## Data import and feature removal

In [1]:
import pandas as pd
import numpy as np

# Load the training and production (test) tables, discarding the
# 'wheel' column from both straight away.
df = pd.read_csv('train.csv').drop(columns=['wheel'])
df_prod = pd.read_csv('test.csv').drop(columns=['wheel'])

# 'Id' is not a feature: take it out of the production frame and keep it
# aside as an array for the submission file.
id_prod = df_prod.pop('Id').to_numpy()

# Number of distinct values per remaining training column.
df.nunique()


model             5
year             32
motor_type        5
running         643
color            17
type              7
status            5
motor_volume     20
price           294
dtype: int64

## One-hot encoding

In [2]:
from sklearn.preprocessing import OneHotEncoder

def one_hot_encode(frame, encoder, columns):
    """Replace ``columns`` of ``frame`` with their one-hot encoding.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table containing every column listed in ``columns``.
    encoder : OneHotEncoder
        Encoder that has already been fitted on ``columns``.
    columns : list of str
        Names of the categorical columns to encode.

    Returns
    -------
    pandas.DataFrame
        A new frame: the non-encoded columns of ``frame`` followed by the
        indicator columns named by ``encoder.get_feature_names_out``.
    """
    encoded = pd.DataFrame(
        encoder.transform(frame[columns]).toarray(),
        columns=encoder.get_feature_names_out(columns),
        # Reuse the caller's index: concat(axis=1) aligns on index labels, so
        # a fresh RangeIndex would misalign rows for any non-default index.
        index=frame.index)
    return pd.concat((frame.drop(columns=columns), encoded), axis=1)


feature_names = ['model', 'motor_type', 'color', 'type', 'status']

# The encoder is fitted on train and production rows together, so the
# categories it knows come from both tables.
full_df = pd.concat((df, df_prod))
enc = OneHotEncoder()
enc.fit(full_df[feature_names])

df = one_hot_encode(df, enc, feature_names)
df_prod = one_hot_encode(df_prod, enc, feature_names)
df


Unnamed: 0,year,running,motor_volume,price,model_hyundai,model_kia,model_mercedes-benz,model_nissan,model_toyota,motor_type_diesel,...,type_hatchback,type_minivan / minibus,type_pickup,type_sedan,type_suv,status_crashed,status_excellent,status_good,status_new,status_normal
0,2022,3000 km,2.0,24500,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,2014,132000 km,2.0,25500,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,2018,95000 miles,2.0,11700,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,2002,137000 miles,3.2,12000,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,2017,130000 km,2.0,26000,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1637,2017,120000 miles,2.0,12400,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1638,2014,170000 km,2.0,16500,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1639,2018,68900 miles,2.0,19500,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1640,2019,31000 miles,2.0,19500,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


## Mileage conversion

In [3]:
KM_PER_MILE = 1.609344  # kilometres in one mile


def to_kmage(running):
    """Convert a mileage reading to kilometres.

    Parameters
    ----------
    running : str or number
        Text of the form ``'<amount> <unit>'``, e.g. ``'3000 km'`` or
        ``'95000 miles'``. Surrounding or repeated whitespace is tolerated
        and the unit is matched case-insensitively. A non-string value is
        treated as an already-converted amount in kilometres and returned
        as a float, so applying the function twice to a column is harmless.

    Returns
    -------
    float
        The amount in kilometres. Amounts whose text mentions "mile" are
        multiplied by ``KM_PER_MILE``; everything else is returned as is.

    Raises
    ------
    ValueError
        If the string is blank or its first token is not a number.
    """
    if not isinstance(running, str):
        # Already numeric (e.g. the cell was re-run on a converted column).
        return float(running)

    tokens = running.split()  # any run of whitespace separates the tokens
    if not tokens:
        raise ValueError(f'cannot parse mileage from {running!r}')

    running_amt = float(tokens[0])
    km_coeff = KM_PER_MILE if 'mile' in running.lower() else 1
    return running_amt * km_coeff

# Rewrite the 'running' column of both tables through to_kmage.
for frame in (df, df_prod):
    frame['running'] = frame['running'].apply(to_kmage)
df

Unnamed: 0,year,running,motor_volume,price,model_hyundai,model_kia,model_mercedes-benz,model_nissan,model_toyota,motor_type_diesel,...,type_hatchback,type_minivan / minibus,type_pickup,type_sedan,type_suv,status_crashed,status_excellent,status_good,status_new,status_normal
0,2022,3000.0000,2.0,24500,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,2014,132000.0000,2.0,25500,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
2,2018,152887.6800,2.0,11700,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
3,2002,220480.1280,3.2,12000,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,2017,130000.0000,2.0,26000,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1637,2017,193121.2800,2.0,12400,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1638,2014,170000.0000,2.0,16500,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1639,2018,110883.8016,2.0,19500,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1640,2019,49889.6640,2.0,19500,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0


## X, y split

In [4]:
# Target vector: the 'price' column of the training frame.
y = df['price'].to_numpy()

# Feature matrices: every other training column, and the whole production
# frame, as NumPy arrays.
X = df.drop(columns=['price']).to_numpy()
X_prod = df_prod.to_numpy()

print(X)
print(y)


[[2.02200000e+03 3.00000000e+03 2.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [2.01400000e+03 1.32000000e+05 2.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [2.01800000e+03 1.52887680e+05 2.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 ...
 [2.01800000e+03 1.10883802e+05 2.00000000e+00 ... 1.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [2.01900000e+03 4.98896640e+04 2.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [2.02200000e+03 2.00000000e+01 2.00000000e+00 ... 0.00000000e+00
  0.00000000e+00 0.00000000e+00]]
[24500 25500 11700 ... 19500 19500 28500]


## Train/test split

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for evaluation; random_state is fixed so the
# same split is produced on every run.
split_parts = train_test_split(X, y, test_size=0.1, random_state=42)
X_train, X_test, y_train, y_test = split_parts

## Model training

In [6]:
from lightgbm import LGBMRegressor

# Gradient-boosted regressor with default settings, fitted on the training
# part of the split. fit() returns the estimator itself.
reg = LGBMRegressor().fit(X_train, y_train)
reg


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000977 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 346
[LightGBM] [Info] Number of data points in the train set: 1477, number of used features: 25
[LightGBM] [Info] Start training from score 16002.691266


## Model testing

In [7]:
from sklearn import metrics

# Predict prices for the held-out rows and compare with the true values.
y_prediction = reg.predict(X_test)
mae = metrics.mean_absolute_error(y_test, y_prediction)

print(f'Mean absolute error: {mae}')
y_prediction

Mean absolute error: 2143.62793338827


array([ 5415.27343741, 22709.25517906, 27702.66574618, 29938.96550875,
        6829.25018323, 13111.62711866, 19463.87781412, 16802.11077292,
       10581.08107546, 17684.54992644, 18300.82950347, 16299.48295845,
       18785.4457635 , 14684.35738843, 16942.14420438, 14781.44075932,
       17752.53136607, 13021.43085429, 10577.87746501, 10105.22711598,
       19846.73834042, 28193.68163902, 13842.03230209,  9288.77822555,
       17454.03897031, 19582.82421697,  8688.28904521, 17637.27275509,
       26309.6741605 ,  5584.54819885, 32907.88576206, 15927.33021119,
       23316.66875525, 16339.26911986,  6674.15658868, 18636.9061179 ,
       26316.44690122, 10762.69812945, 13112.00881059, 16224.16125569,
       17774.1400221 ,  9565.36835652, 15548.60711904, 17697.87074562,
        8295.52172054, 15346.81920201, 18463.25825344, 17458.64544549,
        5900.96266741, 10812.14698365, 34058.0972505 ,  6094.65319923,
       16033.64173453, 12863.70275468, 14192.93071264, 13093.65591617,
      

## Prediction

In [8]:
# Refit a fresh default model on all labelled rows (train + held-out).
reg = LGBMRegressor().fit(X, y)

# Pair each production Id with its predicted price and write the result out.
y_prod_prediction = pd.DataFrame({
    'Id': id_prod,
    'price': reg.predict(X_prod),
})
y_prod_prediction.to_csv('submission.csv', index=False)
y_prod_prediction

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000192 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 351
[LightGBM] [Info] Number of data points in the train set: 1642, number of used features: 27
[LightGBM] [Info] Start training from score 15982.633374


Unnamed: 0,Id,price
0,0,17639.820424
1,1,16788.774365
2,2,25052.614528
3,3,14593.712318
4,4,6995.985846
...,...,...
406,406,22937.323433
407,407,14453.017124
408,408,13846.666034
409,409,15490.393508
