In [44]:
# import library
import pandas as pd
from sklearn.impute import SimpleImputer
import torch.nn as nn
import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error as RMSE
import torch.optim as optim
import numpy as np

In [19]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,id,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,Ford,F-150 Lariat,2018,74349,Gasoline,375.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,10-Speed A/T,Blue,Gray,None reported,Yes,11000
1,1,BMW,335 i,2007,80000,Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed M/T,Black,Black,None reported,Yes,8250
2,2,Jaguar,XF Luxury,2009,91491,Gasoline,300.0HP 4.2L 8 Cylinder Engine Gasoline Fuel,6-Speed A/T,Purple,Beige,None reported,Yes,15000
3,3,BMW,X7 xDrive40i,2022,2437,Hybrid,335.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,Transmission w/Dual Shift Mode,Gray,Brown,None reported,Yes,63500
4,4,Pontiac,Firebird Base,2001,111000,Gasoline,200.0HP 3.8L V6 Cylinder Engine Gasoline Fuel,A/T,White,Black,None reported,Yes,7850


In [21]:
# Engine column
imputer = SimpleImputer(strategy='mean')
df['engine_volumn'] = df["engine"].str.extract(r"(\d+\.?\d*)L").astype('float')
df['engine_volumn'] = imputer.fit_transform(df[['engine_volumn']])
df['engine_volumn'].unique().shape

(61,)

In [22]:
# Transmission column
def categorize_transmission(trans):
    trans = trans.lower().strip()  # Convert to lowercase and strip whitespaces
    if 'auto' in trans or 'a/t' in trans or 'automatic' in trans and "cvt" not in trans:
        return 'automatic'
    elif 'cvt' in trans:
        return 'cvt'
    elif 'manual' in trans:
        return 'manual'
    else:
        return 'other'
df['transmission'] = df['transmission'].apply(categorize_transmission)
df['transmission'].unique().shape

(4,)

In [23]:
# Exterior/Interior color values
def categorize_col(col):
    basic_colors = {
        'blue': 'blue',
        'black': 'black',
        'purple': 'purple',
        'gray': 'gray',
        'white': 'white',
        'red': 'red',
        'silver': 'silver',
        'green': 'green',
        'orange': 'orange',
        'beige': 'beige',
        'gold': 'gold',
        'brown': 'brown',
        'yellow': 'yellow'
    }
    col = col.lower().strip()
    for basic_color in basic_colors:
        if basic_color in col:
            return basic_colors[basic_color]
    return 'other'
df['ext_col'] = df['ext_col'].apply(categorize_col)
df['int_col'] = df['int_col'].apply(categorize_col)

print(df['ext_col'].unique().shape, df['int_col'].unique().shape)

(14,) (13,)


In [29]:
# Accident and Clean_title 
mapping = {
    "At least 1 accident or damage reported": True,
    "None reported": False
}
df['accident'] = df['accident'].map(mapping)
df['clean_title'] = df['clean_title'].apply(lambda x: True if x == 'Yes' else False)
print(df['accident'].unique().shape, df['clean_title'].unique().shape)

(2,) (1,)


In [31]:
# Finalize the df cat, num and y column
df = df.drop(['model','engine'], axis = 1)
df_dummies = pd.get_dummies(df.select_dtypes('object'))
df_cat = pd.concat([df_dummies, df[['accident','clean_title']]],axis = 1)
print(df_cat.shape)

df_num = df[['model_year','milage','engine_volumn']]
print(df_num.shape)

df_price = df['price']

X = pd.concat([df_num,df_cat],axis =1)
y = df_price

(54273, 93)
(54273, 3)


In [35]:
tdf = pd.read_csv('test.csv')

tdf['engine_volumn'] = tdf["engine"].str.extract(r"(\d+\.?\d*)L").astype('float')
tdf['engine_volumn'] = imputer.fit_transform(tdf[['engine_volumn']])
tdf['transmission'] = tdf['transmission'].apply(categorize_transmission)
tdf['ext_col'] = tdf['ext_col'].apply(categorize_col)
tdf['int_col'] = tdf['int_col'].apply(categorize_col)
tdf['accident'] = tdf['accident'].map(mapping)
tdf['clean_title'] = tdf['clean_title'].apply(lambda x: True if x == 'Yes' else False)

# Finalize the df cat, num and y column
tdf = tdf.drop(['model','engine'], axis = 1)
tdf_dummies = pd.get_dummies(tdf.select_dtypes('object'))
tdf_cat = pd.concat([tdf_dummies, tdf[['accident','clean_title']]],axis = 1)
print(tdf_cat.shape)

tdf_num = tdf[['model_year','milage','engine_volumn']]
print(tdf_num.shape)
X_t = pd.concat([tdf_num,tdf_cat],axis =1)


(36183, 93)
(36183, 3)


### Tree regressor model on train data

In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 5)

param_grid={'n_estimators':[100,500,1000],
            'max_depth':[1,3,5],
            'learning_rate':[0.0001,0.001,0.01]}

grid_search_cv=GridSearchCV(estimator=XGBRegressor(),
                         param_grid=param_grid,
                         scoring='neg_root_mean_squared_error',
                         cv=3,
                         verbose=10)

grid_search_cv.fit(X_train,y_train)

best_model=grid_search_cv.best_estimator_

Fitting 3 folds for each of 27 candidates, totalling 81 fits
[CV 1/3; 1/27] START learning_rate=0.0001, max_depth=1, n_estimators=100........
[CV 1/3; 1/27] END learning_rate=0.0001, max_depth=1, n_estimators=100;, score=-67179.338 total time=   0.4s
[CV 2/3; 1/27] START learning_rate=0.0001, max_depth=1, n_estimators=100........
[CV 2/3; 1/27] END learning_rate=0.0001, max_depth=1, n_estimators=100;, score=-77068.536 total time=   0.4s
[CV 3/3; 1/27] START learning_rate=0.0001, max_depth=1, n_estimators=100........
[CV 3/3; 1/27] END learning_rate=0.0001, max_depth=1, n_estimators=100;, score=-64567.616 total time=   0.4s
[CV 1/3; 2/27] START learning_rate=0.0001, max_depth=1, n_estimators=500........
[CV 1/3; 2/27] END learning_rate=0.0001, max_depth=1, n_estimators=500;, score=-67002.727 total time=   1.1s
[CV 2/3; 2/27] START learning_rate=0.0001, max_depth=1, n_estimators=500........
[CV 2/3; 2/27] END learning_rate=0.0001, max_depth=1, n_estimators=500;, score=-76907.456 total ti

In [37]:
y_pred = best_model.predict(X_test)
rmse = RMSE(y_pred,y_test)
rmse

75382.27133705343

In [39]:
pred = best_model.predict(X_t.values)
sub = pd.DataFrame({'id':tdf['id'],
                    'price':pred})
# sub.to_csv('submission.csv', index=False)

### Deep learning model

In [40]:
cat_set = torch.tensor(df_cat.values).float()
num_set = torch.tensor(df_num.values).float()
labels = torch.tensor(df_price.values).float()

data_set = TensorDataset(cat_set,num_set, labels)
data_load = DataLoader(data_set, batch_size = 100, shuffle = True)


In [41]:
cat_tset = torch.tensor(tdf_cat.values).float()
num_tset = torch.tensor(tdf_num.values).float()

In [42]:
class Net(nn.Module):
    def __init__(self):
        super(Net,self).__init__()
        self.cat_layer = nn.Sequential(
            nn.Linear(93, 40),
            nn.ReLU(),
            nn.Linear(40, 5),
            nn.ReLU()
        )
        self.num_layer = nn.Sequential(
            nn.BatchNorm1d(num_features = 3),
            nn.Linear(3,3),
            nn.ReLU(),
            nn.Linear(3,5),
            nn.ReLU()
        )
        self.regressor = nn.Linear(10, 1)
    def forward(self,x_cat, x_num):
        x_cat = self.cat_layer(x_cat)
        x_num = self.num_layer(x_num)
        x = torch.cat((x_cat,x_num), dim =1)
        return self.regressor(x)

In [51]:
net = Net()
criterion = nn.MSELoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)

for epoch in range(3):
    for X_cat, X_num, y in data_load:
        optimizer.zero_grad()
        outputs = net(X_cat, X_num)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


In [54]:
pred2 = net(cat_tset,num_tset).detach().numpy().flatten()
sub2 = pd.DataFrame({'id':tdf['id'],
                    'price':pred2})
# sub.to_csv('submission.csv', index=False)