In [1]:
from tensorflow import keras
from tensorflow.keras import layers

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer


In [2]:
import re
from sklearn.base import BaseEstimator, TransformerMixin

class ExtractTextAndNumber(BaseEstimator, TransformerMixin):
    """Extract two numeric fields from the free-text ``engine`` column.

    Each value of ``X['engine']`` is searched with ``pattern``; the first two
    capture groups are converted to ``float`` and returned as the columns
    ``engine_hp`` and ``engine_cap``. With the default pattern these are the
    number in front of ``HP`` and the number in front of ``L``.

    Parameters
    ----------
    pattern : str
        Regular expression with at least two capture groups that match
        numeric text.
    default_hp : float, default=100.0
        Value used for ``engine_hp`` when a row does not match ``pattern``
        (or is not a string at all, e.g. NaN).
    default_cap : float, default=2.0
        Value used for ``engine_cap`` in the same situation.
    """

    def __init__(self, pattern=r'(\d+\.\d+)HP (\d+\.\d+)L (\w+)',
                 default_hp=100.0, default_cap=2.0):
        self.pattern = pattern
        self.default_hp = default_hp
        self.default_cap = default_cap

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return a two-column float DataFrame aligned with ``X``'s index.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain an ``engine`` column.

        Returns
        -------
        pandas.DataFrame
            Columns ``engine_hp`` and ``engine_cap``, one row per row of ``X``.
        """
        # Compile once instead of re-parsing the pattern for every row.
        regex = re.compile(self.pattern)
        fallback = (float(self.default_hp), float(self.default_cap))

        def extract_text_and_number(string):
            # Missing cells arrive as float NaN; searching them would raise
            # TypeError, so anything that is not text takes the fallback.
            match = regex.search(string) if isinstance(string, str) else None
            if match is None:
                return fallback
            # Convert here so matched and fallback rows share one dtype.
            return float(match.group(1)), float(match.group(2))

        parsed = X['engine'].apply(extract_text_and_number).tolist()
        # Keep the caller's index so the result stays row-aligned with X.
        return pd.DataFrame(parsed, columns=['engine_hp', 'engine_cap'], index=X.index)

In [8]:
# Fixed seed so the train/validation partition is identical on every run.
SPLIT_SEED = 42

data = pd.read_csv("./train.csv")

# Separate the target from the raw feature columns.
X_raw = data.drop(columns="price")
y = data["price"]

features_num = ["model_year", "milage"]
features_cat_ohe = ["brand", "fuel_type", "transmission", "accident", "clean_title"]
features_cat_ord = ["ext_col", "int_col"]
features_cat_cst = ["engine"]

# Preprocessing for numerical data
# NOTE(review): these columns are imputed but not scaled before they reach
# the network -- consider adding a scaler.
numerical_transformer = SimpleImputer(strategy="mean")

# Preprocessing for categorical data
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
    ]
)

# Categories that were not seen during fit are encoded as -1 instead of
# raising, so validation and test rows with new colours can be transformed.
ordinal_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        (
            "ordinal",
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
        ),
    ]
)

custom_transformer = ExtractTextAndNumber()

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, features_num),
        ("cat_ohe", categorical_transformer, features_cat_ohe),
        ("cat_ord", ordinal_transformer, features_cat_ord),
        ("cst", custom_transformer, features_cat_cst),
    ]
)

# Split by row position first, then fit the preprocessor on the training rows
# only, so imputation statistics and category lists never see validation data.
row_positions = np.arange(len(data))
train_pos, valid_pos = train_test_split(
    row_positions, test_size=0.2, random_state=SPLIT_SEED
)
preprocessor.fit(X_raw.iloc[train_pos])

# Encoded matrix for every training-file row (kept as `df` / `X` because
# later cells read those names).
df = preprocessor.transform(X_raw)
X = pd.DataFrame(df).astype('float')

X_train, X_valid = X.iloc[train_pos], X.iloc[valid_pos]
y_train, y_valid = y.iloc[train_pos], y.iloc[valid_pos]

# Number of encoded features, as a one-element list for the model's input.
shape = [X_train.shape[1]]

In [None]:
# Preview the first five rows of the encoded feature matrix.
pd.DataFrame(data=df).head(n=5)


In [None]:
# Two hidden softplus blocks (Dense -> Dropout -> BatchNorm) followed by a
# single ReLU output unit, so predictions are never negative.
model = keras.Sequential(
    [
        # Declare the input with an explicit Input object rather than the
        # deprecated `input_shape=` argument on the first Dense layer.
        keras.Input(shape=tuple(shape)),
        layers.Dense(1024, activation="softplus"),
        layers.Dropout(0.3),
        layers.BatchNormalization(),
        layers.Dense(512, activation="softplus"),
        layers.Dropout(0.3),
        layers.BatchNormalization(),
        layers.Dense(1, activation="relu"),
    ]
)

# Mean absolute error is both the training loss and the early-stopping signal.
model.compile(optimizer="sgd", loss="mae")

# Stop once validation loss has not improved by at least 10 for 10 epochs in
# a row, then roll back to the best weights seen.
earlyStopping = keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=10,
    min_delta=10,
    restore_best_weights=True,
)

history = model.fit(
    X_train,
    y_train,
    validation_data=(X_valid, y_valid),
    batch_size=512,
    epochs=100,
    callbacks=[earlyStopping],
    verbose=2,
)

In [None]:
history_df = pd.DataFrame(history.history)

# Hide the first epochs so the later part of the curves is readable, but only
# when the run lasted longer than that: early stopping can end training
# sooner, and an unconditional `.loc[20:]` would then select no rows at all.
SKIP_EPOCHS = 20
first_epoch = SKIP_EPOCHS if len(history_df) > SKIP_EPOCHS else 0

ax = history_df.loc[first_epoch:, ['loss', 'val_loss']].plot(
    title="Training vs. validation loss"
)
ax.set_xlabel("epoch")
ax.set_ylabel("loss (MAE)");

In [None]:
# Fit the stand-alone one-hot pipeline on the raw categorical columns it was
# built for. `X` at this point is the already-encoded float matrix, so fitting
# on it would treat every distinct float as its own category in a dense output.
# NOTE(review): this does not change `preprocessor`, which scikit-learn fits on
# cloned copies of its transformers -- confirm this cell is still needed.
categorical_transformer.fit(data[features_cat_ohe])

In [None]:
# Load the held-out test file and encode it with the already-fitted preprocessor.
test_data = pd.read_csv("./test.csv")
test_encoded = preprocessor.transform(test_data)
test_trans = pd.DataFrame(test_encoded)

In [None]:
# Cast the encoded test features to float, then run them through the network.
test_features = test_trans.astype("float")
preds = model.predict(test_features)
preds

In [None]:
# Pair each test-set id with its predicted price (flattened to one dimension)
# and write the two columns to the submission file without the index.
flat_prices = preds.reshape(-1)
result = test_data[["id"]].assign(price=flat_prices)

result.to_csv("submission.csv", index=False)