In [120]:
!pip install tensorflow



In [121]:
import tensorflow as tf
print(tf.__version__)

2.19.0


In [122]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [123]:
test = pd.read_csv('/content/test.csv')
train = pd.read_csv('/content/train.csv')

In [124]:
print(train["Mileage"].dtype)
print(test["Mileage"].dtype)


object
object


In [125]:
train["Mileage"] = train["Mileage"].str.replace(" km", "", regex=False).str.replace(",", "").astype(float)
test["Mileage"] = test["Mileage"].str.replace(" km", "", regex=False).str.replace(",", "").astype(float)


# -----------------------------
# Engine volume: extract numeric part, add Turbo flag
# -----------------------------
# Convert to string first in case there are mixed types
train["Engine volume"] = train["Engine volume"].astype(str)
test["Engine volume"] = test["Engine volume"].astype(str)

# Turbo flag
train["Turbo"] = train["Engine volume"].str.contains("Turbo").astype(int)
test["Turbo"] = test["Engine volume"].str.contains("Turbo").astype(int)

# Remove " Turbo" and convert to float
train["Engine volume"] = train["Engine volume"].str.replace(" Turbo", "").astype(float)
test["Engine volume"] = test["Engine volume"].str.replace(" Turbo", "").astype(float)

# -----------------------------
# Levy: replace "-" with NaN, convert to float
# -----------------------------
train["Levy"] = train["Levy"].replace("-", np.nan).astype(float)
test["Levy"] = test["Levy"].replace("-", np.nan).astype(float)

# Fill missing numeric values
# Fill missing Levy values safely
train["Levy"] = train["Levy"].fillna(train["Levy"].median())
test["Levy"] = test["Levy"].fillna(train["Levy"].median())

# -----------------------------
# Doors: fix weird values
# -----------------------------
def clean_doors(val):
    val = str(val)
    if "2" in val: return 2
    if "3" in val: return 3
    if "4" in val: return 4
    if "5" in val: return 5
    return np.nan

train["Doors"] = train["Doors"].apply(clean_doors).astype(float)
test["Doors"] = test["Doors"].apply(clean_doors).astype(float)

In [126]:
categorical_cols = ["Manufacturer", "Model", "Category", "Leather interior",
                    "Fuel type", "Gear box type", "Drive wheels",
                    "Wheel", "Color"]

encoder = LabelEncoder()
for col in categorical_cols:
    combined = pd.concat([train[col], test[col]]).astype(str)
    encoder.fit(combined)
    train[col] = encoder.transform(train[col].astype(str))
    test[col] = encoder.transform(test[col].astype(str))

In [127]:
y_train = np.log1p(train["Price"])  # log transform target
X_train = train.drop(["ID", "Price"], axis=1)
X_test = test.drop(["ID", "Price"], axis=1)


In [128]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

categorical_cols = ["Manufacturer", "Model", "Category", "Leather interior",
                    "Fuel type", "Gear box type", "Drive wheels",
                    "Wheel", "Color"]
numeric_cols = ["Levy", "Prod. year", "Engine volume", "Mileage", "Cylinders", "Airbags", "Doors"]

# Separate numerical and categorical columns
X_train_numeric = train[numeric_cols]
X_test_numeric = test[numeric_cols]
X_train_categorical = train[categorical_cols]
X_test_categorical = test[categorical_cols]

# Scale numeric features
scaler = StandardScaler()
X_train_numeric_scaled = scaler.fit_transform(X_train_numeric)
X_test_numeric_scaled = scaler.transform(X_test_numeric)

# One-hot encode categorical features
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
ohe_train = ohe.fit_transform(X_train_categorical)
ohe_test = ohe.transform(X_test_categorical)

# Combine scaled numeric and one-hot encoded categorical features
X_train = np.hstack([X_train_numeric_scaled, ohe_train])
X_test = np.hstack([X_test_numeric_scaled, ohe_test])
print(X_train.shape, X_test.shape)

(19237, 1707) (8245, 1707)


In [129]:
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [130]:
model = Sequential()
model.add(Dense(32, activation='relu', input_dim=X_tr.shape[1]))
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='linear'))  # linear for regression

model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae', 'mse'])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [131]:
history = model.fit(
    X_tr, y_tr,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=32,
    verbose=1)

Epoch 1/10
[1m481/481[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 10ms/step - loss: 25.1274 - mae: 3.5746 - mse: 25.1274 - val_loss: 1.7945 - val_mae: 0.9164 - val_mse: 1.7945
Epoch 2/10
[1m481/481[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - loss: 1.7883 - mae: 0.9415 - mse: 1.7883 - val_loss: 1.6899 - val_mae: 0.9035 - val_mse: 1.6899
Epoch 3/10
[1m481/481[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 12ms/step - loss: 1.7166 - mae: 0.8991 - mse: 1.7166 - val_loss: 1.5765 - val_mae: 0.8644 - val_mse: 1.5765
Epoch 4/10
[1m481/481[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - loss: 1.5971 - mae: 0.8395 - mse: 1.5971 - val_loss: 1.4817 - val_mae: 0.8021 - val_mse: 1.4817
Epoch 5/10
[1m481/481[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - loss: 1.3916 - mae: 0.7750 - mse: 1.3916 - val_loss: 1.4464 - val_mae: 0.8004 - val_mse: 1.4464
Epoch 6/10
[1m481/481[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m

In [132]:
y_val_pred_log = model.predict(X_val)
y_val_pred = np.expm1(y_val_pred_log)  # convert back from log

y_val_true = np.expm1(y_val)  # actual prices

mse = mean_squared_error(y_val_true, y_val_pred)
mae = mean_absolute_error(y_val_true, y_val_pred)
r2 = r2_score(y_val_true, y_val_pred)

print("MSE:", mse)
print("MAE:", mae)
print("R2 Score:", r2)

[1m121/121[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
MSE: 251059855.1124946
MAE: 8333.215589769168
R2 Score: 0.19427925952091163


In [133]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np


In [134]:
rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=None,
    random_state=42,
    n_jobs=-1
)


In [135]:
rf_model.fit(X_train, y_train)


In [136]:
y_pred = rf_model.predict(X_test)


In [140]:
# Make predictions on the validation set
y_val_pred_log = rf_model.predict(X_val)

# Convert predictions back to original scale for RMSE and MAE
y_val_pred_original_scale = np.expm1(y_val_pred_log)
y_val_true_original_scale = np.expm1(y_val)

# Calculate metrics
mse_original_scale = mean_squared_error(y_val_true_original_scale, y_val_pred_original_scale)
rmse = np.sqrt(mse_original_scale)
mae = mean_absolute_error(y_val_true_original_scale, y_val_pred_original_scale)
r2 = r2_score(y_val, y_val_pred_log)

print(f"Random Forest Results on Validation Set:")
print(f"RMSE (Original Scale): {rmse:.2f}")
print(f"MAE (Original Scale): {mae:.2f}")
print(f"R² (Log Scale): {r2:.4f}")

Random Forest Results on Validation Set:
RMSE (Original Scale): 6084.23
MAE (Original Scale): 2117.38
R² (Log Scale): 0.9406
