In [None]:
# 23/05/2025
# CSC354 – Assignment3 – ML – Concept Learning
# Nimra Shahid, Muhammad Taha
# FA22-BCS-123,119

In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('used_cars.csv')  # Replace with the correct path if needed

# Inspect the data
print(df.info())
print(df.describe())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4009 entries, 0 to 4008
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   brand         4009 non-null   object
 1   model         4009 non-null   object
 2   model_year    4009 non-null   int64 
 3   milage        4009 non-null   object
 4   fuel_type     3839 non-null   object
 5   engine        4009 non-null   object
 6   transmission  4009 non-null   object
 7   ext_col       4009 non-null   object
 8   int_col       4009 non-null   object
 9   accident      3896 non-null   object
 10  clean_title   3413 non-null   object
 11  price         4009 non-null   object
dtypes: int64(1), object(11)
memory usage: 376.0+ KB
None
        model_year
count  4009.000000
mean   2015.515590
std       6.104816
min    1974.000000
25%    2012.000000
50%    2017.000000
75%    2020.000000
max    2024.000000


In [2]:
df.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,Ford,Utility Police Interceptor Base,2013,"51,000 mi.",E85 Flex Fuel,300.0HP 3.7L V6 Cylinder Engine Flex Fuel Capa...,6-Speed A/T,Black,Black,At least 1 accident or damage reported,Yes,"$10,300"
1,Hyundai,Palisade SEL,2021,"34,742 mi.",Gasoline,3.8L V6 24V GDI DOHC,8-Speed Automatic,Moonlight Cloud,Gray,At least 1 accident or damage reported,Yes,"$38,005"
2,Lexus,RX 350 RX 350,2022,"22,372 mi.",Gasoline,3.5 Liter DOHC,Automatic,Blue,Black,None reported,,"$54,598"
3,INFINITI,Q50 Hybrid Sport,2015,"88,900 mi.",Hybrid,354.0HP 3.5L V6 Cylinder Engine Gas/Electric H...,7-Speed A/T,Black,Black,None reported,Yes,"$15,500"
4,Audi,Q3 45 S line Premium Plus,2021,"9,835 mi.",Gasoline,2.0L I4 16V GDI DOHC Turbo,8-Speed Automatic,Glacier White Metallic,Black,None reported,,"$34,999"


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Drop rows with missing values (or impute if needed)
df = df.dropna().copy()


# Encode categorical columns
label_encoders = {}
for col in df.select_dtypes(include='object').columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Split into features and target
X = df.drop('price', axis=1)
y = df['price']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [4]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Train baseline model
baseline_model = DecisionTreeRegressor(random_state=42)
baseline_model.fit(X_train, y_train)

# Predict and evaluate
y_pred = baseline_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("--- Baseline Model ---")
print("RMSE:", rmse)
print("R² Score:", r2)

--- Baseline Model ---
RMSE: 365.55828026337076
R² Score: -0.21045203946720226


In [5]:
from sklearn.model_selection import GridSearchCV

grid_params = {
    'max_depth': [3, 5, 10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 5, 10],
    'max_features': [None, 'sqrt', 'log2']
}

grid_search = GridSearchCV(
    DecisionTreeRegressor(random_state=42),
    grid_params,
    cv=5,
    n_jobs=-1,
    scoring='r2'
)
grid_search.fit(X_train, y_train)

# Evaluate best model
best_grid_model = grid_search.best_estimator_
y_pred_grid = best_grid_model.predict(X_test)

print("\n--- Grid Search Results ---")
print("Best Params:", grid_search.best_params_)
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_grid)))
print("R² Score:", r2_score(y_test, y_pred_grid))



--- Grid Search Results ---
Best Params: {'max_depth': 5, 'max_features': None, 'min_samples_leaf': 10, 'min_samples_split': 2}
RMSE: 298.92593373090455
R² Score: 0.19060312300483828


In [6]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

random_params = {
    'max_depth': [3, 5, 10, 50,100,150,200,250,300,350,400,500, None],
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 15),
    'max_features': ['sqrt', 'log2', None]
}

random_search = RandomizedSearchCV(
    DecisionTreeRegressor(random_state=42),
    param_distributions=random_params,
    n_iter=400,
    cv=5,
    n_jobs=-1,
    random_state=42,
    scoring='r2'
)
random_search.fit(X_train, y_train)

# Evaluate best model
best_random_model = random_search.best_estimator_
y_pred_random = best_random_model.predict(X_test)

print("\n--- Random Search Results ---")
print("Best Params:", random_search.best_params_)
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred_random)))
print("R² Score:", r2_score(y_test, y_pred_random))


--- Random Search Results ---
Best Params: {'max_depth': 5, 'max_features': None, 'min_samples_leaf': 12, 'min_samples_split': 2}
RMSE: 298.3209083087512
R² Score: 0.1938762421984741
