In [63]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.model_selection import RandomizedSearchCV

In [71]:
## Loading and Preprocessing

In [43]:
# Load the dataset
car_data = pd.read_csv('CarPrice_Assignment.csv')

In [44]:
# Check for missing values
missing_values = car_data.isnull().sum()

In [45]:
# Drop duplicates if there are any
car_data = car_data.drop_duplicates()

In [46]:
# Convert categorical variables to numeric using one-hot encoding
car_data_encoded = pd.get_dummies(car_data, drop_first=True)

In [47]:
# Feature scaling (if required for some models like SVR)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
numerical_features = ['wheelbase', 'carlength', 'carwidth', 'carheight', 'curbweight', 'enginesize', 'boreratio', 'stroke', 'compressionratio', 'horsepower', 'peakrpm', 'citympg', 'highwaympg']
car_data_encoded[numerical_features] = scaler.fit_transform(car_data_encoded[numerical_features])

In [48]:
# Display processed dataset
car_data_encoded.head()

Unnamed: 0,car_ID,symboling,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,...,cylindernumber_three,cylindernumber_twelve,cylindernumber_two,fuelsystem_2bbl,fuelsystem_4bbl,fuelsystem_idi,fuelsystem_mfi,fuelsystem_mpfi,fuelsystem_spdi,fuelsystem_spfi
0,1,3,-1.690772,-0.426521,-0.844782,-2.020417,-0.014566,0.074449,0.519071,-1.839377,...,False,False,False,False,False,False,False,True,False,False
1,2,3,-1.690772,-0.426521,-0.844782,-2.020417,-0.014566,0.074449,0.519071,-1.839377,...,False,False,False,False,False,False,False,True,False,False
2,3,1,-0.708596,-0.231513,-0.190566,-0.543527,0.514882,0.604046,-2.40488,0.685946,...,False,False,False,False,False,False,False,True,False,False
3,4,2,0.173698,0.207256,0.136542,0.235942,-0.420797,-0.431076,-0.517266,0.462183,...,False,False,False,False,False,False,False,True,False,False
4,5,2,0.10711,0.207256,0.230001,0.235942,0.516807,0.218885,-0.517266,0.462183,...,False,False,False,False,False,False,False,True,False,False


In [72]:
## Model Implementation

In [49]:
# Split the data into features (X) and target (y)
X = car_data_encoded.drop(columns='price')
y = car_data_encoded['price']

In [50]:
# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [51]:
# Initialize models
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree Regressor': DecisionTreeRegressor(random_state=42),
    'Random Forest Regressor': RandomForestRegressor(random_state=42),
    'Gradient Boosting Regressor': GradientBoostingRegressor(random_state=42),
    'Support Vector Regressor': SVR()
}

In [52]:
# Train and evaluate the models
# Train and evaluate the models
for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train the model
    y_pred = model.predict(X_test)  # Predict on the test set

In [53]:
# Output the results for each model
print(f"{model_name} trained and ready to evaluate.")

Support Vector Regressor trained and ready to evaluate.


In [73]:
## Model Evaluation

In [54]:
# Dictionary to store evaluation results
model_results = {}

In [55]:
# Train and evaluate the models
for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train the model
    y_pred = model.predict(X_test)  # Predict on the test set

In [56]:
# Calculate evaluation metrics
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

In [57]:
# Store the results
model_results[model_name] = {
    'R-squared': r2,
    'MSE': mse,
    'MAE': mae
}

In [58]:
# Display evaluation results for each model
for model_name, metrics in model_results.items():
    print(f"{model_name}:")
    print(f"  R-squared: {metrics['R-squared']}")
    print(f"  MSE: {metrics['MSE']}")
    print(f"  MAE: {metrics['MAE']}")
    print()

Support Vector Regressor:
  R-squared: -0.10252443708180903
  MSE: 87037712.18131916
  MAE: 5708.789736647873



In [74]:
## Feature Importance Analysis

In [59]:
# Get feature importance from the Random Forest model
rf_model = models['Random Forest Regressor']
feature_importances = rf_model.feature_importances_

In [60]:
# Create a DataFrame to organize and display feature importances
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances
})

In [61]:
# Sort the features by importance in descending order
feature_importance_df_sorted = feature_importance_df.sort_values(by='Importance', ascending=False)

In [62]:
# Display the top 10 most important features
print(feature_importance_df_sorted.head(10))

       Feature  Importance
7   enginesize    0.549018
6   curbweight    0.293698
14  highwaympg    0.041858
11  horsepower    0.028179
0       car_ID    0.019694
4     carwidth    0.011678
2    wheelbase    0.006137
3    carlength    0.005977
12     peakrpm    0.004968
13     citympg    0.004326


In [75]:
## Hyperparameter Tuning

In [64]:
# Define the hyperparameters and their possible values
param_grid = {
    'n_estimators': [100, 200, 500],          # Number of trees
    'max_depth': [None, 10, 20, 30],          # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],          # Minimum number of samples to split a node
    'min_samples_leaf': [1, 2, 4],            # Minimum number of samples per leaf
    'bootstrap': [True, False]                # Whether bootstrap samples are used when building trees
}

In [65]:
# Instantiate RandomizedSearchCV
rf = RandomForestRegressor(random_state=42)
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, 
                                   n_iter=10, cv=3, verbose=2, random_state=42, n_jobs=-1)

In [66]:
# Fit RandomizedSearchCV on the training data
random_search.fit(X_train, y_train)

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [67]:
# Get the best parameters
best_params = random_search.best_params_

In [68]:
# Train the Random Forest model with the best hyperparameters
best_rf_model = RandomForestRegressor(**best_params, random_state=42)
best_rf_model.fit(X_train, y_train)

In [69]:
# Predict and evaluate the tuned model
y_pred_tuned = best_rf_model.predict(X_test)
r2_tuned = r2_score(y_test, y_pred_tuned)
mse_tuned = mean_squared_error(y_test, y_pred_tuned)
mae_tuned = mean_absolute_error(y_test, y_pred_tuned)

In [70]:
# Display the best parameters and the tuned model's performance
print("Best Hyperparameters:", best_params)
print(f"R-squared (Tuned): {r2_tuned}")
print(f"MSE (Tuned): {mse_tuned}")
print(f"MAE (Tuned): {mae_tuned}")

Best Hyperparameters: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_depth': None, 'bootstrap': True}
R-squared (Tuned): 0.9434308080912253
MSE (Tuned): 4465799.467191673
MAE (Tuned): 1462.3137692513756
