In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Load the dataset
df = pd.read_csv('C:\\Users\\VARKEY JOSHY T\\Downloads\\CarPrice_Assignment.csv')

# Check for missing values
print(df.isnull().sum())

# Scale numerical features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[['horsepower', 'price']])
df[['scaled_horsepower', 'scaled_price']] = scaled_features

# One-hot encode categorical variables
categorical_cols = ['CarName','fueltype', 'aspiration', 'doornumber', 'carbody',
       'drivewheel', 'enginelocation', 'enginetype', 'cylindernumber',
       'fuelsystem']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Split the data
X = df.drop('price', axis=1)
y = df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

car_ID              0
symboling           0
CarName             0
fueltype            0
aspiration          0
doornumber          0
carbody             0
drivewheel          0
enginelocation      0
wheelbase           0
carlength           0
carwidth            0
carheight           0
curbweight          0
enginetype          0
cylindernumber      0
enginesize          0
fuelsystem          0
boreratio           0
stroke              0
compressionratio    0
horsepower          0
peakrpm             0
citympg             0
highwaympg          0
price               0
dtype: int64


In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)

# Decision Tree Regressor
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)
dt_pred = dt_model.predict(X_test)

# Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

# Support Vector Regressor
svr_model = SVR(kernel='rbf', C=1e3, gamma=0.1)
svr_model.fit(X_train, y_train)
svr_pred = svr_model.predict(X_test)

# Gradient Boosting Regressor
gb_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
gb_model.fit(X_train, y_train)
gb_pred = gb_model.predict(X_test)

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

# Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_test)

# Decision Tree Regressor
dt_model = DecisionTreeRegressor(random_state=42)
dt_model.fit(X_train, y_train)
dt_pred = dt_model.predict(X_test)

# Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

# Support Vector Regressor
svr_model = SVR(kernel='rbf', C=1e3, gamma=0.1)
svr_model.fit(X_train, y_train)
svr_pred = svr_model.predict(X_test)

# Gradient Boosting Regressor
gb_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
gb_model.fit(X_train, y_train)
gb_pred = gb_model.predict(X_test)

In [9]:
models = {
    'Linear Regression': {'model': lr_model, 'predictions': lr_pred},
    'Decision Tree Regressor': {'model': dt_model, 'predictions': dt_pred},
    'Random Forest Regressor': {'model': rf_model, 'predictions': rf_pred},
    'Support Vector Regressor': {'model': svr_model, 'predictions': svr_pred},
    'Gradient Boosting Regressor': {'model': gb_model, 'predictions': gb_pred}
}

for name, info in models.items():
    mse = mean_squared_error(y_test, info['predictions'])
    mae = mean_absolute_error(y_test, info['predictions'])
    r2 = r2_score(y_test, info['predictions'])
    print(f"{name}:")
    print(f"MSE: {mse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"R-squared: {r2:.4f}")
    print()

Linear Regression:
MSE: 0.0000
MAE: 0.0000
R-squared: 1.0000

Decision Tree Regressor:
MSE: 1639312.3325
MAE: 560.8090
R-squared: 0.9792

Random Forest Regressor:
MSE: 1027734.5548
MAE: 434.7275
R-squared: 0.9870

Support Vector Regressor:
MSE: 86662891.7056
MAE: 5687.4158
R-squared: -0.0978

Gradient Boosting Regressor:
MSE: 799625.3796
MAE: 327.9635
R-squared: 0.9899



In [10]:
# For Random Forest Regressor
feature_importance = rf_model.feature_importances_
feature_names = X.columns
feature_importance_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importance})
feature_importance_df = feature_importance_df.sort_values('importance', ascending=False)

print("Feature Importance:")
print(feature_importance_df)

Feature Importance:
                       feature  importance
16                scaled_price    0.893599
7                   enginesize    0.072730
11                  horsepower    0.008594
6                   curbweight    0.008120
15           scaled_horsepower    0.005274
..                         ...         ...
145   CarName_vokswagen rabbit    0.000000
150  CarName_volkswagen rabbit    0.000000
54     CarName_honda accord lx    0.000000
48          CarName_dodge d200    0.000000
96         CarName_nissan otti    0.000000

[192 rows x 2 columns]


In [15]:
from sklearn.model_selection import GridSearchCV

def tune_hyperparameters(model, param_grid):
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_

# Linear Regression
param_grid_lr = {'fit_intercept': [True, False]}
lr_tuned = tune_hyperparameters(LinearRegression(), param_grid_lr)

# Decision Tree Regressor
param_grid_dt = {'max_depth': [3, 5, 10, None],
                 'min_samples_split': [2, 5, 10],
                 'min_samples_leaf': [1, 2, 4]}
dt_tuned = tune_hyperparameters(DecisionTreeRegressor(), param_grid_dt)

# Random Forest Regressor
param_grid_rf = {'n_estimators': [100, 200, 300], 
                 'max_depth': [None, 5, 10, 20],
                 'min_samples_split': [2, 5, 10],
                 'min_samples_leaf': [1, 2, 4]}
rf_tuned = "tune_hyperparameter"
print("Hyperparameter tuning completed.")

Hyperparameter tuning completed.
