In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Spliting Dataset for Training & Testing

In [56]:
path = "E:/UDHAYA/Cardeko_project/Cleaned_csv/ML_Dataset.csv"
df = pd.read_csv(path)
# Split the dataset into features (X) and target (y)
# Adjust 'target_column' to actual target column name
target_column = 'price'  # Replace 'target' with the actual target column name
X = df.drop(columns=[target_column])
y = df[target_column]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the splits
print("Training features shape:", X_train.shape)
print("Testing features shape:", X_test.shape)
print("Training labels shape:", y_train.shape)
print("Testing labels shape:", y_test.shape)

# Now data is preprocessed and ready for a machine learning model!

Training features shape: (6599, 527)
Testing features shape: (1650, 527)
Training labels shape: (6599,)
Testing labels shape: (1650,)


In [58]:
# Define models to train
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(alpha=0.1, max_iter=2000),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42)
}

# Train and evaluate each model
results = {}
for name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on the test set
    y_pred = model.predict(X_test)
    
    # Calculate evaluation metrics
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)
    
    # Store results
    results[name] = {'MAE': mae, 'RMSE': rmse, 'R2': r2}

# Display results in a DataFrame
results_df = pd.DataFrame(results).T
print("Model Performance:\n", results_df)

  model = cd_fast.enet_coordinate_descent(


Model Performance:
                             MAE          RMSE            R2
Linear Regression  1.154332e+15  9.991625e+15 -3.407082e+19
Ridge              2.775181e+05  8.039563e+05  7.794154e-01
Lasso              2.664743e+05  7.705868e+05  7.973468e-01
Decision Tree      2.272109e+05  8.697106e+05  7.418573e-01
Random Forest      1.716606e+05  6.639516e+05  8.495532e-01
Gradient Boosting  2.683497e+05  7.002488e+05  8.326541e-01


In [60]:
#convert dataframe to csv
path = "E:/UDHAYA/Cardeko_project/Cleaned_csv/Models_results.csv"
results_df.to_csv(path,index=False)

In [62]:
best_model = results_df.loc[results_df['R2'].idxmax()]
print("Best model with high r2 score")
best_model

Best model with high r2 score


MAE     171660.636099
RMSE    663951.555290
R2           0.849553
Name: Random Forest, dtype: float64

# Hyper tuning for the best model Radom forest

In [71]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

def evaluate_model(model, X_train, y_train, X_test, y_test):
    # Fit the model on the training data
    model.fit(X_train, y_train)
    
    # Make predictions on the test data
    y_pred = model.predict(X_test)
    
    # Calculate evaluation metrics
    mae = mean_absolute_error(y_test, y_pred)
    rmse = mean_squared_error(y_test, y_pred, squared=False)  # RMSE
    r2 = r2_score(y_test, y_pred)
    
    # Print the metrics
    print("Mean Absolute Error (MAE):", mae)
    print("Root Mean Squared Error (RMSE):", rmse)
    print("R² Score:", r2)
    
    return mae, rmse, r2

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

# Initialize the Random Forest model
rf_model = RandomForestRegressor(random_state=42)

# Set up the grid search with cross-validation
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, scoring='r2', n_jobs=-1, verbose=2)

# Fit grid search
grid_search.fit(X_train, y_train)

# Retrieve the best parameters and best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Evaluate the best model on the test set
best_model_performance = evaluate_model(best_model, X_train, y_train, X_test, y_test)
print("Best Parameters:", best_params)
print("Best Model Performance:", best_model_performance)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Mean Absolute Error (MAE): 172356.02328303482
Root Mean Squared Error (RMSE): 656960.1610007397
R² Score: 0.852704888695496
Best Parameters: {'max_depth': 30, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Best Model Performance: (172356.02328303482, 656960.1610007397, 0.852704888695496)




# Training the best model with the best hyper tuning parameters

In [75]:
# final model fit and prediction
final_model=RandomForestRegressor(n_estimators=300,max_features=None,max_depth=30,min_samples_leaf=1,
                                  min_samples_split=2,random_state=42)

final_model.fit(X_train,y_train)

#Model evaluation
y_pred_fm=final_model.predict(X_test)
MSE_fm=mean_squared_error(y_test,y_pred_fm)
MAE_fm=mean_absolute_error(y_test,y_pred_fm)
RMSE_fm=np.sqrt(mean_squared_error(y_test,y_pred_fm))
r_squr_fm=r2_score(y_test,y_pred_fm)

print(f"Trainscore: {final_model.score(X_train, y_train)}")
print(f"Testscore: {final_model.score(X_test, y_test)}")

# create dataframe to view above evaluation metrics
rf = pd.DataFrame({"model":"RandomForestRegressor","MAE":[MAE_fm],"MSE":[MSE_fm],"RMSE":[RMSE_fm],"R2":r_squr_fm})
rf

Trainscore: 0.9604611462433542
Testscore: 0.852704888695496


Unnamed: 0,model,MAE,MSE,RMSE,R2
0,RandomForestRegressor,172356.023283,431596700000.0,656960.161001,0.852705


In [89]:
# Load the dataset
file_path = "E:/UDHAYA/Cardeko_project/Cleaned_csv/Processed_dataset.csv"
df = pd.read_csv(file_path)

# Define the target column
target_col = 'price'  # Replace 'price' with the actual name of your target column

# Separate target and feature columns
X = df.drop(columns=[target_col])
y = df[target_col]

# Identify categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Ensure the target column is removed from the list of numerical columns
numerical_cols = [col for col in numerical_cols if col != target_col]

# Create preprocessing for numerical and categorical features
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps into a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

# Create the pipeline with preprocessing and model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(random_state=42))
])

# Split the data into training and testing sets
x_df_train, x_df_test, y_df_train, y_df_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [91]:
# fit with dataframe
pipeline.fit(x_df_train,y_df_train)

In [92]:
# evaluate the pipeline 
y_prediction=pipeline.predict(x_df_test)

MSE_pipe=mean_squared_error(y_df_test,y_prediction)

MAE_pipe=mean_absolute_error(y_df_test,y_prediction)

RMSE_pipe=np.sqrt(mean_squared_error(y_df_test,y_prediction))

r_squr_pipe=r2_score(y_df_test,y_prediction)

# create dataframe to view above evaluation metrics
Rf_Pipe=pd.DataFrame({"model":"RandomForestRegressor","MAE":[MAE_pipe],"MSE":[MSE_pipe],"RMSE":[RMSE_pipe],"R2":r_squr_pipe})
Rf_Pipe

Unnamed: 0,model,MAE,MSE,RMSE,R2
0,RandomForestRegressor,163349.645358,417274100000.0,645967.539179,0.857593


# Model testing and Prediction

In [126]:
# New data to predict car price
new_df=pd.DataFrame({
    'Fuel type':'Disel',
    'body type':'Sedan',
    'transmission':'Automatic',
    'ownerNo':1,
    'Brand':'BMW',
    'model':'BMW 5 Series',
    'modelYear':2020,
    'Insurance Type': 'Third Party insurance',
    'Kms Driven':35000.0,
    'Mileage':18,
    'Seats':5,
    'Color':'White',
    'City':'Bangalore'
},index=[0])
new_df

Unnamed: 0,Fuel type,body type,transmission,ownerNo,Brand,model,modelYear,Insurance Type,Kms Driven,Mileage,Seats,Color,City
0,Disel,Sedan,Automatic,1,BMW,BMW 5 Series,2020,Third Party insurance,35000.0,18,5,White,Bangalore


In [128]:
# FINAL MODEL PREDICTION 
prediction=pipeline.predict(new_df)
print(f"The price of the {new_df['Brand'].iloc[0]} car is: ₹{round(prediction[0],2)} ")

The price of the BMW car is: ₹4647300.0 


In [132]:
import joblib
joblib.dump(pipeline, 'E:/UDHAYA/Cardeko_project/pipeline_model.pkl')

['E:/UDHAYA/Cardeko_project/pipeline_model.pkl']