In [2]:
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plot
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn import metrics
import pickle

# Load the dataset
df = pd.read_csv("car data.csv")

# Preprocess the dataset
final_dataset = df[['Year', 'Selling_Price', 'Present_Price', 'Kms_Driven',
                    'Fuel_Type', 'Seller_Type', 'Transmission', 'Owner']]
final_dataset['Current_Year'] = 2023
final_dataset['number_of_years'] = final_dataset['Current_Year'] - final_dataset['Year']
final_dataset.drop(['Year', 'Current_Year'], axis=1, inplace=True)
final_dataset = pd.get_dummies(final_dataset, drop_first=True)

# Split the dataset into independent and dependent features
X = final_dataset.iloc[:, 1:]
y = final_dataset.iloc[:, 0]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create the random grid for hyperparameter tuning
n_estimators = [int(x) for x in np.linspace(start=100, stop=1200, num=12)]
max_features = ['auto', 'sqrt', 'log2', None]
max_depth = [int(x) for x in np.linspace(5, 30, num=6)]
min_samples_split = [2, 5, 10, 15, 100]
min_samples_leaf = [1, 2, 5, 10]

random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf
}

# Create the base model to tune
rf = RandomForestRegressor()

# Random search of parameters using 5-fold cross-validation
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                               scoring='neg_mean_squared_error', n_iter=10,
                               cv=5, verbose=2, random_state=42, n_jobs=1)

# Fit the random search model to the training data
rf_random.fit(X_train, y_train)

# Get the best hyperparameters
best_params = rf_random.best_params_

# Create the final Random Forest model with the best hyperparameters
rf_final = RandomForestRegressor(n_estimators=best_params['n_estimators'],
                                 max_features=best_params['max_features'],
                                 max_depth=best_params['max_depth'],
                                 min_samples_split=best_params['min_samples_split'],
                                 min_samples_leaf=best_params['min_samples_leaf'],
                                 random_state=42)

# Train the final model on the training data
rf_final.fit(X_train, y_train)

# Make predictions on the test data
predictions = rf_final.predict(X_test)

# Evaluate the model's performance
mae = metrics.mean_absolute_error(y_test, predictions)
mse = metrics.mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', rmse)

# Save the final model to a file
with open('random_forest_regression_model.pkl', 'wb') as file:
    pickle.dump(rf_final, file)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=5, max_features=None, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=   1.5s
[CV] END max_depth=5, max_features=None, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=   1.3s
[CV] END max_depth=5, max_features=None, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=   1.5s
[CV] END max_depth=5, max_features=None, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=   1.3s
[CV] END max_depth=5, max_features=None, min_samples_leaf=5, min_samples_split=5, n_estimators=900; total time=   1.3s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=2, min_samples_split=100, n_estimators=300; total time=   0.2s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=2, min_samples_split=100, n_estimators=300; total time=   0.3s
[CV] END max_depth=30, max_features=log2, min_samples_leaf=2, min_samples_split=100, n_estimators=30

5 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\srsha\anaconda3\envs\carprediction\lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\srsha\anaconda3\envs\carprediction\lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\srsha\anaconda3\envs\carprediction\lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\srsha\anaconda3\envs\carprediction\lib\site-packages\sklearn\utils\_param_validation.

Mean Absolute Error: 0.7947042579286103
Mean Squared Error: 1.6910783352652745
Root Mean Squared Error: 1.3004146781951034
