# 1. Grid CV

In [1]:
import pandas as pd
# Load the dataset
data = pd.read_csv("./Vol_fuel_efficiency_data.csv")
data

Unnamed: 0,Timestamp,Truck_ID,Engine_Type,Maintenance_History,Weather_Temperature,Average_Speed,Fuel_Consumption
0,2023-01-01 00:00:00,TRK001,Gasoline,No history,23.889392,92.482529,9.712944
1,2023-01-01 01:00:00,TRK001,Gasoline,No history,22.023105,75.861589,10.728945
2,2023-01-01 02:00:00,TRK001,Diesel,No history,11.289818,54.941086,11.583613
3,2023-01-01 03:00:00,TRK001,Diesel,No history,10.137305,52.658868,11.176255
4,2023-01-01 04:00:00,TRK001,Diesel,No history,20.150700,86.757627,10.295938
...,...,...,...,...,...,...,...
115,2023-01-01 19:00:00,TRK005,Gasoline,No history,19.633839,57.852596,10.881121
116,2023-01-01 20:00:00,TRK005,Diesel,Service recently,25.326517,70.795798,11.658100
117,2023-01-01 21:00:00,TRK005,Gasoline,Service recently,23.263413,73.347791,9.326941
118,2023-01-01 22:00:00,TRK005,Gasoline,Service recently,24.613533,63.821579,11.992151


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Load the dataset
data = pd.read_csv("./Vol_fuel_efficiency_data.csv")

# Preprocessing: Convert timestamp to datetime
data["Timestamp"] = pd.to_datetime(data["Timestamp"])

# Feature engineering: Extract day and hour from timestamp
data["Day"] = data["Timestamp"].dt.day
data["Hour"] = data["Timestamp"].dt.hour

# Drop unnecessary columns
data = data.drop(["Timestamp", "Truck_ID"], axis=1)

# Encode categorical features
data = pd.get_dummies(data, columns=["Engine_Type", "Maintenance_History"])

# Separate features and target variable
X = data.drop("Fuel_Consumption", axis=1)
y = data["Fuel_Consumption"]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define parameter grid for Grid Search
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [1, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

# Initialize the model
model = RandomForestRegressor(random_state=42)

# Initialize Grid Search with cross-validation
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the Grid Search to the training data
grid_search.fit(X_train, y_train)

# Get the best model from Grid Search
best_model = grid_search.best_estimator_

# Predict on test data
y_pred = best_model.predict(X_test)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print(f"Best Model MSE: {mse:.2f}")

# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)


Best Model MSE: 1.39
Best Hyperparameters: {'max_depth': 1, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}


### Interpretation:

After applying Grid Search, you'll get the best hyperparameters for the Random Forest Regressor model. The selected hyperparameters will be those that minimize the Mean Squared Error on the validation sets during cross-validation.

The param_grid specifies different values for the hyperparameters n_estimators, max_depth, min_samples_split, and min_samples_leaf. The Grid Search algorithm tries out all combinations of these hyperparameters and selects the combination that results in the lowest Mean Squared Error.

The output will include the best hyperparameters found by Grid Search and the Mean Squared Error of the best model on the test set. This helps you assess how well the tuned model performs on new, unseen data.

Remember that the specific results and interpretations will depend on the dataset, the range of hyperparameters you provide, and the characteristics of the algorithm you're using.

In [8]:
# Load the dataset
data = pd.read_csv("./Vol_fuel_efficiency_data.csv")
data

Unnamed: 0,Timestamp,Truck_ID,Engine_Type,Maintenance_History,Weather_Temperature,Average_Speed,Fuel_Consumption
0,2023-01-01 00:00:00,TRK001,Gasoline,No history,23.889392,92.482529,9.712944
1,2023-01-01 01:00:00,TRK001,Gasoline,No history,22.023105,75.861589,10.728945
2,2023-01-01 02:00:00,TRK001,Diesel,No history,11.289818,54.941086,11.583613
3,2023-01-01 03:00:00,TRK001,Diesel,No history,10.137305,52.658868,11.176255
4,2023-01-01 04:00:00,TRK001,Diesel,No history,20.150700,86.757627,10.295938
...,...,...,...,...,...,...,...
115,2023-01-01 19:00:00,TRK005,Gasoline,No history,19.633839,57.852596,10.881121
116,2023-01-01 20:00:00,TRK005,Diesel,Service recently,25.326517,70.795798,11.658100
117,2023-01-01 21:00:00,TRK005,Gasoline,Service recently,23.263413,73.347791,9.326941
118,2023-01-01 22:00:00,TRK005,Gasoline,Service recently,24.613533,63.821579,11.992151


# 2. Random Search

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Load the dataset
data = pd.read_csv("./Vol_fuel_efficiency_data.csv")

# Preprocessing: Convert timestamp to datetime
data["Timestamp"] = pd.to_datetime(data["Timestamp"])

# Feature engineering: Extract day and hour from timestamp
data["Day"] = data["Timestamp"].dt.day
data["Hour"] = data["Timestamp"].dt.hour

# Drop unnecessary columns
data = data.drop(["Timestamp", "Truck_ID"], axis=1)

# Encode categorical features
data = pd.get_dummies(data, columns=["Engine_Type", "Maintenance_History"])

# Separate features and target variable
X = data.drop("Fuel_Consumption", axis=1)
y = data["Fuel_Consumption"]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define parameter distributions for Random Search
param_dist = {
    "n_estimators": [100, 200, 600],
    "max_depth": [None, 10, 20],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4]
}

# Initialize the model
model = RandomForestRegressor(random_state=42)

# Initialize Random Search with cross-validation
random_search = RandomizedSearchCV(model, param_distributions=param_dist,
                                   n_iter=10, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

# Fit the Random Search to the training data
random_search.fit(X_train, y_train)

# Get the best model from Random Search
best_model = random_search.best_estimator_

# Predict on test data
y_pred = best_model.predict(X_test)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print(f"Best Model MSE: {mse:.2f}")

# Print the best hyperparameters
print("Best Hyperparameters:", random_search.best_params_)


Best Model MSE: 1.45
Best Hyperparameters: {'n_estimators': 600, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_depth': 20}


### Interpretation:

After applying Random Search, you'll get the best hyperparameters for the Random Forest Regressor model. The selected hyperparameters will be based on randomly sampled combinations from the specified parameter distributions.

The param_dist specifies different values for the hyperparameters n_estimators, max_depth, min_samples_split, and min_samples_leaf. The Random Search algorithm randomly samples values from these distributions and selects the combination that results in the lowest Mean Squared Error on the validation sets during cross-validation.

The output will include the best hyperparameters found by Random Search and the Mean Squared Error of the best model on the test set. This helps you assess how well the tuned model performs on new, unseen data.

The key difference between Random Search and Grid Search is that Random Search randomly samples hyperparameter values, making it more efficient for searching through a large hyperparameter space. However, it might not guarantee finding the absolute best combination but is often more practical for high-dimensional problems.

# 3. Bayesian Optimization

In [25]:
import pandas as pd
data = pd.read_csv("automobile_part_life_prediction.csv")


In [26]:
data

Unnamed: 0,Date,Part_Type,Car_Model,Mileage,Temperature,Humidity,Usage_Hours,Part_Life
0,19-09-2022,Electronics,Hatchback,119877,21.142292,61.827068,1193,1268.012708
1,17-01-2018,Brakes,Truck,48098,4.456887,70.661045,981,1085.693410
2,27-03-2013,Engine,Sedan,54209,6.097699,2.685836,1151,1097.985648
3,18-09-2013,Engine,Truck,111030,74.792728,9.659201,709,708.579241
4,21-11-2012,Suspension,Sedan,57342,93.622667,75.015403,1802,1451.863833
...,...,...,...,...,...,...,...,...
995,17-07-2013,Brakes,Sedan,59597,3.471163,68.462961,1833,1664.782270
996,08-02-2017,Suspension,SUV,144618,29.116614,2.092660,618,550.358400
997,06-02-2018,Engine,Truck,100587,91.947420,91.476072,856,1022.777301
998,02-03-2013,Transmission,Sedan,89624,44.383940,53.679957,809,849.308150


In [28]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from bayes_opt import BayesianOptimization

# Load the dataset
data = pd.read_csv("./automobile_part_life_prediction.csv")

# Preprocessing: Convert Date to datetime
data["Date"] = pd.to_datetime(data["Date"])

# Feature engineering: Extract day, month, and year from Date
data["Year"] = data["Date"].dt.year
data["Month"] = data["Date"].dt.month
data["Day"] = data["Date"].dt.day

# Drop unnecessary columns
data = data.drop("Date", axis=1)

# Encode categorical features
data = pd.get_dummies(data, columns=["Part_Type", "Car_Model"])

# Separate features and target variable
X = data.drop("Part_Life", axis=1)
y = data["Part_Life"]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the objective function for Bayesian Optimization
def optimize_rf(n_estimators, max_depth, min_samples_split, min_samples_leaf):
    model = RandomForestRegressor(
        n_estimators=int(n_estimators),
        max_depth=int(max_depth),
        min_samples_split=int(min_samples_split),
        min_samples_leaf=int(min_samples_leaf),
        random_state=42,
    )
    model.fit(X_train, y_train)
    return -model.score(X_test, y_test)  # Negative since BayesianOptimization minimizes

# Define the search space for Bayesian Optimization
pbounds = {
    'n_estimators': (10, 2000),
    'max_depth': (1, 500),
    'min_samples_split': (2, 200),
    'min_samples_leaf': (1, 200),
}

# Initialize Bayesian Optimization object
optimizer = BayesianOptimization(
    f=optimize_rf,
    pbounds=pbounds,
    random_state=42,
)

# Perform optimization iterations
optimizer.maximize(
    init_points=5,
    n_iter=10,
)

# Print the best hyperparameters and score
print("Best Hyperparameters:", optimizer.max['params'])
print("Best Score:", -optimizer.max['target'])


  data["Date"] = pd.to_datetime(data["Date"])


|   iter    |  target   | max_depth | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------
| [0m1        [0m | [0m-0.6695  [0m | [0m187.9    [0m | [0m190.2    [0m | [0m146.9    [0m | [0m1.201e+03[0m |
| [0m2        [0m | [0m-0.8679  [0m | [0m78.85    [0m | [0m32.04    [0m | [0m13.5     [0m | [0m1.734e+03[0m |
| [0m3        [0m | [0m-0.682   [0m | [0m301.0    [0m | [0m141.9    [0m | [0m6.076    [0m | [0m1.94e+03 [0m |
| [0m4        [0m | [0m-0.8655  [0m | [0m416.4    [0m | [0m43.26    [0m | [0m38.0     [0m | [0m375.0    [0m |
| [0m5        [0m | [0m-0.8305  [0m | [0m152.8    [0m | [0m105.4    [0m | [0m87.53    [0m | [0m589.5    [0m |
| [0m6        [0m | [0m-0.7519  [0m | [0m361.8    [0m | [0m133.9    [0m | [0m46.12    [0m | [0m137.2    [0m |
| [0m7        [0m | [0m-0.8645  [0m | [0m25.56    [0m | [0m4.678    [0m | [0m120.9    [0m | [0m1.45e+03 [0m 

### In this example, we're using the Rosenbrock function, which is a standard optimization benchmark. Bayesian Optimization tries to find the minimum value of this function within the defined search space.

Here's a breakdown of the code:

Define the objective function (rosenbrock in this case).
Define the search space (pbounds) by specifying the ranges for each parameter to be optimized.
Initialize the Bayesian Optimization object (optimizer) with the objective function and search space.
Use the maximize method to perform optimization iterations. We specify the number of initial points (init_points) and the number of subsequent iterations (n_iter).
The max attribute of the optimizer object contains the optimal point and function value.
Bayesian Optimization builds a probabilistic model of the objective function based on the initial points and then selects new points to evaluate in a way that balances exploration and exploitation. It's particularly useful when the objective function is expensive to evaluate, as it aims to find the optimal point with fewer evaluations compared to grid search or random search.

In [2]:
import pandas as pd
# Load the dataset
data = pd.read_csv("./automobile_part_life_prediction.csv")
data

Unnamed: 0,Date,Part_Type,Car_Model,Mileage,Temperature,Humidity,Usage_Hours,Part_Life
0,19-09-2022,Electronics,Hatchback,119877,21.142292,61.827068,1193,1268.012708
1,17-01-2018,Brakes,Truck,48098,4.456887,70.661045,981,1085.693410
2,27-03-2013,Engine,Sedan,54209,6.097699,2.685836,1151,1097.985648
3,18-09-2013,Engine,Truck,111030,74.792728,9.659201,709,708.579241
4,21-11-2012,Suspension,Sedan,57342,93.622667,75.015403,1802,1451.863833
...,...,...,...,...,...,...,...,...
995,17-07-2013,Brakes,Sedan,59597,3.471163,68.462961,1833,1664.782270
996,08-02-2017,Suspension,SUV,144618,29.116614,2.092660,618,550.358400
997,06-02-2018,Engine,Truck,100587,91.947420,91.476072,856,1022.777301
998,02-03-2013,Transmission,Sedan,89624,44.383940,53.679957,809,849.308150


# 4.1 Gradient Descent

In [30]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load the dataset
data = pd.read_csv("./automobile_part_life_prediction.csv")

# Preprocessing: Convert Date to datetime
data["Date"] = pd.to_datetime(data["Date"])

# Feature engineering: Extract day, month, and year from Date
data["Year"] = data["Date"].dt.year
data["Month"] = data["Date"].dt.month
data["Day"] = data["Date"].dt.day

# Drop unnecessary columns
data = data.drop("Date", axis=1)

# Encode categorical features
data = pd.get_dummies(data, columns=["Part_Type", "Car_Model"])

# Separate features and target variable
X = data.drop("Part_Life", axis=1)
y = data["Part_Life"]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Linear Regression model
model = LinearRegression()

# Fit the model to the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)


Mean Squared Error: 26596.967671163202


  data["Date"] = pd.to_datetime(data["Date"])


### In this code:

We preprocess the dataset similarly to the previous examples.
We split the data into features (X) and the target (y).
We initialize a Linear Regression model using sklearn's LinearRegression class.
We fit the model to the training data.
We make predictions on the test data and calculate the Mean Squared Error (MSE) to evaluate the model's performance.
Gradient Descent is the default optimization method used by LinearRegression in scikit-learn. The model is trained to minimize the MSE, which measures the average squared difference between the predicted and actual values.

This code applies Gradient Descent behind the scenes to optimize the model's coefficients and find the best-fitting linear relationship between the features and the target variable.

# 4.2 Stochastic Gradient Descent (SGD)

In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import mean_squared_error

# Load the dataset
data = pd.read_csv("./automobile_part_life_prediction.csv")

# Preprocessing: Convert Date to datetime
data["Date"] = pd.to_datetime(data["Date"])

# Feature engineering: Extract day, month, and year from Date
data["Year"] = data["Date"].dt.year
data["Month"] = data["Date"].dt.month
data["Day"] = data["Date"].dt.day

# Drop unnecessary columns
data = data.drop("Date", axis=1)

# Encode categorical features
data = pd.get_dummies(data, columns=["Part_Type", "Car_Model"])

# Separate features and target variable
X = data.drop("Part_Life", axis=1)
y = data["Part_Life"]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the SGDRegressor model
model = SGDRegressor(max_iter=1000, random_state=42)

# Fit the model to the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)


Mean Squared Error: 3.641682216831581e+36


  data["Date"] = pd.to_datetime(data["Date"])


### In this code:

We preprocess the dataset similarly to previous examples.
We split the data into features (X) and the target (y).
We initialize an SGDRegressor model using sklearn's SGDRegressor class.
We fit the model to the training data using Stochastic Gradient Descent optimization.
We make predictions on the test data and calculate the Mean Squared Error (MSE) to evaluate the model's performance.
SGDRegressor uses Stochastic Gradient Descent to optimize the model's coefficients iteratively. It updates the coefficients using a small batch of training data at a time, which makes it suitable for large datasets.

In [5]:
import pandas as pd
# Load the dataset
data = pd.read_csv("./automobile_part_life_prediction.csv")
data

Unnamed: 0,Date,Part_Type,Car_Model,Mileage,Temperature,Humidity,Usage_Hours,Part_Life
0,19-09-2022,Electronics,Hatchback,119877,21.142292,61.827068,1193,1268.012708
1,17-01-2018,Brakes,Truck,48098,4.456887,70.661045,981,1085.693410
2,27-03-2013,Engine,Sedan,54209,6.097699,2.685836,1151,1097.985648
3,18-09-2013,Engine,Truck,111030,74.792728,9.659201,709,708.579241
4,21-11-2012,Suspension,Sedan,57342,93.622667,75.015403,1802,1451.863833
...,...,...,...,...,...,...,...,...
995,17-07-2013,Brakes,Sedan,59597,3.471163,68.462961,1833,1664.782270
996,08-02-2017,Suspension,SUV,144618,29.116614,2.092660,618,550.358400
997,06-02-2018,Engine,Truck,100587,91.947420,91.476072,856,1022.777301
998,02-03-2013,Transmission,Sedan,89624,44.383940,53.679957,809,849.308150


# 5. Ensemble methods:
     bagging, 
     boosting, and 
     stacking


In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import BaggingRegressor, AdaBoostRegressor, StackingRegressor
from sklearn.metrics import mean_squared_error

# Load the dataset
data = pd.read_csv("./automobile_part_life_prediction.csv")

# Preprocessing: Convert Date to datetime
data["Date"] = pd.to_datetime(data["Date"])

# Feature engineering: Extract day, month, and year from Date
data["Year"] = data["Date"].dt.year
data["Month"] = data["Date"].dt.month
data["Day"] = data["Date"].dt.day

# Drop unnecessary columns
data = data.drop("Date", axis=1)

# Encode categorical features
data = pd.get_dummies(data, columns=["Part_Type", "Car_Model"])

# Separate features and target variable
X = data.drop("Part_Life", axis=1)
y = data["Part_Life"]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize base linear regression model
base_model = LinearRegression()

# Initialize Bagging Regressor
bagging_model = BaggingRegressor(base_model, n_estimators=10, random_state=42)

# Initialize AdaBoost Regressor
adaboost_model = AdaBoostRegressor(base_model, n_estimators=10, random_state=42)

# Create a list of models for stacking
estimators = [('bagging', bagging_model), ('adaboost', adaboost_model)]
stacking_model = StackingRegressor(estimators=estimators, final_estimator=LinearRegression())

# Fit the models
base_model.fit(X_train, y_train)
bagging_model.fit(X_train, y_train)
adaboost_model.fit(X_train, y_train)
stacking_model.fit(X_train, y_train)

# Make predictions on the test data
y_pred_base = base_model.predict(X_test)
y_pred_bagging = bagging_model.predict(X_test)
y_pred_adaboost = adaboost_model.predict(X_test)
y_pred_stacking = stacking_model.predict(X_test)

# Calculate Mean Squared Error for each model
mse_base = mean_squared_error(y_test, y_pred_base)
mse_bagging = mean_squared_error(y_test, y_pred_bagging)
mse_adaboost = mean_squared_error(y_test, y_pred_adaboost)
mse_stacking = mean_squared_error(y_test, y_pred_stacking)

print("Mean Squared Error (Base Model):", mse_base)
print("Mean Squared Error (Bagging Model):", mse_bagging)
print("Mean Squared Error (AdaBoost Model):", mse_adaboost)
print("Mean Squared Error (Stacking Model):", mse_stacking)


  data["Date"] = pd.to_datetime(data["Date"])


Mean Squared Error (Base Model): 26596.967671163202
Mean Squared Error (Bagging Model): 26636.106153008477
Mean Squared Error (AdaBoost Model): 26649.665269250385
Mean Squared Error (Stacking Model): 26662.215858176303


### Interpretation:

We preprocess the dataset similarly to previous examples.
We split the data into features (X) and the target (y).
We initialize a base Linear Regression model.
We create Bagging and AdaBoost regressors, both using the base model.
We create a Stacking Regressor with Bagging and AdaBoost models as base models and a final Linear Regression model as the meta-estimator.
We fit all models to the training data.
We make predictions using each model on the test data and calculate the Mean Squared Error (MSE) for evaluation.
You can interpret the results by comparing the MSE values for different models. Lower MSE values indicate better model performance. Ensemble techniques like Bagging, Boosting, and Stacking aim to improve predictive performance by combining the strengths of multiple models.

In [6]:
import pandas as pd
# Load the dataset
data = pd.read_csv("./automobile_part_life_prediction.csv")
data

Unnamed: 0,Date,Part_Type,Car_Model,Mileage,Temperature,Humidity,Usage_Hours,Part_Life
0,19-09-2022,Electronics,Hatchback,119877,21.142292,61.827068,1193,1268.012708
1,17-01-2018,Brakes,Truck,48098,4.456887,70.661045,981,1085.693410
2,27-03-2013,Engine,Sedan,54209,6.097699,2.685836,1151,1097.985648
3,18-09-2013,Engine,Truck,111030,74.792728,9.659201,709,708.579241
4,21-11-2012,Suspension,Sedan,57342,93.622667,75.015403,1802,1451.863833
...,...,...,...,...,...,...,...,...
995,17-07-2013,Brakes,Sedan,59597,3.471163,68.462961,1833,1664.782270
996,08-02-2017,Suspension,SUV,144618,29.116614,2.092660,618,550.358400
997,06-02-2018,Engine,Truck,100587,91.947420,91.476072,856,1022.777301
998,02-03-2013,Transmission,Sedan,89624,44.383940,53.679957,809,849.308150


# 6. Automated Hyperparameter Tuning

# 6.1 Optuna:
Optuna is an open-source hyperparameter optimization framework. It uses a combination of Bayesian optimization and Pruned Recursive Search to intelligently search the hyperparameter space. It's known for its flexibility and ease of use.

In [48]:
import pandas as pd
import optuna
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Load the dataset
data = pd.read_csv("./automobile_part_life_prediction.csv")

# Preprocessing: Convert Date to datetime
data["Date"] = pd.to_datetime(data["Date"])

# Feature engineering: Extract day, month, and year from Date
data["Year"] = data["Date"].dt.year
data["Month"] = data["Date"].dt.month
data["Day"] = data["Date"].dt.day

# Drop unnecessary columns
data = data.drop("Date", axis=1)

# Encode categorical features
data = pd.get_dummies(data, columns=["Part_Type", "Car_Model"])

# Separate features and target variable
X = data.drop("Part_Life", axis=1)
y = data["Part_Life"]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def objective(trial):
    # Define hyperparameter search space
    max_depth = trial.suggest_int("max_depth", 1, 32)
    min_samples_split = trial.suggest_float("min_samples_split", 0.1, 1.0)
    min_samples_leaf = trial.suggest_float("min_samples_leaf", 0.1, 0.5)
    
    # Initialize Decision Tree Regressor with suggested hyperparameters
    model = DecisionTreeRegressor(max_depth=max_depth,
                                   min_samples_split=min_samples_split,
                                   min_samples_leaf=min_samples_leaf,
                                   random_state=42)
    
    # Fit the model to the training data
    model.fit(X_train, y_train)
    
    # Make predictions on the test data
    y_pred = model.predict(X_test)
    
    # Calculate Mean Squared Error
    mse = mean_squared_error(y_test, y_pred)
    return mse

# Initialize Optuna study
study = optuna.create_study(direction="minimize")

# Start optimization
study.optimize(objective, n_trials=100)

# Print optimization results
print("Number of finished trials:", len(study.trials))
print("Best trial:")
trial = study.best_trial

print("Value: ", trial.value)
print("Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")


  data["Date"] = pd.to_datetime(data["Date"])
[I 2023-08-14 16:45:01,506] A new study created in memory with name: no-name-bf710fbb-739c-4661-9591-041a3b02076d
[I 2023-08-14 16:45:01,511] Trial 0 finished with value: 37083.28371654218 and parameters: {'max_depth': 27, 'min_samples_split': 0.35169969051843675, 'min_samples_leaf': 0.13931946656867644}. Best is trial 0 with value: 37083.28371654218.
[I 2023-08-14 16:45:01,515] Trial 1 finished with value: 77369.99800068769 and parameters: {'max_depth': 22, 'min_samples_split': 0.326154848975291, 'min_samples_leaf': 0.39152702845213816}. Best is trial 0 with value: 37083.28371654218.
[I 2023-08-14 16:45:01,521] Trial 2 finished with value: 77369.99800068769 and parameters: {'max_depth': 4, 'min_samples_split': 0.6814755523798534, 'min_samples_leaf': 0.2658679685628593}. Best is trial 0 with value: 37083.28371654218.
[I 2023-08-14 16:45:01,525] Trial 3 finished with value: 77369.99800068769 and parameters: {'max_depth': 31, 'min_samples_spl

[I 2023-08-14 16:45:01,928] Trial 33 finished with value: 37083.28371654218 and parameters: {'max_depth': 4, 'min_samples_split': 0.2987371778670846, 'min_samples_leaf': 0.15189464916115367}. Best is trial 10 with value: 29431.31438088006.
[I 2023-08-14 16:45:01,945] Trial 34 finished with value: 29508.53112094879 and parameters: {'max_depth': 13, 'min_samples_split': 0.1548489545643431, 'min_samples_leaf': 0.10201390230402065}. Best is trial 10 with value: 29431.31438088006.
[I 2023-08-14 16:45:01,961] Trial 35 finished with value: 31278.216125245457 and parameters: {'max_depth': 17, 'min_samples_split': 0.24375141488895508, 'min_samples_leaf': 0.12911471543818706}. Best is trial 10 with value: 29431.31438088006.
[I 2023-08-14 16:45:01,976] Trial 36 finished with value: 37083.28371654218 and parameters: {'max_depth': 9, 'min_samples_split': 0.17510195173023027, 'min_samples_leaf': 0.15893847734230848}. Best is trial 10 with value: 29431.31438088006.
[I 2023-08-14 16:45:01,993] Trial 3

[I 2023-08-14 16:45:02,511] Trial 67 finished with value: 31278.216125245457 and parameters: {'max_depth': 16, 'min_samples_split': 0.23718746139912827, 'min_samples_leaf': 0.13320019539763622}. Best is trial 51 with value: 29321.083311840473.
[I 2023-08-14 16:45:02,530] Trial 68 finished with value: 30247.424047176613 and parameters: {'max_depth': 19, 'min_samples_split': 0.1392012622482795, 'min_samples_leaf': 0.11255467656540599}. Best is trial 51 with value: 29321.083311840473.
[I 2023-08-14 16:45:02,546] Trial 69 finished with value: 29321.083311840473 and parameters: {'max_depth': 24, 'min_samples_split': 0.16499039895434642, 'min_samples_leaf': 0.11037745235759298}. Best is trial 51 with value: 29321.083311840473.
[I 2023-08-14 16:45:02,562] Trial 70 finished with value: 37083.28371654218 and parameters: {'max_depth': 24, 'min_samples_split': 0.1725843443923643, 'min_samples_leaf': 0.1519162620213308}. Best is trial 51 with value: 29321.083311840473.
[I 2023-08-14 16:45:02,579] 

Number of finished trials: 100
Best trial:
Value:  29301.33141070164
Params: 
    max_depth: 28
    min_samples_split: 0.19508657709627364
    min_samples_leaf: 0.11127419084396953


In [7]:
import pandas as pd
# Load the dataset
data = pd.read_csv("./automobile_part_life_prediction.csv")
data

Unnamed: 0,Date,Part_Type,Car_Model,Mileage,Temperature,Humidity,Usage_Hours,Part_Life
0,19-09-2022,Electronics,Hatchback,119877,21.142292,61.827068,1193,1268.012708
1,17-01-2018,Brakes,Truck,48098,4.456887,70.661045,981,1085.693410
2,27-03-2013,Engine,Sedan,54209,6.097699,2.685836,1151,1097.985648
3,18-09-2013,Engine,Truck,111030,74.792728,9.659201,709,708.579241
4,21-11-2012,Suspension,Sedan,57342,93.622667,75.015403,1802,1451.863833
...,...,...,...,...,...,...,...,...
995,17-07-2013,Brakes,Sedan,59597,3.471163,68.462961,1833,1664.782270
996,08-02-2017,Suspension,SUV,144618,29.116614,2.092660,618,550.358400
997,06-02-2018,Engine,Truck,100587,91.947420,91.476072,856,1022.777301
998,02-03-2013,Transmission,Sedan,89624,44.383940,53.679957,809,849.308150


## 6.2 Hyperopt:
Hyperopt is another popular library for hyperparameter optimization. It uses a tree-structured Parzen Estimator (TPE) algorithm to guide the search in a probabilistic manner. It's well-suited for optimizing complex search spaces.

In [49]:
import pandas as pd
from hyperopt import fmin, tpe, hp, Trials
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Load the dataset
data = pd.read_csv("./automobile_part_life_prediction.csv")

# Preprocessing: Convert Date to datetime
data["Date"] = pd.to_datetime(data["Date"])

# Feature engineering: Extract day, month, and year from Date
data["Year"] = data["Date"].dt.year
data["Month"] = data["Date"].dt.month
data["Day"] = data["Date"].dt.day

# Drop unnecessary columns
data = data.drop("Date", axis=1)

# Encode categorical features
data = pd.get_dummies(data, columns=["Part_Type", "Car_Model"])

# Separate features and target variable
X = data.drop("Part_Life", axis=1)
y = data["Part_Life"]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define hyperparameter search space
space = {
    'max_depth': hp.quniform('max_depth', 1, 32, 1),
    'min_samples_split': hp.uniform('min_samples_split', 0.1, 1.0),
    'min_samples_leaf': hp.uniform('min_samples_leaf', 0.1, 0.5)
}

def objective(params):
    # Initialize Decision Tree Regressor with suggested hyperparameters
    model = DecisionTreeRegressor(max_depth=int(params['max_depth']),
                                   min_samples_split=params['min_samples_split'],
                                   min_samples_leaf=params['min_samples_leaf'],
                                   random_state=42)
    
    # Fit the model to the training data
    model.fit(X_train, y_train)
    
    # Make predictions on the test data
    y_pred = model.predict(X_test)
    
    # Calculate Mean Squared Error
    mse = mean_squared_error(y_test, y_pred)
    return mse

# Initialize Trials
trials = Trials()

# Start optimization
best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100, trials=trials)

# Print optimization results
print("Best Hyperparameters:", best)


  data["Date"] = pd.to_datetime(data["Date"])


100%|██████████████████████████████████████████████| 100/100 [00:00<00:00, 166.59trial/s, best loss: 29301.33141070164]
Best Hyperparameters: {'max_depth': 18.0, 'min_samples_leaf': 0.11206632559313444, 'min_samples_split': 0.15555456752003632}
