In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score


In [3]:
# Load the dataset
data = pd.read_csv('data.csv')

# Display basic information about the dataset
data.info()
data.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92000 entries, 0 to 91999
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Vehicle_ID             92000 non-null  int64  
 1   Make_and_Model         92000 non-null  object 
 2   Year_of_Manufacture    92000 non-null  int64  
 3   Vehicle_Type           92000 non-null  object 
 4   Usage_Hours            92000 non-null  int64  
 5   Route_Info             92000 non-null  object 
 6   Load_Capacity          92000 non-null  float64
 7   Actual_Load            92000 non-null  float64
 8   Last_Maintenance_Date  92000 non-null  object 
 9   Maintenance_Type       92000 non-null  object 
 10  Maintenance_Cost       92000 non-null  float64
 11  Engine_Temperature     92000 non-null  float64
 12  Tire_Pressure          92000 non-null  float64
 13  Fuel_Consumption       92000 non-null  float64
 14  Battery_Status         92000 non-null  float64
 15  Vi

Unnamed: 0,Vehicle_ID,Make_and_Model,Year_of_Manufacture,Vehicle_Type,Usage_Hours,Route_Info,Load_Capacity,Actual_Load,Last_Maintenance_Date,Maintenance_Type,...,Brake_Condition,Failure_History,Anomalies_Detected,Predictive_Score,Maintenance_Required,Weather_Conditions,Road_Conditions,Delivery_Times,Downtime_Maintenance,Impact_on_Efficiency
0,1,Ford F-150,2022,Truck,530,Rural,7.534549,9.004247,2023-04-09,Oil Change,...,Good,1,0,0.171873,1,Clear,Highway,30.0,0.093585,0.150063
1,2,Volvo FH,2015,Van,10679,Rural,7.671728,6.111785,2023-07-20,Tire Rotation,...,Fair,1,0,0.24667,1,Clear,Rural,30.0,3.361201,0.343017
2,3,Chevy Silverado,2022,Van,4181,Rural,2.901159,3.006055,2023-03-17,Oil Change,...,Good,1,1,0.455236,1,Clear,Highway,48.627823,1.3653,0.1
3,4,Chevy Silverado,2011,Truck,2974,Urban,15.893347,18.82529,2024-05-01,Tire Rotation,...,Good,0,1,0.060208,1,Clear,Highway,30.0,0.0,0.135749
4,5,Ford F-150,2014,Van,2539,Rural,60.66832,65.605463,2023-11-15,Tire Rotation,...,Good,1,1,0.264929,1,Rainy,Urban,300.0,6.608704,0.395193


In [4]:
# Fill missing categorical values with the mode
categorical_features = ['Make_and_Model', 'Vehicle_Type', 'Route_Info', 'Weather_Conditions', 'Road_Conditions']
for feature in categorical_features:
    data[feature] = data[feature].fillna(data[feature].mode()[0])

# Fill missing numerical values with the mean
numerical_features = ['Usage_Hours', 'Load_Capacity', 'Actual_Load', 'Engine_Temperature', 'Tire_Pressure',
                      'Fuel_Consumption', 'Battery_Status', 'Vibration_Levels', 'Anomalies_Detected',
                      'Predictive_Score', 'Downtime_Maintenance']
for feature in numerical_features:
    data[feature] = data[feature].fillna(data[feature].mean())


In [5]:
# Convert 'Last_Maintenance_Date' to datetime format
data['Last_Maintenance_Date'] = pd.to_datetime(data['Last_Maintenance_Date'], errors='coerce')

# Derived Features
data['Time_Since_Last_Maintenance'] = (pd.to_datetime('today') - data['Last_Maintenance_Date']).dt.days
data['Vehicle_Age'] = pd.to_datetime('today').year - data['Year_of_Manufacture']
data['Load_Utilization'] = (data['Actual_Load'] / data['Load_Capacity']) * 100


In [6]:
# Encode categorical features using Label Encoding
label_encoders = {}
for feature in categorical_features:
    label_encoders[feature] = LabelEncoder()
    data[feature] = label_encoders[feature].fit_transform(data[feature])


In [7]:
# Features to drop
columns_to_drop = ['Vehicle_ID', 'Year_of_Manufacture', 'Last_Maintenance_Date', 'Maintenance_Type',
                   'Maintenance_Cost', 'Brake_Condition', 'Oil_Quality', 'Failure_History',
                   'Delivery_Times', 'Downtime_Maintenance', 'Impact_on_Efficiency', 'Engine_Temperature','Vehicle_Type','Battery_Status']

# Drop irrelevant features before splitting the data
X = data.drop(columns=['Maintenance_Required'] + columns_to_drop)

print(X.columns)

# Target variable remains the same
y = data['Maintenance_Required']

# Split the dataset into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


Index(['Make_and_Model', 'Usage_Hours', 'Route_Info', 'Load_Capacity',
       'Actual_Load', 'Tire_Pressure', 'Fuel_Consumption', 'Vibration_Levels',
       'Anomalies_Detected', 'Predictive_Score', 'Weather_Conditions',
       'Road_Conditions', 'Time_Since_Last_Maintenance', 'Vehicle_Age',
       'Load_Utilization'],
      dtype='object')


In [8]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(sampling_strategy=0.7,random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)



In [9]:
model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
model.fit(X_train_res, y_train_res)

# Initialize and train the Random Forest Classifier
# model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
# model.fit(X_train, y_train)

In [10]:
# from xgboost import XGBClassifier

# scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1])

# model = XGBClassifier(n_estimators=100,
#     learning_rate=0.1,
#     max_depth=6,
#     # scale_pos_weight=scale_pos_weight,  # Adjusts class weights
#     random_state=42)
# model.fit(X_train, y_train)


In [11]:
# Make predictions on the test set
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]

# Evaluate the model
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nROC-AUC Score:", roc_auc_score(y_test, y_prob))


Confusion Matrix:
 [[ 2172  2139]
 [ 2964 11125]]

Classification Report:
               precision    recall  f1-score   support

           0       0.42      0.50      0.46      4311
           1       0.84      0.79      0.81     14089

    accuracy                           0.72     18400
   macro avg       0.63      0.65      0.64     18400
weighted avg       0.74      0.72      0.73     18400


ROC-AUC Score: 0.7907111333641841


In [12]:
# Feature importance analysis
importances = model.feature_importances_
feature_names = X.columns
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
importance_df = importance_df.sort_values(by='Importance', ascending=False)

print("\nTop Features Contributing to Maintenance Prediction:\n", importance_df)



Top Features Contributing to Maintenance Prediction:
                         Feature  Importance
8            Anomalies_Detected    0.393884
13                  Vehicle_Age    0.068020
7              Vibration_Levels    0.059905
1                   Usage_Hours    0.059236
12  Time_Since_Last_Maintenance    0.058208
9              Predictive_Score    0.058099
14             Load_Utilization    0.057980
4                   Actual_Load    0.054368
3                 Load_Capacity    0.054118
6              Fuel_Consumption    0.038284
5                 Tire_Pressure    0.032191
0                Make_and_Model    0.022082
11              Road_Conditions    0.016797
2                    Route_Info    0.015959
10           Weather_Conditions    0.010870
