In [1]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [2]:
# Column names routed to the categorical branch of the preprocessing step.
cat = [
    # vehicle identity
    "manufacturer", "model", "type",
    # drivetrain / appearance
    "drive_type", "fuel_type", "color",
    # Boolean/Binary indicator, handled as a category
    "fast_charging",
    # market location
    "country", "city",
]

# Column names routed to the numeric branch of the preprocessing step.
num = [
    # battery and charging
    "battery_kwh", "range_km", "charging_time_hr",
    # model year and capacity
    "release_year", "seats",
    # performance
    "acceleration_0_100_kmph", "top_speed_kmph",
    # ownership / practicality
    "warranty_years", "cargo_space_liters", "safety_rating",
]

In [3]:
# Read the dataset from the CSV file stored next to the notebook directory.
df = pd.read_csv("../data/cleaned_dataset.csv")



In [4]:
# Experiment settings for the hold-out split and cross-validation.
TEST_SIZE = 0.2
RANDOM_STATE = 42
CV_FOLDS = 5

# Features: every column except the label and two explicitly excluded columns.
# NOTE(review): 'price_usd' and 'efficiency_score' are presumably excluded to
# avoid leaking the target into the features -- confirm with the data owner.
X = df.drop(columns=['target_high_efficiency', 'price_usd', 'efficiency_score'])
y = df['target_high_efficiency']

print("Classification Task - Predicting High Efficiency EVs")
print(f"Dataset shape: {X.shape}")
print(f"Class distribution:\n{y.value_counts()}")

# NOTE(review): the split is not stratified; consider passing stratify=y if the
# class balance printed above ever becomes skewed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# One-hot encode the categorical columns and standardise the numeric ones.
# handle_unknown='ignore' keeps predict() from failing on categories that were
# not present in the training split.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat),
        ('num', StandardScaler(), num),
    ]
)

# Preprocessing and classifier live in one Pipeline so the fitted transformers
# are persisted and re-applied together with the model.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(
        max_iter=1000,
        solver='lbfgs',
        n_jobs=-1
    ))
])

# The estimator is a LogisticRegression; the message now says so
# (it previously announced a Random Forest).
print("\nTraining Logistic Regression Classifier...")
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# cross_val_score clones and refits the pipeline on each fold of the training
# split, so the fitted `model` above is left untouched.
cv_scores = cross_val_score(model, X_train, y_train, cv=CV_FOLDS, scoring='accuracy')
print(f"\nCross-Validation Scores: {cv_scores}")
print(f"Mean CV Accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

# Hold-out evaluation on the test split.
accuracy = accuracy_score(y_test, y_pred)
print(f"\nTest Set Accuracy: {accuracy:.4f}")
print(f"\nClassification Report:\n{classification_report(y_test, y_pred)}")
print(f"\nConfusion Matrix:\n{confusion_matrix(y_test, y_pred)}")


Classification Task - Predicting High Efficiency EVs
Dataset shape: (248730, 19)
Class distribution:
target_high_efficiency
0    124403
1    124327
Name: count, dtype: int64

Training Random Forest Classifier...

Cross-Validation Scores: [0.80631706 0.80591502 0.80284946 0.80558836 0.80606091]
Mean CV Accuracy: 0.8053 (+/- 0.0013)

Test Set Accuracy: 0.8048

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.81      0.80     24737
           1       0.81      0.80      0.81     25009

    accuracy                           0.80     49746
   macro avg       0.80      0.80      0.80     49746
weighted avg       0.80      0.80      0.80     49746


Confusion Matrix:
[[19924  4813]
 [ 4895 20114]]


In [5]:
# Persist the fitted pipeline (preprocessing + classifier) to disk.
MODEL_PATH = Path("../model/ev_efficiency_classifier.pkl")

# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing.
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, MODEL_PATH)

print("✅ Model saved as ev_efficiency_classifier.pkl")


✅ Model saved as ev_efficiency_classifier.pkl


In [6]:
# Reload the persisted pipeline and score a single hand-written example.
# joblib.load unpickles the file, which can execute arbitrary code: only load
# model files that come from a trusted source.
loaded_model = joblib.load("../model/ev_efficiency_classifier.pkl")

print("✅ Model loaded successfully")

# One-row frame carrying every categorical and numeric column the pipeline's
# ColumnTransformer selects by name.
new_ev = pd.DataFrame([{
    # categorical features
    "manufacturer": "Tesla",
    "model": "Model 3",
    "type": "Sedan",
    "drive_type": "RWD",
    "fuel_type": "Electric",
    "color": "White",
    "fast_charging": 1,
    "country": "USA",
    "city": "San Francisco",

    # numeric features
    "battery_kwh": 60,
    "range_km": 450,
    "charging_time_hr": 1.2,
    "release_year": 2023,
    "seats": 5,
    "acceleration_0_100_kmph": 5.8,
    "top_speed_kmph": 225,
    "warranty_years": 8,
    "cargo_space_liters": 425,
    "safety_rating": 5
}])

prediction = loaded_model.predict(new_ev)
print("Predicted class:", prediction[0])

# predict_proba columns follow the classifier's sorted class labels, so with
# 0/1 labels the order is [class 0, class 1].
probability = loaded_model.predict_proba(new_ev)
print("Probability [Low, High]:", probability[0])

label = "High Efficiency EV 🚗⚡" if prediction[0] == 1 else "Low Efficiency EV 🚙"
# Confidence is the probability assigned to the most likely class.
confidence = max(probability[0]) * 100

print(f"{label} (Confidence: {confidence:.2f}%)")


✅ Model loaded successfully
Predicted class: 1
Probability [Low, High]: [0.00109198 0.99890802]
High Efficiency EV 🚗⚡ (Confidence: 99.89%)
