In [1]:
# Import packages
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import statsmodels.api as sm
pd.set_option('future.no_silent_downcasting', True)

# Add in text formatting
BOLD = '\033[1m'  # ANSI escape sequence for bold
END = '\033[0m'   # ANSI escape sequence to reset formatting

In [2]:
# Import and clean up dataset
df = pd.read_csv('data/electric_vehicles_spec_2025.csv')
df = df.dropna()
df.head()

Unnamed: 0,brand,model,top_speed_kmh,battery_capacity_kWh,battery_type,number_of_cells,torque_nm,efficiency_wh_per_km,range_km,acceleration_0_100_s,...,towing_capacity_kg,cargo_volume_l,seats,drivetrain,segment,length_mm,width_mm,height_mm,car_body_type,source_url
0,Abarth,500e Convertible,155,37.8,Lithium-ion,192.0,235.0,156,225,7.0,...,0.0,185,4,FWD,B - Compact,3673,1683,1518,Hatchback,https://ev-database.org/car/1904/Abarth-500e-C...
1,Abarth,500e Hatchback,155,37.8,Lithium-ion,192.0,235.0,149,225,7.0,...,0.0,185,4,FWD,B - Compact,3673,1683,1518,Hatchback,https://ev-database.org/car/1903/Abarth-500e-H...
2,Abarth,600e Scorpionissima,200,50.8,Lithium-ion,102.0,345.0,158,280,5.9,...,0.0,360,5,FWD,JB - Compact,4187,1779,1557,SUV,https://ev-database.org/car/3057/Abarth-600e-S...
3,Abarth,600e Turismo,200,50.8,Lithium-ion,102.0,345.0,158,280,6.2,...,0.0,360,5,FWD,JB - Compact,4187,1779,1557,SUV,https://ev-database.org/car/3056/Abarth-600e-T...
6,Alfa,Romeo Junior Elettrica 54 kWh,150,50.8,Lithium-ion,102.0,260.0,128,320,9.0,...,0.0,400,5,FWD,JB - Compact,4173,1781,1532,SUV,https://ev-database.org/car/2184/Alfa-Romeo-Ju...


In [3]:
# Check for missing values
df.isnull().sum()

brand                        0
model                        0
top_speed_kmh                0
battery_capacity_kWh         0
battery_type                 0
number_of_cells              0
torque_nm                    0
efficiency_wh_per_km         0
range_km                     0
acceleration_0_100_s         0
fast_charging_power_kw_dc    0
fast_charge_port             0
towing_capacity_kg           0
cargo_volume_l               0
seats                        0
drivetrain                   0
segment                      0
length_mm                    0
width_mm                     0
height_mm                    0
car_body_type                0
source_url                   0
dtype: int64

Using the dataset from Kaggle about electric vehicles - https://www.kaggle.com/datasets/urvishahir/electric-vehicle-specifications-dataset-2025

Description :

This dataset provides a comprehensive collection of specifications and performance metrics for modern electric vehicles (EVs). It is designed to support researchers, analysts, students, and developers working on data science, machine learning, automotive market research, sustainability studies, or electric vehicle adoption analysis.

Each row in the dataset represents a specific electric vehicle model with a rich set of attributes covering:


        Brand and Model: Manufacturer and specific nameplate of the EV.
        Car Body Type: Classification such as hatchback, SUV, sedan, etc.
        Segment: Vehicle segment (e.g., compact, midsize, executive).

        Battery Capacity (kWh): The gross energy capacity of the battery.
        Number of Cells and Battery Type: Technical battery information, where available.
        Efficiency (Wh/km): Power consumption rate of the vehicle.
        Range (km): Estimated driving range on a full charge.

        Fast Charging Power (kW): Maximum supported DC fast-charging power.
        Fast Charge Port Type: Connector standard (e.g., CCS, CHAdeMO).

        Top Speed (km/h): Maximum speed of the vehicle.
        0–100 km/h Acceleration (s): Time to reach 100 km/h from a standstill.
        Torque (Nm): Maximum torque output, where available.

        Towing Capacity (kg): Ability to tow loads, provided where applicable.
        Cargo Volume (L): Luggage space, sometimes approximate or expressed in alternative units.
        Seats: Total seating capacity.
        
        Length, Width, Height (mm): Physical footprint of the vehicle.
        Drivetrain: Powertrain configuration (e.g., AWD, RWD, FWD).
        Source URL: Reference link for each car (used in scraping).

## Question 1: Is there a relationship between the technical qualities? 
Cluster analysis: Are there natural clusters of EVs (e.g., economy commuter, high-performance, luxury long-range) based on specs?
- Unsupervised
- No latent variable
- No clear truth variable
- Perform clustering
- Perform other unsupervised modeling

In [4]:
# Could cluster car brands together based on technical specs: battery size, top speeds, number of cells, etc...
scaler = StandardScaler()
technical_features = df[['top_speed_kmh', 'battery_capacity_kWh', 'number_of_cells', 
                      'torque_nm', 'efficiency_wh_per_km', 'range_km', 'acceleration_0_100_s']].values


technical_features_scaled = scaler.fit_transform(technical_features)

manu = df[['brand']].values # Grab brands to group by

In [5]:
# Look for best cluster parameters based on AIC and/or BIC
def best_cluster(X):
    # Initialize parameters
    cluster_range = range(2, 21)
    aic_history = []
    bic_history = []
    models = []
    
    for num_clusters in cluster_range:
        gmm = GaussianMixture(n_components = num_clusters, n_init = 10)
        gmm.fit(X)
    
        aic = gmm.aic(X)
        bic = gmm.bic(X)
    
        aic_history.append(aic) # Append AIC scores
        bic_history.append(bic) # Append BIC scores
        models.append(gmm) # Append models used
    
        print(f"Number of clusters = {num_clusters} with AIC = {aic:.4f}, BIC = {bic:.4f}")
    
    # Find best number of clusters
    aic_min_index = np.argmin(aic_history)
    bic_min_index = np.argmin(bic_history)
    best_aic_cluster = cluster_range[aic_min_index]
    best_bic_cluster = cluster_range[bic_min_index]

    # Depending on the run, sometimes AIC and BIC don't agree
    best_aic_model = models[best_aic_cluster - 2] # minus 2 because we started with range of 2
    best_bic_model = models[best_bic_cluster - 2]
    
    print(f"\nWith a minimum value of {min(aic_history):.4f} AIC, the optimal number of clusters based on AIC is {best_aic_cluster}")
    print(f"With a minimum value of {min(bic_history):.4f} BIC, the optimal number of clusters based on BIC is {best_bic_cluster}")
    return best_aic_model, best_bic_model, best_aic_cluster, best_bic_cluster

# Function to print out clusters and their respective countries through specified evaluator
def brand_clusters(cluster, best_cluster_value, df, evaluator):
    print(BOLD + f'\nGiven a total of {best_cluster_value} clusters through {evaluator}' + END)
    for cluster_number in range(best_cluster_value): # Iterate through each cluster
        brand_in_clusters = df[df[cluster] == cluster_number]['brand'].unique()
        print(f"\nCluster {cluster_number + 1} ({len(brand_in_clusters)} brands):")
        print(', '.join(brand_in_clusters)) # https://stackoverflow.com/questions/22399014/print-elements-in-an-array-with-a-delimiter

In [6]:
# Find best cluster, then group and print out the brands
q1_copy = df.copy() # Create a copy of dataframe for cluster labels
best_aic_model, best_bic_model, best_aic_cluster, best_bic_cluster = best_cluster(technical_features_scaled)

# Get cluster labels
aic_cluster_labels = best_aic_model.predict(technical_features_scaled)
bic_cluster_labels = best_bic_model.predict(technical_features_scaled)
q1_copy['AIC Cluster'] = aic_cluster_labels
q1_copy['BIC Cluster'] = bic_cluster_labels

# Print out brands clustered together, based on AIC or BIC
brand_clusters('AIC Cluster', best_aic_cluster, q1_copy, 'AIC')
brand_clusters('BIC Cluster', best_bic_cluster, q1_copy, 'BIC')

Number of clusters = 2 with AIC = 2589.7583, BIC = 2843.6507
Number of clusters = 3 with AIC = 1842.5340, BIC = 2225.1606
Number of clusters = 4 with AIC = 1459.8186, BIC = 1971.1793
Number of clusters = 5 with AIC = 1180.6825, BIC = 1820.7774
Number of clusters = 6 with AIC = 1202.5951, BIC = 1971.4242
Number of clusters = 7 with AIC = 768.1024, BIC = 1665.6656
Number of clusters = 8 with AIC = 569.2110, BIC = 1595.5084
Number of clusters = 9 with AIC = 466.8440, BIC = 1621.8756
Number of clusters = 10 with AIC = 437.0535, BIC = 1720.8192
Number of clusters = 11 with AIC = -22.2679, BIC = 1390.2320
Number of clusters = 12 with AIC = -24.3699, BIC = 1516.8642
Number of clusters = 13 with AIC = -181.7520, BIC = 1488.2162
Number of clusters = 14 with AIC = -616.1297, BIC = 1182.5727
Number of clusters = 15 with AIC = -648.0966, BIC = 1279.3400
Number of clusters = 16 with AIC = -717.7473, BIC = 1338.4235
Number of clusters = 17 with AIC = -940.1476, BIC = 1244.7573
Number of clusters = 1

### Q1 Temporary Analysis
BIC tends to penalizes model complexity more than AIC. It tends to select simpler models with fewer clusters; therefore BIC is generally preferred to prevent overfitting, and to compress down datasets to have less complexity.

AIC tends to penalizes complexity less, so it will often select models with more clusters. AIC is better when the goal is to predict or when we want a model that captures more nuances in the data.

With our current EM algorithm implementation, we can see this reflected in the number of clusters chosen either via AIC or BIC. AIC consistently chooses more clusters than BIC through multiple runs. Due to brands having different models of cars, we will see brands repeated throughout the clusters. We can still extrapolate information from the groupings (are perceived high-end brands grouped together? etc...)

It's also interesting to see how the clusters are formed with closely related car brands. For example, Polestar is the sister company or EV subsidiary of Volvo. However, they are frequently grouped separately. Is this because Volvo is considered the lower end EV models?

## Question 2: 
- What other factors are closely related to efficiency? What makes an electric car the most efficient?

In [7]:
# Build a function to choose the best X values
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html
def best_parameters(X, y):
    model = LinearRegression()
    rfecv = RFECV(estimator = model, step = 1, cv = KFold(5), 
                  scoring = 'r2', min_features_to_select = 1)
    rfecv.fit(X, y)

    # Results
    print(f"Optimal number of features: {rfecv.n_features_}")
    print(f"Best cross-validation score: {rfecv.cv_results_['mean_test_score'].max()}")
    
    # Get the selected features
    selected_features = X.columns[rfecv.support_]
    print(f"\nSelected features:")
    print(', '.join(selected_features.tolist()))
    
    return selected_features

In [8]:
# Grab features
q2_X = df[['top_speed_kmh', 'battery_capacity_kWh', 'number_of_cells', 
           'torque_nm', 'range_km', 'length_mm', 'width_mm',
           'height_mm', 'acceleration_0_100_s', 'seats', 'towing_capacity_kg', 
           'fast_charging_power_kw_dc']]
efficiency_y = df[['efficiency_wh_per_km']]

# Choose the best features to use for predictive modeling
q2_selected_features = best_parameters(q2_X, efficiency_y)

Optimal number of features: 12
Best cross-validation score: 0.6019965612792203

Selected features:
top_speed_kmh, battery_capacity_kWh, number_of_cells, torque_nm, range_km, length_mm, width_mm, height_mm, acceleration_0_100_s, seats, towing_capacity_kg, fast_charging_power_kw_dc


In [9]:
# Create X and y with respective features
q2_X = df[q2_selected_features]
q2_X = sm.add_constant(q2_X)

# Run statsmodel linear regression model
print(sm.OLS(efficiency_y, q2_X).fit().summary())

                             OLS Regression Results                             
Dep. Variable:     efficiency_wh_per_km   R-squared:                       0.834
Model:                              OLS   Adj. R-squared:                  0.826
Method:                   Least Squares   F-statistic:                     104.9
Date:                  Thu, 06 Nov 2025   Prob (F-statistic):           1.88e-90
Time:                          15:25:37   Log-Likelihood:                -1046.7
No. Observations:                   264   AIC:                             2119.
Df Residuals:                       251   BIC:                             2166.
Df Model:                            12                                         
Covariance Type:              nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
co

### Q2 Temporary Analysis
Interestingly, even with a high colinearilty shown through the high R squared value, not many of the chosen parameters are statistically significant. If we take a look at the chassis specifications, the length and height are statistically significant while the width is not. Additionally, only battery capacity, number of cells in the battery, the range of the car, and towing capacity in kilograms can help predict the efficiency of the car.

Therefore, parts of the chassis dimensions, battery specifications, range of the vehicle, and overall horsepower of the engine (how much weight it can pull) are the key components when considering effieciency.

Another interesting analysis we could do is to incorporate drive train (using numbers to deliminate what type of drive train). Traditionally, AWD is considered to be less efficient than FWD or RWD in gas cars. Would this also be the case with EVs?

In [10]:
# Question 2 cont.
# Convert unique types of drivetrains to numerical values to run logistic regression or decision tree
# Current drive trains include: FWD, RWD, AWD (Front Wheel Drive, Rear Wheel Drive, All Wheel Drive)

# Convert FWD - 1, RWD - 2, AWD - 3
df_q2 = df.copy()
df_q2['drivetrain'] = df_q2['drivetrain'].replace({'FWD': 1, 'RWD': 2, 'AWD': 3})

q2_feature_list = q2_selected_features.tolist()
q2_feature_list.append('drivetrain')

q2_X = df_q2[q2_feature_list]
q2_X['drivetrain'] = q2_X['drivetrain'].astype(int)
q2_X = sm.add_constant(q2_X) # New dataframe with drive train included


print(sm.OLS(efficiency_y, q2_X).fit().summary())

                             OLS Regression Results                             
Dep. Variable:     efficiency_wh_per_km   R-squared:                       0.834
Model:                              OLS   Adj. R-squared:                  0.825
Method:                   Least Squares   F-statistic:                     96.48
Date:                  Thu, 06 Nov 2025   Prob (F-statistic):           1.92e-89
Time:                          15:25:37   Log-Likelihood:                -1046.7
No. Observations:                   264   AIC:                             2121.
Df Residuals:                       250   BIC:                             2172.
Df Model:                            13                                         
Covariance Type:              nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
co

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  q2_X['drivetrain'] = q2_X['drivetrain'].astype(int)


### Q2 Temporary Analysis cont.
Interestingly, there is a low statistical significance when including drive train as a parameter when trying to predict the efficiency value of the vehicle. Perhaps the drivetrain does not make as big of a difference in EVs compared to gas cars.

Research opportunities:
- https://www.subaruofauburn.com/does-all-wheel-drive-hurt-fuel-economy/
- https://www.louscarcare.com/blog/what-is-the-difference-between-awd-and-4wd

## Question 3:

Predictive Modeling

Build a predictive model: Given variables like battery_capacity_kWh, body type, segment, drivetrain, brand, dimensions — can you predict range_km?

Or: Predict fast_charging_power_kW given other specs (battery size, brand, segment) — which vehicles support high charging?

Feature engineering: Create derived metrics like “range per kWh”, “Wh per km per kg volume”, etc.— which features are most predictive of being in “premium” vs “economy” class?

In [11]:
# Predictive modeling function, can we predict range based on technical specs?    
q3_X = df[['top_speed_kmh', 'battery_capacity_kWh', 'number_of_cells', 
                  'torque_nm', 'efficiency_wh_per_km', 'length_mm', 'width_mm',
                  'height_mm', 'acceleration_0_100_s', 'seats', 'towing_capacity_kg', 
                 'fast_charging_power_kw_dc']]


range_y = df[['range_km']]
range_y_scaled = scaler.fit_transform(range_y)

# Choose the best features to use for predictive modeling
selected_features = best_parameters(q3_X, range_y)

Optimal number of features: 11
Best cross-validation score: 0.9274497315691977

Selected features:
top_speed_kmh, battery_capacity_kWh, number_of_cells, torque_nm, efficiency_wh_per_km, length_mm, width_mm, height_mm, acceleration_0_100_s, seats, fast_charging_power_kw_dc


In [12]:
# Create X and y with respective features
q3_X = df[selected_features]
q3_X = sm.add_constant(q3_X)

# Run statsmodel linear regression model
print(sm.OLS(range_y, q3_X).fit().summary())

                            OLS Regression Results                            
Dep. Variable:               range_km   R-squared:                       0.957
Model:                            OLS   Adj. R-squared:                  0.955
Method:                 Least Squares   F-statistic:                     510.2
Date:                Thu, 06 Nov 2025   Prob (F-statistic):          2.97e-165
Time:                        15:25:37   Log-Likelihood:                -1181.1
No. Observations:                 264   AIC:                             2386.
Df Residuals:                     252   BIC:                             2429.
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                                coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------------
const                 

### Q3 Temporary Analysis
With technical specs such as the size of the chassis, battery capacity, number of cells, etc... most seem to be statistically significant (p-value less than 0.05). The only two features that do not hold as much statistical significance are the top speed of the car and the fast charging capacity. Overall with a high R squared value, the linear regression model performs really well with estimating the range of the cars measured in kilometers. If customers are looking at the market in the future and are looking for high performing, efficient EVs, predictive analysis seems to be very performant.

In [13]:
# Extra modeling idea - use logistic regression with classification for "premium" vs "economy" brands
# Group brands together based on a few parameters, only have 2 groups and see if there is a high and low class separation