In [None]:
# EV Data Wrangling

**Dataset source:** https://www.kaggle.com/datasets/urvishahir/electric-vehicle-specifications-dataset-2025  

This script explores the electric vehicle dataset step by step,  
with commentary to show the reasoning process behind each check.

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# --- Read the raw EV specification CSV into a DataFrame ---
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# as written on a machine where this exact location exists.
csv_path = Path(
    r"C:\Users\teren\OneDrive\Documents\Education\05._Continued Education\01._SpringBoard Data Science Bootcamp\03._Capstone Project\EV_Project\02._Data\electric_vehicles_spec_2025.csv.csv"
)
df = pd.read_csv(csv_path)

# First look at what was loaded: dimensions, then the leading rows.
print("Shape:", df.shape)
print(df.head())

In [None]:
## Model Column Sanity
- One row has a blank `model` value, leaving 477 unique values from 478 total.  
- That blank row is the lone “Firefly” entry, a brand that appears only once — rare entries like this risk slowing analysis and can be dropped.

# A row is removed only when BOTH conditions hold:
#   1. the brand, trimmed and lower-cased, is "firefly"
#   2. the model is missing (NaN) or empty after trimming whitespace
brand_is_firefly = df['brand'].astype(str).str.strip().str.lower().eq("firefly")
model_is_blank = df['model'].isna() | df['model'].astype(str).str.strip().eq("")
mask = brand_is_firefly & model_is_blank

# Keep the pre-filter row count so the number of dropped rows can be reported.
before = len(df)
df = df.loc[~mask].copy()
print(f"Dropped Firefly rows with no model: {before - len(df)}")

In [None]:
## Top Speed
- Range goes from 125 km/h to 325 km/h.  
- That 325 is very high, but it belongs to a Maserati GranTurismo Folgore.  
- Cross-checking shows this is realistic, so it should be kept.

# Summarise the top-speed column, then show which row(s) hold the maximum
# so the extreme value can be checked against the actual vehicle.
top_speed = df['top_speed_kmh']
fastest = top_speed.max()

print(top_speed.describe())
print("Highest top speed:", fastest)
print(df.loc[top_speed.eq(fastest), ['brand', 'model', 'top_speed_kmh']])

In [None]:
## Range
- Max range = 685 km. Manufacturer sites list slightly lower values, but still within reason.  
- The top 10 models show gradual decreases, so we will keep them all.

# Summary statistics for range, followed by the ten longest-range rows
# to see whether the top values fall off gradually or contain a lone outlier.
range_summary = df['range_km'].describe()
print(range_summary)

longest_range = df.nlargest(10, 'range_km')
print(longest_range[['brand', 'model', 'range_km']])

In [None]:
## Battery, Fast Charging, Efficiency
- **battery_capacity_kWh**: no missing values  
- **fast_charging_power_kw_dc**: 0.21% missing (Renault 5 E-Tech 40kWh). This lone row can be dropped.  
- **efficiency_wh_per_km**: 0% missing

# Report the count and percentage of missing values for each column of interest.
columns_to_check = (
    'battery_capacity_kWh',
    'fast_charging_power_kw_dc',
    'efficiency_wh_per_km',
)
for col in columns_to_check:
    is_missing = df[col].isna()  # one null mask reused for both count and share
    missing_count = is_missing.sum()
    missing_pct = is_missing.mean() * 100
    print(col, 'missing:', missing_count, f"({missing_pct:.2f}%)")

In [None]:
## Drop Renault 5 E-Tech 40kWh Row
- This row is the only one with missing fast-charging info (0.21%).  
- Instead of guessing, drop it.

# Match on trimmed, lower-cased brand and model so stray whitespace or
# capitalisation differences do not prevent the row from being found.
brand_norm = df['brand'].astype(str).str.strip().str.lower()
model_norm = df['model'].astype(str).str.strip().str.lower()
mask = brand_norm.eq("renault") & model_norm.eq("5 e-tech 40kwh 95hp")

# Keep the pre-filter row count so the number of dropped rows can be reported.
before = len(df)
df = df.loc[~mask].copy()
print(f"Dropped Renault 5 E-Tech 40kWh 95hp rows: {before - len(df)}")

In [None]:
## Towing Capacity
- Missing ~5.44% — this is significant, meaning we may need imputation or exclusion, depending on how central it is to the modeling.  
- NOTE: Since towing capacity has gaps, we can use **cargo_volume_l** as a similar benchmark for vehicle utility.

# Share of rows (as a percentage) with no towing capacity recorded.
towing_missing_pct = df['towing_capacity_kg'].isna().mean() * 100
print("Towing missing %:", towing_missing_pct)

In [None]:
## Cargo Volume
- Only 0.21% missing, but 4 rows are non-numeric (e.g., "10 Banana Boxes", "31 Banana Boxes").  
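- These are clearly errors and best removed — no guessing. The blank entry is dropped along with them, since both become NaN once the column is coerced to numeric.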

# Coerce cargo volume to numbers; anything that cannot be parsed becomes NaN,
# which puts non-numeric text and blank cells in the same bucket.
cargo_numeric = pd.to_numeric(df['cargo_volume_l'], errors='coerce')
has_numeric_cargo = cargo_numeric.notna()

# Keep only rows with a usable value, then store the numeric version of the
# column (assignment aligns on the index, so values stay with their rows).
df = df.loc[has_numeric_cargo].copy()
df['cargo_volume_l'] = cargo_numeric.loc[has_numeric_cargo]
print("Non-numeric and blank cargo_volume_l rows dropped. New shape:", df.shape)

In [None]:
## Width (mm)
- All values are tightly clustered between ~1400–2200 mm, with no missing entries.  
- This is a solid variable for analysis.

# Summary statistics for vehicle width (count, mean, spread, min/max).
width_summary = df['width_mm'].describe()
print(width_summary)

In [None]:
## Final Note
After the drops above, the only columns that still have incomplete information (blank cells) are **number_of_cells** and **towing_capacity_kg**.  
They will remain in the dataset for now but may be removed during further exploratory data analysis.

In [None]:
# Write the cleaned frame next to the input file as "<name>_CLEAN.csv".
#
# The input file name carries a doubled extension (".csv.csv"), so its stem
# still ends in ".csv". Strip only trailing ".csv" suffixes from the stem:
# a plain str.replace would also delete ".csv" if it appeared in the middle
# of the name and silently change the output file name.
clean_stem = csv_path.stem
while clean_stem.endswith('.csv'):
    clean_stem = clean_stem[:-len('.csv')]

out_path = csv_path.with_name(clean_stem + '_CLEAN.csv')
df.to_csv(out_path, index=False)  # index=False: do not write the row index as a column

print("Saved cleaned data to:", out_path)