
Data Cleaning Questions:


In [None]:
import pandas as pd

In [None]:
# Location of the CSV file that is read with pd.read_csv in the next cell.
data = r"/content/Electric_Vehicle_Population_Data.csv"

In [None]:
# Load the CSV at `data` into the working DataFrame `df`.
df = pd.read_csv(data)

In [None]:
# Per-column count of missing values.
missing = df.isnull().sum()

In [None]:
# Print the per-column missing-value counts held in `missing`.
print(missing)

In [None]:
# Print a summary of the frame: columns, non-null counts and dtypes.
df.info()

In [None]:
# (number of rows, number of columns) of the loaded data.
df.shape

In [None]:
# Treat a range of 0 as missing. NaN is used as the placeholder (rather than
# pd.NA) so the column stays a numeric float column; pd.NA can turn an integer
# column into object dtype, on which numeric reductions such as median() may fail.
df['Electric Range'] = df['Electric Range'].replace(0, float('nan'))

In [None]:
# Preview the first rows after the zero ranges were replaced.
df.head()

In [None]:
# Fill missing ranges with the column median. The result is assigned back to the
# column instead of calling fillna(inplace=True) on the selected column: that
# form is chained assignment, which pandas deprecates and which does not update
# `df` when Copy-on-Write is enabled.
df['Electric Range'] = df['Electric Range'].fillna(df['Electric Range'].median())

In [None]:
# Preview the first rows after the missing ranges were filled.
df.head()

In [None]:
# Number of rows that exactly repeat an earlier row (the first occurrence of
# each repeated row is not counted).
A = df.duplicated(keep="first").sum()

In [None]:
# `A` is the count of duplicated rows, not the total number of rows, so the
# label says so.
print("Number of duplicate rows :", A)

In [None]:
# Remove exact duplicate rows. drop_duplicates has to be *called*: without the
# parentheses `df` would be rebound to the method object instead of a DataFrame.
df = df.drop_duplicates()

In [None]:
import pandas as pd

In [None]:
import hashlib

In [None]:
# Same CSV path as the one assigned to `data` earlier in the notebook.
data=r"/content/Electric_Vehicle_Population_Data.csv"

In [None]:
# Re-read the CSV into `df`. This rebinds `df` to the data as stored in the
# file, so changes made to the earlier `df` (zero-range replacement, median
# fill) are not present in the frame used from here on.
# NOTE(review): confirm the reload is intended; otherwise the cells below should
# keep working on the cleaned frame.
df = pd.read_csv(data)

In [None]:
# Show the type of the freshly loaded object.
type(df)

In [None]:
# List the column names available in the loaded data.
print(df.columns)

In [None]:
def anonymize_vin(vin):
    """Return the SHA-256 hex digest of a VIN value, or None when it is missing.

    The value is converted with str() and encoded to bytes before hashing, so
    equal inputs always produce the same 64-character hexadecimal string.
    """
    if not pd.isna(vin):
        hasher = hashlib.sha256()
        hasher.update(str(vin).encode())
        return hasher.hexdigest()
    # Missing values are passed through as None rather than hashed.
    return None

# Store the hashed value of each 'VIN (1-10)' entry in a new 'ANONYMIZE' column.
# NOTE(review): the original 'VIN (1-10)' column is left in the frame; drop it
# if the raw value must not be kept alongside the hash.
df['ANONYMIZE'] = df['VIN (1-10)'].apply(anonymize_vin)


In [None]:
# Reduce the 'Vehicle Location' text to its bare contents: cast to str, remove
# the literal 'POINT' and any parentheses, then trim surrounding whitespace.
location_text = df['Vehicle Location'].astype(str)
location_text = location_text.str.replace('POINT', '', regex=False)
location_text = location_text.str.replace('[()]', '', regex=True)
df['Vehicle Location'] = location_text.str.strip()


In [None]:
# Split the cleaned location text on runs of whitespace, one output column per token.
a = df['Vehicle Location'].str.split(r'\s+', expand=True)


In [None]:
# The first token is stored as longitude and the second as latitude.
# errors='coerce' turns anything that does not parse as a number into NaN.
df['Longitude'] = pd.to_numeric(a[0], errors='coerce')
df['Latitude'] = pd.to_numeric(a[1], errors='coerce')


In [None]:
# Keep only rows in which both coordinates are present.
df = df.dropna(subset=['Latitude', 'Longitude'])


Data Exploration


In [None]:
# Five most frequent makes and models; value_counts() orders by descending
# count, so the first five entries are the top five.
top_5_makes = df['Make'].value_counts().iloc[:5]
top_5_models = df['Model'].value_counts().iloc[:5]

top_5_makes, top_5_models


In [None]:
# Number of rows per county, and the county with the highest count.
county_distribution = df['County'].value_counts()
top_county = county_distribution.idxmax()

# Show the leading counties together with the top one.
county_distribution.head(), top_county


In [None]:
# Number of rows per model year, ordered by year rather than by count.
ev_by_year = df['Model Year'].value_counts().sort_index()
ev_by_year


In [None]:
# Mean of 'Electric Range' over the rows currently in `df`.
# NOTE(review): `df` was re-read from the CSV after the earlier zero-to-median
# cleaning, so any zero ranges in the file are included in this mean — confirm
# that this is intended.
average_electric_range = df['Electric Range'].mean()
average_electric_range


In [None]:
# Share of each CAFV eligibility value, scaled from a fraction to a percentage.
cafv_percentage = 100 * df['Clean Alternative Fuel Vehicle (CAFV) Eligibility'].value_counts(normalize=True)

cafv_percentage


In [None]:
def _mean_range_by(column):
    """Mean 'Electric Range' for each value of `column`, highest mean first."""
    per_group = df.groupby(column)['Electric Range']
    return per_group.mean().sort_values(ascending=False)


avg_range_by_make = _mean_range_by('Make')
avg_range_by_model = _mean_range_by('Model')

avg_range_by_make, avg_range_by_model


In [None]:
# Ten counties with the most rows.
df['County'].value_counts().head(10)


Data Visualization Questions

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import folium


In [None]:
def _plot_top_counts(counts, title, xlabel, palette, figsize, tick_rotation=None):
    """Draw and show a bar chart for a value_counts() Series.

    counts        -- Series whose index supplies the bar labels and whose values the heights
    title, xlabel -- text for the figure title and the x axis
    palette       -- seaborn palette name used for the bars
    figsize       -- (width, height) passed to plt.figure
    tick_rotation -- rotation for the x tick labels; left untouched when None
    """
    plt.figure(figsize=figsize)
    sns.barplot(x=counts.index, y=counts.values, palette=palette)
    plt.title(title)
    plt.ylabel("Number of EVs")
    plt.xlabel(xlabel)
    if tick_rotation is not None:
        plt.xticks(rotation=tick_rotation)
    plt.show()


# Top 5 Makes
top_makes = df['Make'].value_counts().head(5)
_plot_top_counts(top_makes, "Top 5 EV Makes by Count", "Make", "viridis", (8,5))

# Top 5 Models (labels are rotated)
top_models = df['Model'].value_counts().head(5)
_plot_top_counts(top_models, "Top 5 EV Models by Count", "Model", "magma", (10,5), tick_rotation=45)


In [None]:
# County distribution: one row per county with its number of EVs.
county_counts = df['County'].value_counts().reset_index()
county_counts.columns = ['County', 'EV_Count']

# Single-row heatmap with one cell per county. fmt="d" prints every count as a
# whole number; seaborn's default annotation format keeps only two significant
# digits, which would show larger counts in rounded scientific notation.
plt.figure(figsize=(12,6))
sns.heatmap(county_counts.set_index('County').T, annot=True, fmt="d", cmap="YlGnBu")
plt.title("EV Distribution by County")
plt.show()


In [None]:
# Rows per model year in chronological order, drawn as a line with point markers.
ev_by_year = df['Model Year'].value_counts().sort_index()

plt.figure(figsize=(10,5))
ax = sns.lineplot(x=ev_by_year.index, y=ev_by_year.values, marker="o")
ax.set_title("EV Adoption Trend by Model Year")
ax.set_xlabel("Model Year")
ax.set_ylabel("Number of EVs")
ax.grid(True)
plt.show()


In [None]:
plt.figure(figsize=(10,6))
# A scatter plot needs both an x and a y variable to place its points; with only
# x given no points are visible. 'Model Year' is used for y here.
# NOTE(review): swap in another numeric column if a different comparison was intended.
sns.scatterplot(data=df, x='Electric Range', y='Model Year', hue='Make', palette='tab10', alpha=0.7)

plt.title("Electric Range by Model Year and Make")
plt.xlabel("Electric Range (miles)")
plt.ylabel("Model Year")

plt.legend(title='Make')
plt.show()


In [None]:
# Number of rows per CAFV eligibility value.
cafv_counts = df['Clean Alternative Fuel Vehicle (CAFV) Eligibility'].value_counts()

# One colour per wedge. plt.pie cycles through `colors`, so a list shorter than
# the number of categories would give two wedges the same colour. The first two
# colours are the ones used so far; the rest cover additional categories.
wedge_colors = ['#66b3ff', '#ff9999', '#99ff99', '#ffcc99', '#c2c2f0'][:len(cafv_counts)]

plt.figure(figsize=(6,6))
plt.pie(cafv_counts, labels=cafv_counts.index, autopct='%1.1f%%', startangle=140, colors=wedge_colors)
plt.title("CAFV Eligibility Proportion")
plt.show()


In [None]:
# Centre the map on the mean coordinate of all rows in `df`.
m = folium.Map(location=[df['Latitude'].mean(), df['Longitude'].mean()], zoom_start=6)

# Draw one marker per distinct coordinate pair instead of one per row. Rows that
# share a coordinate would only stack identical markers on top of each other,
# while making the map much larger and slower to build. Rows with a missing
# coordinate are skipped.
marker_points = df[['Latitude', 'Longitude']].dropna().drop_duplicates()

for lat, lon in zip(marker_points['Latitude'], marker_points['Longitude']):
    folium.CircleMarker(
        location=[lat, lon],
        radius=3,
        color='blue',
        fill=True,
        fill_opacity=0.6
    ).add_to(m)

# Display map
m
