In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
# Create directories for saving plots and data.
# exist_ok=True makes each call a no-op when the directory already exists, so
# no separate os.path.exists() check is needed and there is no window between
# "check" and "create" in which another process could create the directory.
# If a path exists but is not a directory, os.makedirs raises FileExistsError.
for directory in ("plots", "../data"):
    os.makedirs(directory, exist_ok=True)


In [3]:
# Load the raw dataset into `df`. If the CSV is missing, re-raise
# FileNotFoundError with a message that names the expected location.
try:
    df = pd.read_csv("../data/vehicle_maintenance_data.csv")
except FileNotFoundError:
    not_found_msg = "The file '../data/vehicle_maintenance_data.csv' was not found. Please ensure it exists in the correct directory."
    raise FileNotFoundError(not_found_msg)

In [4]:
# 1. Data Quality Check
# Compute the per-column null counts and the duplicated-row count first, then
# report them together with the frame's shape.
missing_per_column = df.isnull().sum()
duplicate_row_count = df.duplicated().sum()

print("Dataset Shape:", df.shape)
print("\nMissing Values:\n", missing_per_column)
print("\nDuplicates:", duplicate_row_count)

Dataset Shape: (50000, 20)

Missing Values:
 Vehicle_Model           0
Mileage                 0
Maintenance_History     0
Reported_Issues         0
Vehicle_Age             0
Fuel_Type               0
Transmission_Type       0
Engine_Size             0
Odometer_Reading        0
Last_Service_Date       0
Warranty_Expiry_Date    0
Owner_Type              0
Insurance_Premium       0
Service_History         0
Accident_History        0
Fuel_Efficiency         0
Tire_Condition          0
Brake_Condition         0
Battery_Status          0
Need_Maintenance        0
dtype: int64

Duplicates: 0


In [5]:
# 2. Convert Categorical Columns to Numerical
# Ordinal encodings: each label's integer code is its position in the tuple.
# NOTE(review): maintenance_mapping and battery_mapping appear to count up
# towards the "better" label, while condition_mapping counts up from "New" to
# "Worn Out" -- confirm the mixed direction is intended before modelling.
maintenance_mapping = {label: code for code, label in enumerate(("Poor", "Average", "Good"))}
condition_mapping = {label: code for code, label in enumerate(("New", "Good", "Worn Out"))}
battery_mapping = {label: code for code, label in enumerate(("Weak", "Good", "New"))}

In [6]:
# Validate, then encode, the ordinal columns.
# NOTE: once this cell has run, the columns hold integer codes, so running it
# a second time on the same `df` fails validation with ValueError.
ordinal_encodings = {
    "Maintenance_History": maintenance_mapping,
    "Tire_Condition": condition_mapping,
    "Brake_Condition": condition_mapping,
    "Battery_Status": battery_mapping,
}

# Pass 1: every column must exist and contain only labels its mapping knows.
# All columns are checked before any of them is modified.
for col, mapping in ordinal_encodings.items():
    if col not in df.columns:
        raise KeyError(f"Column '{col}' not found in the dataset.")
    is_known_label = df[col].isin(mapping.keys())
    unexpected = df.loc[~is_known_label, col]
    if len(unexpected) > 0:
        raise ValueError(f"Column '{col}' contains invalid values: {unexpected.unique()}")

# Pass 2: replace each label with its integer code.
for col, mapping in ordinal_encodings.items():
    df[col] = df[col].map(mapping)

In [7]:
# One-hot encode the other categorical variables. drop_first=False keeps an
# indicator column for every category level.
one_hot_cols = ["Vehicle_Model", "Fuel_Type", "Transmission_Type", "Owner_Type"]
df = pd.get_dummies(df, columns=one_hot_cols, drop_first=False)

In [8]:
# Convert the date columns to numeric features (days since last service,
# warranty-expired flag), measured against a hard-coded reference date.
current_date = pd.to_datetime("2025-05-28")

# Parse both date columns to datetimes.
for date_col in ("Last_Service_Date", "Warranty_Expiry_Date"):
    df[date_col] = pd.to_datetime(df[date_col])

# Whole days between the last service and the reference date.
elapsed_since_service = current_date - df["Last_Service_Date"]
df["Days_Since_Last_Service"] = elapsed_since_service.dt.days

# 1 when the warranty expiry date is strictly before the reference date, else 0.
df["Warranty_Expired"] = df["Warranty_Expiry_Date"].lt(current_date).astype(int)

# The raw date columns are no longer needed once the features are derived.
df = df.drop(columns=["Last_Service_Date", "Warranty_Expiry_Date"])

In [9]:
# Convert all columns to integers: float64 columns become int64, every other
# dtype (bool, object, existing ints, ...) becomes int32.
# NOTE(review): casting float64 to int64 discards the fractional part, so any
# genuinely fractional column is truncated here -- confirm this is intended.
# An object column that is not purely numeric will make astype raise.
for col, dtype in df.dtypes.items():
    target_dtype = "int64" if dtype == "float64" else "int32"
    df[col] = df[col].astype(target_dtype)

In [10]:
# 6. Maintenance Needs by Key Features
# Bar plot of Need_Maintenance against Maintenance_History, written to disk
# and closed without being displayed.
# NOTE(review): an identical "6. Maintenance Needs by Key Features" cell
# appears later in the notebook and writes the same file; one of the two
# looks redundant.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=df, x="Maintenance_History", y="Need_Maintenance", ax=ax)
ax.set_title("Need Maintenance by Maintenance History")
fig.savefig("plots/maintenance_by_history.png")
plt.close(fig)

In [11]:
# 4. Numerical Features Distribution
# One histogram (with KDE overlay) per numerical column, laid out on a 2x3
# grid and saved as a single image.
numerical_cols = ["Mileage", "Reported_Issues", "Vehicle_Age", "Engine_Size", "Odometer_Reading", "Days_Since_Last_Service"]
fig, axes = plt.subplots(2, 3, figsize=(12, 8))
# axes.flat walks the grid row by row, matching the order of numerical_cols.
for ax, col in zip(axes.flat, numerical_cols):
    sns.histplot(df[col], kde=True, ax=ax)
    ax.set_title(f"Distribution of {col}")
fig.tight_layout()
fig.savefig("plots/numerical_distributions.png")
plt.close(fig)

In [12]:
# 5. Correlation Analysis
# Pairwise correlation of every column in the processed frame, drawn as an
# un-annotated heatmap and saved to disk.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, cmap="coolwarm", annot=False, ax=ax)
ax.set_title("Correlation Heatmap")
fig.savefig("plots/correlation_heatmap.png")
plt.close(fig)

In [13]:
# 6. Maintenance Needs by Key Features
# Bar plot of Need_Maintenance against Maintenance_History; the figure is
# saved to disk and then closed so it does not accumulate in memory.
history_fig, history_ax = plt.subplots(figsize=(8, 6))
sns.barplot(
    x="Maintenance_History",
    y="Need_Maintenance",
    data=df,
    ax=history_ax,
)
history_ax.set_title("Need Maintenance by Maintenance History")
history_fig.savefig("plots/maintenance_by_history.png")
plt.close(history_fig)

In [14]:
# 7. Save Processed Data for Further Use
# Write the processed frame to CSV without the index column. The confirmation
# message is built from the same path variable that is passed to to_csv, so
# the reported location always matches where the file was actually written
# (previously the message said 'data/...' while the file went to '../data/...').
processed_path = "../data/processed_data.csv"
df.to_csv(processed_path, index=False)
print(f"Processed data saved to '{processed_path}'")

Processed data saved to 'data/processed_data.csv'
