In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer

# Load the raw bike table from disk into the working DataFrame
df = pd.read_csv(filepath_or_buffer='working_db_fin.csv')

# Method 1: Fill missing values with mean based on similar bikes (same displacement range)
def fill_missing_values(df, columns=('mileage', 'top_speed')):
    """Fill NaNs in ``columns`` with the mean of bikes in the same displacement range.

    Bikes are grouped by ``displacement_cc`` into the right-inclusive ranges
    (0, 125], (125, 250], (250, 500], (500, 1000] and (1000, inf). Every
    missing cell of a target column is replaced by the mean of the observed
    values of that column within the same range.

    Args:
        df: Frame containing a ``displacement_cc`` column convertible to float.
        columns: Names of the columns to impute. Names that are not present
            in ``df`` are skipped.

    Returns:
        A new DataFrame with the same columns as ``df``. The input frame is
        left untouched (no helper column is added to it, no values are
        overwritten).

    Note:
        A cell stays NaN when its displacement falls outside every bin
        (NaN or <= 0), or when no bike in its range has an observed value
        for that column.
    """
    range_edges = [0, 125, 250, 500, 1000, float('inf')]
    range_labels = ['0-125', '126-250', '251-500', '501-1000', '1000+']

    # Work on a copy so the caller's frame is neither filled in place nor
    # left carrying a temporary grouping column.
    filled = df.copy()

    # Keep the grouping key as a standalone Series instead of a column on the
    # frame; there is then nothing to drop afterwards.
    displacement_range = pd.cut(
        filled['displacement_cc'].astype(float),
        bins=range_edges,
        labels=range_labels,
    )

    for column in columns:
        if column not in filled.columns:
            continue

        # Snapshot of the gaps; the ranges are disjoint, so filling one range
        # never changes which cells are missing in another.
        missing = filled[column].isna()
        if not missing.any():
            continue

        for label in range_labels:
            in_range = displacement_range == label
            targets = in_range & missing
            if not targets.any():
                continue
            # The mean skips NaN, so it is computed from observed values only.
            range_mean = filled.loc[in_range, column].astype(float).mean()
            filled.loc[targets, column] = range_mean

    return filled

# Coerce the columns to be imputed to numbers; unparseable entries become NaN
has_top_speed = 'top_speed' in df.columns
df['mileage'] = pd.to_numeric(df['mileage'], errors='coerce')
if has_top_speed:
    df['top_speed'] = pd.to_numeric(df['top_speed'], errors='coerce')

# Impute the gaps using the displacement-range means
df = fill_missing_values(df)

# Keep two decimal places in the imputed columns
rounded_columns = ['mileage']
if has_top_speed:
    rounded_columns.append('top_speed')
for column_name in rounded_columns:
    df[column_name] = df[column_name].round(2)

# Persist the result without the index column
df.to_csv('working_db_filled.csv', index=False)

# Report how many values are still missing in each column
print("\nMissing values statistics:")
remaining_missing = df.isnull().sum()
print(remaining_missing)

# Report min / max / mean for each imputed column
print("\nValue ranges:")
report_sections = [("Mileage", 'mileage')]
if has_top_speed:
    report_sections.append(("Top Speed", 'top_speed'))
for heading, column_name in report_sections:
    series = df[column_name]
    print(f"\n{heading}:")
    print(f"Min: {series.min():.2f}")
    print(f"Max: {series.max():.2f}")
    print(f"Mean: {series.mean():.2f}")


Missing values statistics:
brand               0
bike_name           0
displacement_cc     0
price               0
power_bhp          11
weight_kg          13
mileage             0
top_speed          38
star_rating        11
dtype: int64

Value ranges:

Mileage:
Min: 9.00
Max: 104.00
Mean: 35.37

Top Speed:
Min: 83.00
Max: 200.00
Mean: 132.82
