In [None]:
# Data Preprocessing
import pandas as pd
import numpy as np


In [None]:
# Load datasets
# Maps a short dataset name to the CSV file it is read from (paths are
# relative to the notebook's working directory).
data_paths = {
    'ai4i2020': '../data/ai4i2020.csv',
    'cleaned_vehicle_data': '../data/cleaned_vehicle_data.csv',
    'engine_data': '../data/engine_data.csv',
    'vehicle_sensor_data': '../data/vehicle_sensor_data.csv'
}
# One DataFrame per dataset, keyed by the same short name.
dfs = {name: pd.read_csv(path) for name, path in data_paths.items()}
# Quick look at each dataset: first rows, column dtypes / non-null counts,
# and summary statistics.
for name, df in dfs.items():
    print(f'--- {name} ---')
    print(df.head())
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so it is called directly; wrapping it in print() would emit a stray
    # "None" line after every report.
    df.info()
    print(df.describe())


In [None]:
# Data Cleaning: missing values, duplicates, outliers
# Every frame in `dfs` is modified in place: duplicate rows are dropped and
# missing values are filled. Outliers are only counted and reported, never
# removed.
for name, df in dfs.items():
    print(f'--- {name} ---')
    print('Missing values per column:')
    print(df.isnull().sum())
    print('Number of duplicate rows:', df.duplicated().sum())
    df.drop_duplicates(inplace=True)
    # Fill missing values with median for numeric columns
    num_cols = df.select_dtypes(include=[np.number]).columns
    df[num_cols] = df[num_cols].fillna(df[num_cols].median())
    # Fill missing categorical values with mode
    cat_cols = df.select_dtypes(include=['object']).columns
    for col in cat_cols:
        modes = df[col].mode()
        if modes.empty:
            # mode() ignores missing values, so a column with no observed
            # values has no mode at all; indexing [0] would raise here.
            # There is nothing sensible to fill with, so leave it as is.
            continue
        df[col] = df[col].fillna(modes.iloc[0])
    # Simple outlier detection (Z-score for numeric columns)
    # A constant column has a standard deviation of 0, which yields NaN
    # z-scores; NaN never compares greater than 3, so such columns simply
    # report zero outliers.
    z_scores = np.abs((df[num_cols] - df[num_cols].mean()) / df[num_cols].std())
    outliers = (z_scores > 3).sum()
    print('Potential outliers per column:')
    print(outliers)


In [None]:
# Data Normalization/Scaling
from sklearn.preprocessing import MinMaxScaler
# `scaled_dfs` holds a scaled copy of every frame in `dfs`; the frames in
# `dfs` themselves are not modified by this cell.
scaled_dfs = {}
for name, df in dfs.items():
    num_cols = df.select_dtypes(include=[np.number]).columns
    df_scaled = df.copy()
    # MinMaxScaler refuses input with zero columns or zero rows (ValueError),
    # so a frame with nothing to scale is passed through as an unscaled copy
    # instead of aborting the whole loop.
    if len(num_cols) > 0 and len(df) > 0:
        # A fresh scaler per dataset: each frame is scaled against its own
        # per-column minimum and maximum.
        scaler = MinMaxScaler()
        df_scaled[num_cols] = scaler.fit_transform(df[num_cols])
    scaled_dfs[name] = df_scaled
    print(f'{name} - Scaled sample:')
    print(df_scaled.head())


In [None]:
# Feature Engineering Example
# Adds 0/1 indicator columns directly onto the frames held in `dfs`; a frame
# that lacks the source column is left unchanged. Copies taken earlier (such
# as those in `scaled_dfs`) do not receive these columns.
for name, df in dfs.items():
    if 'mileage' in df:  # membership on a DataFrame tests its column labels
        # 1 where mileage exceeds 100000, otherwise 0
        df['high_mileage'] = df['mileage'].gt(100000).astype(int)
    if 'temperature' in df:
        # 1 where the value is above this frame's own mean temperature
        df['temp_above_avg'] = df['temperature'].gt(df['temperature'].mean()).astype(int)
    print(f'Feature engineering done for {name}')
