# Aerofit Case Study — Refactored EDA and Baseline Modeling

This notebook contains helper functions for loading, cleaning, and plotting the data, plus a baseline RandomForest classifier that predicts which product (KP281, KP481, KP781) a customer will buy.


In [None]:
# Standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import warnings
warnings.filterwarnings('ignore')
RANDOM_STATE = 42


In [None]:
# Default CSV location read by load_data() when no URL is passed explicitly.
DATA_URL = 'https://d2beiqkhq929f0.cloudfront.net/public_assets/assets/000/001/125/original/aerofit_treadmill.csv?1639992749'

def load_data(url=DATA_URL):
    """Read the treadmill CSV into a DataFrame.

    Parameters
    ----------
    url : str, optional
        Location handed straight to ``pd.read_csv``; defaults to ``DATA_URL``.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV contents, with no further processing applied.
    """
    return pd.read_csv(url)

def basic_checks(df):
    """Print quick data-quality diagnostics and preview the first rows.

    Reports the frame's shape, the per-column null counts and the number of
    fully duplicated rows, then renders ``df.head()`` through the notebook's
    ``display`` hook.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect; it is not modified.

    Returns
    -------
    None
    """
    print('Shape:', df.shape)
    # The line break must be an escape sequence: a raw newline inside a
    # single-quoted literal is a SyntaxError.
    print('Nulls by column:\n', df.isnull().sum())
    print('Duplicates:', df.duplicated().sum())
    # `display` is provided by the IPython/Jupyter kernel namespace.
    display(df.head())

def preprocess(df, drop_duplicates=True):
    """Build the model inputs from the raw frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing ``Product``, ``Gender``, ``MaritalStatus`` and the
        six numeric columns listed below. The caller's frame is left untouched.
    drop_duplicates : bool, default True
        When true, exact duplicate rows are removed and the index is reset.

    Returns
    -------
    tuple
        ``(X, y, cleaned)`` where ``X`` holds the numeric columns followed by
        the drop-first one-hot encoding of ``Gender`` and ``MaritalStatus``,
        ``y`` is the ``Product`` column, and ``cleaned`` is the de-duplicated,
        string-cast copy of the input.
    """
    numeric_cols = ['Age', 'Education', 'Usage', 'Fitness', 'Income', 'Miles']
    categorical_cols = ['Gender', 'MaritalStatus']

    cleaned = df.copy()
    if drop_duplicates:
        cleaned = cleaned.drop_duplicates().reset_index(drop=True)

    # Force the label and the categorical inputs to plain strings.
    for name in ('Product', 'Gender', 'MaritalStatus'):
        cleaned[name] = cleaned[name].astype(str)

    numeric_part = cleaned[numeric_cols].copy()
    dummy_part = pd.get_dummies(cleaned[categorical_cols], drop_first=True)
    features = pd.concat([numeric_part, dummy_part], axis=1)
    target = cleaned['Product']
    return features, target, cleaned


In [None]:
def plot_counts(df, col, hue=None):
    """Draw a labelled count plot of ``col``, optionally split by ``hue``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data passed to ``sns.countplot``.
    col : str
        Column placed on the x axis.
    hue : str, optional
        Column used to split each bar group; ``None`` draws a single series.

    Returns
    -------
    None
        The figure is shown via ``plt.show()``.
    """
    axes = sns.countplot(data=df, x=col, hue=hue)
    # Annotate every bar group with its count.
    for bar_group in axes.containers:
        axes.bar_label(bar_group)
    # Hide the top and right frame lines.
    for side in ('top', 'right'):
        axes.spines[side].set_visible(False)
    plt.tight_layout()
    plt.show()

def plot_hist(df, col, bins=10):
    """Show a histogram of a single column, titled with the column name.

    Parameters
    ----------
    df : pandas.DataFrame
        Data source; only ``df[col]`` is read.
    col : str
        Column to plot; also used as the figure title.
    bins : int, default 10
        Forwarded to ``sns.histplot``.

    Returns
    -------
    None
        The figure is shown via ``plt.show()``.
    """
    values = df[col]
    sns.histplot(values, bins=bins)
    plt.title(col)
    plt.show()

def show_corr(df):
    """Plot an annotated heatmap of pairwise correlations.

    Only numeric and boolean columns take part in the correlation. Since
    pandas 2.0, ``DataFrame.corr`` no longer drops non-numeric columns by
    default and raises on string data, so the frame is filtered first. For an
    all-numeric frame the result is the same as calling ``df.corr()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to correlate; it is not modified.

    Returns
    -------
    None
        The figure is shown via ``plt.show()``.
    """
    # Keep only columns that corr() can handle, so passing a frame with
    # string columns (e.g. Product, Gender) does not raise.
    numeric = df.select_dtypes(include=['number', 'bool'])
    corr = numeric.corr()
    plt.figure(figsize=(8, 6))
    sns.heatmap(corr, annot=True, fmt='.2f')
    plt.show()


In [None]:
# Modeling helper: train a baseline RandomForest and return basic metrics
def train_baseline(X, y, random_state=RANDOM_STATE):
    """Fit a baseline RandomForest classifier and report hold-out metrics.

    The data is split 75/25 with stratification on ``y``. The six numeric
    columns are standardised with a scaler fitted on the training portion
    only; all other columns are passed through unchanged. Accuracy and the
    classification report for the test portion are printed, and the 20
    largest feature importances are displayed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; must contain the columns ``Age``, ``Education``,
        ``Usage``, ``Fitness``, ``Income`` and ``Miles``.
    y : pandas.Series
        Class labels aligned with ``X``.
    random_state : int, optional
        Seed used for both the split and the forest.

    Returns
    -------
    tuple
        ``(clf, fi, (X_train_scaled, X_test_scaled, y_train, y_test, y_pred))``
        where ``clf`` is the fitted classifier and ``fi`` is a Series of
        feature importances sorted in descending order.
    """
    # Stratified split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.25, random_state=random_state
    )
    # Simple scaler for numeric columns (keep manual handling simple)
    scaler = StandardScaler()
    num_cols = ['Age', 'Education', 'Usage', 'Fitness', 'Income', 'Miles']
    X_train_scaled = X_train.copy()
    X_test_scaled = X_test.copy()
    # Fit on the training rows only; the test rows reuse those statistics.
    X_train_scaled[num_cols] = scaler.fit_transform(X_train[num_cols])
    X_test_scaled[num_cols] = scaler.transform(X_test[num_cols])

    clf = RandomForestClassifier(n_estimators=200, random_state=random_state, class_weight='balanced')
    clf.fit(X_train_scaled, y_train)
    y_pred = clf.predict(X_test_scaled)

    print('Accuracy:', accuracy_score(y_test, y_pred))
    # Line breaks are written as escapes: raw newlines inside a single-quoted
    # literal are a SyntaxError.
    print('\nClassification report:\n')
    print(classification_report(y_test, y_pred))

    # Feature importances (map back to column names)
    fi = pd.Series(clf.feature_importances_, index=X.columns).sort_values(ascending=False)
    display(fi.head(20))
    return clf, fi, (X_train_scaled, X_test_scaled, y_train, y_test, y_pred)


In [None]:
# Run the pipeline: load the CSV, print sanity checks, then derive the
# feature matrix X, the target y and the cleaned frame df_clean.
df = load_data()
basic_checks(df)
X, y, df_clean = preprocess(df)

# Quick EDA examples: class balance, gender split per product, the income
# distribution, and correlations between the numeric columns.
plot_counts(df_clean, 'Product')
plot_counts(df_clean, 'Gender', hue='Product')
plot_hist(df_clean, 'Income', bins=10)
show_corr(df_clean[['Age','Education','Usage','Fitness','Income','Miles']])

# Train baseline model; model_objects holds the scaled train/test splits,
# the labels and the test-set predictions returned by train_baseline.
clf, fi, model_objects = train_baseline(X, y)

# Business rules suggestion example (simple thresholds based on EDA).
# The rule is only printed as text here; it is not applied to the data.
print('Example business rule: Recommend KP781 when Income > 60000 AND Fitness >= 4 AND Usage >= 5')
