In [1]:
import os
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression, Perceptron
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import (
    mean_squared_error, r2_score,
    accuracy_score, precision_score, recall_score, f1_score, classification_report
)
from sklearn.model_selection import train_test_split

# Create output directory
OUTDIR = "outputs"
os.makedirs(OUTDIR, exist_ok=True)

# Utility functions
def savefig(fname):
    full = os.path.join(OUTDIR, fname)
    plt.savefig(full, dpi=300, bbox_inches='tight')
    print("Saved:", full)

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

# Load data (Train.xlsx expected)
try:
    df_train = pd.read_excel("Train.xlsx")
    df_test = pd.read_excel("Test.xlsx")
    print("Loaded Train.xlsx and Test.xlsx")
except Exception as e:
    print("Could not load Excel files; using generated sample data. Error:", e)
    np.random.seed(42)
    n = 400
    df_train = pd.DataFrame({
        'name': np.random.choice([f"user_{i:02d}" for i in range(1, 11)], n),
        'typingSpeedWPM': np.random.normal(45, 12, n),
        'avgHoldTime': np.random.normal(100, 25, n),
        'commonDigraphTiming': np.random.normal(80, 30, n),
        'correctionLatencyMean': np.random.normal(400, 150, n),
        'entropyIKD': np.random.normal(0.9, 0.12, n),
        'holdTimeStdDev': np.random.normal(40, 12, n),
        'skewnessIKD': np.random.normal(1.6, 0.7, n),
        'tempoChangeRate': np.random.normal(0.7, 0.15, n),
        'backspaceRatio': np.random.beta(2, 20, n),
        'medianIKD': np.random.normal(200, 60, n),
        'maxBurstLength': np.random.normal(50, 20, n),
        'ikdStdDev': np.random.normal(140, 50, n),
    })
    # create test set by sampling
    df_test = df_train.sample(frac=0.2, random_state=1).reset_index(drop=True)
    df_train = df_train.drop(df_test.index).reset_index(drop=True)

print("Train shape:", df_train.shape, "Test shape:", df_test.shape)

# Feature selection
features = [
    "avgHoldTime", "medianIKD", "holdTimeStdDev", "tempoChangeRate",
    "typingSpeedWPM", "entropyIKD", "maxBurstLength", "commonDigraphTiming",
    "skewnessIKD", "ikdStdDev", "correctionLatencyMean", "backspaceRatio"
]

# ensure features exist
features = [f for f in features if f in df_train.columns]
print("Using features:", features)

target_label = 'name'

# Encode labels + split
encoder = LabelEncoder()
encoder.fit(pd.concat([df_train[target_label], df_test[target_label]]))
df_train['label_enc'] = encoder.transform(df_train[target_label])
df_test['label_enc'] = encoder.transform(df_test[target_label])

# Build X, y for predicting user name
X = df_train[features].copy()
y = df_train['label_enc'].copy()

# scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# train/test split for model evaluation
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

# Batch Gradient Descent (custom) for linear regression
def batch_gradient_descent(X, y, lr=0.001, n_iters=5000, add_intercept=True, verbose=False):
    Xb = np.copy(X)
    if add_intercept:
        Xb = np.hstack([np.ones((Xb.shape[0], 1)), Xb])  # add bias column
    n_samples, n_features = Xb.shape
    theta = np.zeros(n_features)
    history = []
    for i in range(n_iters):
        preds = Xb.dot(theta)
        error = preds - y
        grad = (1.0 / n_samples) * Xb.T.dot(error)
        theta -= lr * grad
        if i % 500 == 0:
            loss = (1.0 / (2*n_samples)) * np.sum(error ** 2)
            history.append((i, loss))
            if verbose:
                print(f"Iter {i:5d}, Loss: {loss:.6f}")
    return theta, history

# K-Means Clustering (unsupervised)
k = 17  # number of clusters up to number of users
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(X_scaled)
cluster_labels = kmeans.predict(X_scaled)

# Add clusters to a PCA 2D plot later
numOfPC = 2
pca2 = PCA(n_components=numOfPC)
X_pca2 = pca2.fit_transform(X_scaled)

plt.figure(figsize=(8,6))
sns.scatterplot(x=X_pca2[:,0], y=X_pca2[:,1], hue=cluster_labels, palette='tab10', legend='full', s=40)
plt.title(f'KMeans clusters (k={k}) on PCA({numOfPC})')
savefig("kmeans_pca2_scatter.png")
plt.close()

# PCA (for visualization & dimensionality reduction)
pca = PCA(n_components=numOfPC)
pc = pca.fit_transform(X_scaled)
explained = pca.explained_variance_ratio_
print("\nPCA explained variance ratio:", explained)

plt.figure(figsize=(8,6))
sns.scatterplot(x=pc[:,0], y=pc[:,1], hue=df_train['label_enc'], palette='tab10', s=40)
plt.title(f'PCA ({numOfPC} components) of features colored by user label')
plt.xlabel(f'PC1 ({explained[0]*100:.1f}%)')
plt.ylabel(f'PC2 ({explained[1]*100:.1f}%)')
savefig("pca_2_user_scatter.png")
plt.close()

# Scree plot
plt.figure(figsize=(6,4))
components = np.arange(1, len(pca.explained_variance_ratio_)+1)
plt.bar(components, pca.explained_variance_ratio_)
plt.xlabel("Principal component")
plt.ylabel("Explained variance ratio")
plt.title("PCA Scree Plot")
savefig("pca_scree.png")
plt.close()

# Neural Network (MLPClassifier) for user classification

# We'll use a small MLP with one hidden layer
mlp = MLPClassifier(hidden_layer_sizes=(64,), activation='relu', max_iter=1000, random_state=42)
mlp.fit(X_train, y_train)
y_pred_mlp = mlp.predict(X_val)

mlp_metrics = {
    'accuracy': accuracy_score(y_val, y_pred_mlp),
    'precision_macro': precision_score(y_val, y_pred_mlp, average='macro', zero_division=0),
    'recall_macro': recall_score(y_val, y_pred_mlp, average='macro', zero_division=0),
    'f1_macro': f1_score(y_val, y_pred_mlp, average='macro', zero_division=0)
}
print("\nMLP metrics:", mlp_metrics)
print("Classification report (MLP):\n", classification_report(y_val, y_pred_mlp, zero_division=0))

# Perceptron (simple linear classifier)
per = Perceptron(max_iter=1000, random_state=42)
per.fit(X_train, y_train)
y_pred_per = per.predict(X_val)
per_metrics = {
    'accuracy': accuracy_score(y_val, y_pred_per),
    'precision_macro': precision_score(y_val, y_pred_per, average='macro', zero_division=0),
    'recall_macro': recall_score(y_val, y_pred_per, average='macro', zero_division=0),
    'f1_macro': f1_score(y_val, y_pred_per, average='macro', zero_division=0)
}
print("\nPerceptron metrics:", per_metrics)

# Linear Regression for user prediction (treating encoded label as continuous)
linreg = LinearRegression()
linreg.fit(X_train, y_train.values)
y_pred_lin = linreg.predict(X_val)
# Round to nearest integer and clip to valid label range
y_pred_lin_rounded = np.clip(np.round(y_pred_lin).astype(int), 0, len(encoder.classes_)-1)

lin_metrics = {
    'MSE': mean_squared_error(y_val, y_pred_lin),
    'RMSE': rmse(y_val, y_pred_lin),
    'R2': r2_score(y_val, y_pred_lin),
    'accuracy': accuracy_score(y_val, y_pred_lin_rounded)
}
print("\nLinear Regression metrics:", lin_metrics)

# Gradient Descent for user prediction
theta, history = batch_gradient_descent(X_train, y_train.values, lr=0.01, n_iters=5000, verbose=False)
X_val_b = np.hstack([np.ones((X_val.shape[0],1)), X_val])
y_pred_gd = X_val_b.dot(theta)
# Round to nearest integer and clip to valid label range
y_pred_gd_rounded = np.clip(np.round(y_pred_gd).astype(int), 0, len(encoder.classes_)-1)

gd_metrics = {
    'MSE': mean_squared_error(y_val, y_pred_gd),
    'RMSE': rmse(y_val, y_pred_gd),
    'R2': r2_score(y_val, y_pred_gd),
    'accuracy': accuracy_score(y_val, y_pred_gd_rounded)
}
print("Gradient Descent metrics:", gd_metrics)

# save GD loss curve
iters, losses = zip(*history)
plt.figure(figsize=(6,4))
plt.plot(iters, losses)
plt.xlabel("Iteration")
plt.ylabel("Loss (1/(2n) sum sq)")
plt.title("GD Loss Curve")
savefig("gd_loss_curve.png")
plt.close()

# Summary comparisons and plots

# All models comparison (accuracy)
all_models_table = pd.DataFrame({
    'model': ['MLPClassifier', 'Perceptron', 'LinearRegression', 'GradientDescent'],
    'accuracy': [mlp_metrics['accuracy'], per_metrics['accuracy'],
                 lin_metrics['accuracy'], gd_metrics['accuracy']],
    'precision_macro': [mlp_metrics['precision_macro'], per_metrics['precision_macro'],
                       np.nan, np.nan],  # Regression models don't have precision/recall/f1
    'recall_macro': [mlp_metrics['recall_macro'], per_metrics['recall_macro'],
                     np.nan, np.nan],
    'f1_macro': [mlp_metrics['f1_macro'], per_metrics['f1_macro'],
                 np.nan, np.nan]
})
all_models_table.to_csv(os.path.join(OUTDIR, "all_models_comparison.csv"), index=False)
print("\nAll models comparison:\n", all_models_table)

# Regression models comparison (MSE, RMSE, R2)
reg_table = pd.DataFrame({
    'model': ['LinearRegression', 'GradientDescent'],
    'MSE': [lin_metrics['MSE'], gd_metrics['MSE']],
    'RMSE': [lin_metrics['RMSE'], gd_metrics['RMSE']],
    'R2': [lin_metrics['R2'], gd_metrics['R2']],
    'accuracy': [lin_metrics['accuracy'], gd_metrics['accuracy']]
})
reg_table.to_csv(os.path.join(OUTDIR, "regression_comparison.csv"), index=False)
print("\nRegression models comparison:\n", reg_table)

# Save a barplot comparing all model accuracies
plt.figure(figsize=(10,5))
sns.barplot(x='model', y='accuracy', data=all_models_table)
plt.ylim(0,1)
plt.title("User prediction accuracy comparison (all models)")
plt.xticks(rotation=45, ha='right')
savefig("all_models_accuracy_comparison.png")
plt.close()

# Save regression RMSE comparison
plt.figure(figsize=(8,5))
sns.barplot(x='model', y='RMSE', data=reg_table)
plt.title("Regression models RMSE comparison")
savefig("regression_rmse_comparison.png")
plt.close()

# Predictions examples (on test set)

# Prepare test scaled data
X_test_full = scaler.transform(df_test[features])
# Predict users with MLP
y_test_pred_mlp = mlp.predict(X_test_full)
# Map back to names
pred_names_mlp = encoder.inverse_transform(y_test_pred_mlp)
df_test['pred_mlp'] = pred_names_mlp

# Show a few
df_test_sample = df_test[[target_label] + features + ['pred_mlp']].head(10)
df_test_sample.to_csv(os.path.join(OUTDIR, "test_predictions_mlp_sample.csv"), index=False)
print("\nSaved test sample predictions (MLP).")

# Save results summary
with open(os.path.join(OUTDIR, "results_summary.txt"), "w", encoding='utf-8') as f:
    f.write("Linear Regression metrics:\n")
    f.write(str(lin_metrics) + "\n\n")
    f.write("Gradient Descent metrics:\n")
    f.write(str(gd_metrics) + "\n\n")
    f.write("MLP metrics:\n")
    f.write(str(mlp_metrics) + "\n\n")
    f.write("Perceptron metrics:\n")
    f.write(str(per_metrics) + "\n\n")
print("Saved results_summary.txt")

print("\nAll finished. Check the outputs/ directory for plots, CSVs, and lecture_notes.md")


Loaded Train.xlsx and Test.xlsx
Train shape: (850, 23) Test shape: (170, 23)
Using features: ['avgHoldTime', 'medianIKD', 'holdTimeStdDev', 'tempoChangeRate', 'typingSpeedWPM', 'entropyIKD', 'maxBurstLength', 'commonDigraphTiming', 'skewnessIKD', 'ikdStdDev', 'correctionLatencyMean', 'backspaceRatio']
Saved: outputs/kmeans_pca2_scatter.png

PCA explained variance ratio: [0.35055392 0.14166348]
Saved: outputs/pca_2_user_scatter.png
Saved: outputs/pca_scree.png

MLP metrics: {'accuracy': 0.9647058823529412, 'precision_macro': 0.9663101604278076, 'recall_macro': 0.9647058823529411, 'f1_macro': 0.9646616541353383}
Classification report (MLP):
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       0.91      1.00      0.95        10
           2       0.91      1.00      0.95        10
           3       1.00      1.00      1.00        10
           4       1.00      1.00      1.00        10
           5       0.90    