In [3]:
import pandas as pd, numpy as np, os, math
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import f_classif, SelectKBest
from collections import Counter

dataset_path = "dataset.xlsx"

# eda
df = pd.read_excel(dataset_path)
print("Loaded merged dataset:", dataset_path, "shape:", df.shape)


print("\nColumns:", df.columns.tolist())
print("\nFirst 3 rows:")
print(df.head(3))


print("\nMissing values per column:\n", df.isnull().sum())


numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
numeric_cols = [c for c in numeric_cols if c not in ["userID"]]
print("\nNumeric cols count:", len(numeric_cols))


plot_dir = "eda_plots"
os.makedirs(plot_dir, exist_ok=True)

features_to_plot = [
    "avgHoldTime","medianIKD","holdTimeStdDev","tempoChangeRate","typingSpeedWPM",
    "entropyIKD","maxBurstLength","commonDigraphTiming","skewnessIKD","ikdStdDev",
    "autocorrLag1IKD","correctionLatencyMean","backspaceRatio"
]
features_to_plot = [f for f in features_to_plot if f in df.columns]

for col in features_to_plot:
    plt.figure(figsize=(5,3))
    plt.hist(df[col].dropna(), bins=25)
    plt.title(col)
    plt.xlabel(col)
    plt.ylabel("count")
    plt.tight_layout()
    plt.savefig(os.path.join(plot_dir, f"hist_{col}.png"))
    plt.close()

print("Saved histograms for selected features.")


corr = df[numeric_cols].corr().abs()
corr.to_csv("correlation_matrix.csv")
plt.figure(figsize=(8,6))
plt.imshow(corr, interpolation='nearest')
plt.colorbar()
plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
plt.yticks(range(len(corr.index)), corr.index)
plt.title("Correlation (abs)")
plt.tight_layout()
plt.savefig(os.path.join(plot_dir, "corr_matrix.png"))
plt.close()
print("Saved correlation matrix and heatmap.")

Loaded merged dataset: dataset.xlsx shape: (1020, 23)

Columns: ['userID', 'autocorrLag1IKD', 'avgBurstLength', 'avgHoldTime', 'avgIKD', 'avgPauseLength', 'backspaceCount', 'backspaceRatio', 'burstCount', 'commonDigraphTiming', 'correctionLatencyMean', 'entropyIKD', 'errorRate', 'holdTimeStdDev', 'ikdStdDev', 'maxBurstLength', 'medianIKD', 'name', 'pauseCount', 'shiftPressCount', 'skewnessIKD', 'tempoChangeRate', 'typingSpeedWPM']

First 3 rows:
   userID  autocorrLag1IKD  avgBurstLength  avgHoldTime      avgIKD  \
0       1        -0.002090       80.000000    94.501000  117.084000   
1       1         0.060646       70.760716    97.626946  122.845754   
2       1         0.128373       72.092142    98.855506  126.540442   

   avgPauseLength  backspaceCount  backspaceRatio  burstCount  \
0        0.000000        6.000000        0.074000    1.000000   
1        0.058128        4.896679        0.046157    0.979272   
2       -0.038170        3.071730        0.004722    0.938096   

   c

Loaded merged dataset: merged_17_users_dataset.xlsx shape: (510, 23)

Columns: ['userID', 'autocorrLag1IKD', 'avgBurstLength', 'avgHoldTime', 'avgIKD', 'avgPauseLength', 'backspaceCount', 'backspaceRatio', 'burstCount', 'commonDigraphTiming', 'correctionLatencyMean', 'entropyIKD', 'errorRate', 'holdTimeStdDev', 'ikdStdDev', 'maxBurstLength', 'medianIKD', 'name', 'pauseCount', 'shiftPressCount', 'skewnessIKD', 'tempoChangeRate', 'typingSpeedWPM']

First 3 rows:
   userID  autocorrLag1IKD  avgBurstLength  avgHoldTime   avgIKD  \
0       1         0.058946            67.0       95.916  111.061   
1       1         0.085662            89.0      103.980  121.481   
2       1        -0.031170            64.0       99.425  131.163   

   avgPauseLength  backspaceCount  backspaceRatio  burstCount  \
0             0.0               0           0.000           1   
1             0.0              12           0.132           1   
2             0.0               1           0.015           1   

   commonDigraphTiming  ...  holdTimeStdDev  ikdStdDev  maxBurstLength  \
0                68.57  ...          36.867     50.993              67   
1                16.31  ...          78.751     60.947              89   
2                24.23  ...          42.862     77.650              64   

   medianIKD       name  pauseCount  shiftPressCount skewnessIKD  \
0     101.00  Maaz Khan           0                1    0.987325   
1     106.95  Maaz Khan           0                4    1.354817   
2     118.30  Maaz Khan           0                1    1.251438   

   tempoChangeRate  typingSpeedWPM  
0            0.738              93  
1            0.736              72  
2            0.774              74  

[3 rows x 23 columns]

Missing values per column:
 userID                   0
autocorrLag1IKD          0
avgBurstLength           0
avgHoldTime              0
avgIKD                   0
avgPauseLength           0
backspaceCount           0
backspaceRatio           0
burstCount               0
commonDigraphTiming      0
correctionLatencyMean    0
entropyIKD               0
errorRate                0
holdTimeStdDev           0
ikdStdDev                0
maxBurstLength           0
medianIKD                0
name                     0
pauseCount               0
shiftPressCount          0
skewnessIKD              0
tempoChangeRate          0
typingSpeedWPM           0
dtype: int64

Numeric cols count: 21
Saved histograms for selected features.
Saved correlation matrix and heatmap.


In [4]:
# data preprocessing
df_proc = df.copy()
for col in numeric_cols:
    if df_proc[col].isnull().any():
        df_proc[col] = df_proc[col].fillna(df_proc[col].median())

df_proc = df_proc.dropna(subset=["name"])
print("After filling, missing values:", df_proc.isnull().sum().sum())



X = df_proc.drop(columns=["name"] + (["userID"] if "userID" in df_proc.columns else []))
y = df_proc["name"].astype(str)
print("Feature count:", X.shape[1])



le = LabelEncoder()
y_enc = le.fit_transform(y)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_enc, test_size=0.30, random_state=42, stratify=y_enc)
print("Train/Test sizes:", X_train.shape, X_test.shape)

After filling, missing values: 0
Feature count: 21
Train/Test sizes: (714, 21) (306, 21)


After filling, missing values: 0
Feature count: 21
Train/Test sizes: (357, 21) (153, 21)


In [5]:
# feature selection
selected_features = [
    "avgHoldTime","medianIKD","holdTimeStdDev","tempoChangeRate",
    "typingSpeedWPM","entropyIKD","maxBurstLength","commonDigraphTiming",
    "skewnessIKD","ikdStdDev","correctionLatencyMean","backspaceRatio"
]
present_selected = [f for f in selected_features if f in X.columns.tolist()]
print("Selected features present:", present_selected)

selector = SelectKBest(f_classif, k='all').fit(X, y_enc)
f_scores = pd.Series(selector.scores_, index=X.columns).sort_values(ascending=False)
rf = RandomForestClassifier(n_estimators=200, random_state=0).fit(X, y_enc)
rf_imps = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)

print("\nTop 10 ANOVA features:\n", f_scores.head(10))
print("\nTop 10 RF features:\n", rf_imps.head(10))

# Count how many of the selected features are in top 20 of either metric
in_top = {f: {"in_f_top20": f in f_scores.head(20).index, "in_rf_top20": f in rf_imps.head(20).index} for f in present_selected}
print("\nSelected features ranking presence in top20:\n", pd.DataFrame(in_top).T)

Selected features present: ['avgHoldTime', 'medianIKD', 'holdTimeStdDev', 'tempoChangeRate', 'typingSpeedWPM', 'entropyIKD', 'maxBurstLength', 'commonDigraphTiming', 'skewnessIKD', 'ikdStdDev', 'correctionLatencyMean', 'backspaceRatio']

Top 10 ANOVA features:
 medianIKD         1536.865234
avgIKD            1336.043977
burstCount         844.375212
pauseCount         803.536116
avgHoldTime        759.691964
typingSpeedWPM     464.390793
maxBurstLength     441.722184
ikdStdDev          325.460694
avgBurstLength     313.624859
avgPauseLength     242.284401
dtype: float64

Top 10 RF features:
 avgHoldTime        0.140992
medianIKD          0.115733
holdTimeStdDev     0.083063
avgIKD             0.075854
shiftPressCount    0.069514
typingSpeedWPM     0.046714
ikdStdDev          0.045683
avgBurstLength     0.039116
pauseCount         0.038371
burstCount         0.038336
dtype: float64

Selected features ranking presence in top20:
                        in_f_top20  in_rf_top20
avgHoldTime 

Selected features present: ['avgHoldTime', 'medianIKD', 'holdTimeStdDev', 'tempoChangeRate', 'typingSpeedWPM', 'entropyIKD', 'maxBurstLength', 'commonDigraphTiming', 'skewnessIKD', 'ikdStdDev', 'correctionLatencyMean', 'backspaceRatio']

Top 10 ANOVA features:
 medianIKD         768.074364
avgIKD            611.633933
avgHoldTime       420.874718
burstCount        403.759690
pauseCount        400.438827
maxBurstLength    232.638353
typingSpeedWPM    227.560837
avgBurstLength    158.208851
ikdStdDev         152.398188
avgPauseLength    114.615541
dtype: float64

Top 10 RF features:
 avgHoldTime        0.135206
medianIKD          0.106823
holdTimeStdDev     0.080794
avgIKD             0.077299
shiftPressCount    0.060853
tempoChangeRate    0.049973
ikdStdDev          0.047881
typingSpeedWPM     0.046042
entropyIKD         0.040009
avgBurstLength     0.039993
dtype: float64

Selected features ranking presence in top20:
                        in_f_top20  in_rf_top20
avgHoldTime                  True         True
medianIKD                    True         True
holdTimeStdDev               True         True
tempoChangeRate              True         True
typingSpeedWPM               True         True
entropyIKD                   True         True
maxBurstLength               True         True
commonDigraphTiming          True         True
skewnessIKD                  True         True
ikdStdDev                    True         True
correctionLatencyMean        True         True
backspaceRatio               True         True


In [6]:
# Train sklearn KNN
k = 5
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
acc_knn = accuracy_score(y_test, y_pred)
print(f"\nSklearn KNN accuracy: {acc_knn:.4f}")
print(classification_report(y_test, y_pred, target_names=le.classes_))


Sklearn KNN accuracy: 0.9118
                     precision    recall  f1-score   support

        Ammar Ahmed       0.94      0.94      0.94        18
       Ammar Wolfie       0.76      0.89      0.82        18
          Amna kazi       1.00      1.00      1.00        18
              Beiah       1.00      1.00      1.00        18
      Ghina Durrani       0.95      1.00      0.97        18
            Hasnain       0.90      1.00      0.95        18
MUHAMMAD JAWAD KHAN       1.00      1.00      1.00        18
          Maaz Khan       0.80      0.67      0.73        18
       Mahad Arshad       1.00      1.00      1.00        18
         Maira Qazi       0.90      1.00      0.95        18
Muhammad Ayan Azhar       1.00      0.94      0.97        18
          Qasim Ali       0.83      0.56      0.67        18
      Umaima Fatima       0.94      0.89      0.91        18
        Umer Shaikh       0.81      0.72      0.76        18
             ayesha       0.75      1.00      0.86    


Sklearn KNN accuracy: 0.8954
                     precision    recall  f1-score   support

        Ammar Ahmed       0.82      1.00      0.90         9
       Ammar Wolfie       1.00      1.00      1.00         9
          Amna kazi       0.90      1.00      0.95         9
              Beiah       1.00      1.00      1.00         9
      Ghina Durrani       0.89      0.89      0.89         9
            Hasnain       0.73      0.89      0.80         9
MUHAMMAD JAWAD KHAN       1.00      1.00      1.00         9
          Maaz Khan       0.86      0.67      0.75         9
       Mahad Arshad       1.00      1.00      1.00         9
         Maira Qazi       0.90      1.00      0.95         9
Muhammad Ayan Azhar       0.90      1.00      0.95         9
          Qasim Ali       0.83      0.56      0.67         9
      Umaima Fatima       0.89      0.89      0.89         9
        Umer Shaikh       1.00      0.78      0.88         9
             ayesha       0.78      0.78      0.78         9
      daniyal badar       0.90      1.00      0.95         9
        marina ayaz       0.88      0.78      0.82         9

           accuracy                           0.90       153
          macro avg       0.90      0.90      0.89       153
       weighted avg       0.90      0.90      0.89       153



In [7]:
# custom knn
def euclidean_distance(x1, x2):
  return np.sqrt(np.sum((x1 - x2) ** 2))

class KNN:
  def __init__(self, k=3):
    self.k = k

  def fit(self, X, y):
    self.X_train = X
    self.y_train = y

  def predict(self, X):
    predictions = [self._predict(x) for x in X]
    return np.array(predictions)

  def _predict(self, x):
    distances = [euclidean_distance(x, x_train) for x_train in self.X_train]
    k_indices = np.argsort(distances)[:self.k]
    k_labels = [self.y_train[i] for i in k_indices]
    most_common = Counter(k_labels).most_common(1)
    return most_common[0][0]

In [8]:
print("\nRunning custom KNN...")
knn = KNN(k=5)
knn.fit(X_train, y_train)
y_pred_custom = knn.predict(X_test)
accuracy = np.mean(y_pred_custom == y_test)
print("Accuracy:", accuracy)
print(classification_report(y_test, y_pred_custom, target_names=le.classes_))

# Compare
agree = (y_pred == y_pred_custom).mean()
print(f"Agreement between sklearn and custom KNN: {agree:.4f}")


Running custom KNN...
Accuracy: 0.9150326797385621
                     precision    recall  f1-score   support

        Ammar Ahmed       0.94      0.94      0.94        18
       Ammar Wolfie       0.76      0.89      0.82        18
          Amna kazi       1.00      1.00      1.00        18
              Beiah       1.00      1.00      1.00        18
      Ghina Durrani       0.95      1.00      0.97        18
            Hasnain       0.90      1.00      0.95        18
MUHAMMAD JAWAD KHAN       1.00      1.00      1.00        18
          Maaz Khan       0.86      0.67      0.75        18
       Mahad Arshad       1.00      1.00      1.00        18
         Maira Qazi       0.90      1.00      0.95        18
Muhammad Ayan Azhar       1.00      0.94      0.97        18
          Qasim Ali       0.85      0.61      0.71        18
      Umaima Fatima       0.94      0.89      0.91        18
        Umer Shaikh       0.81      0.72      0.76        18
             ayesha       0.75  

In [9]:
# Save results
pd.DataFrame({"sklearn_pred": y_pred, "custom_pred": y_pred_custom, "true": y_test}).to_csv("knn_preds_compare.csv", index=False)
pd.DataFrame({"feature": X.columns, "f_score": f_scores, "rf_imp": rf_imps}).to_csv("feature_justification.csv")
pd.DataFrame(confusion_matrix(y_test, y_pred), index=le.classes_, columns=le.classes_).to_csv("confusion_sklearn_knn.csv")
pd.DataFrame(confusion_matrix(y_test, y_pred_custom), index=le.classes_, columns=le.classes_).to_csv("confusion_custom_knn.csv")

print("\nSaved comparison and justification outputs")


Saved comparison and justification outputs
