In [50]:
# pip install pandas

In [51]:
import numpy as np
import pandas as pd
from pathlib import Path

Let's Define the root

In [52]:
ROOT = Path("..")
DATA = ROOT / "StudentProjectData"
list(DATA.iterdir())

[PosixPath('../StudentProjectData/TrainData3.txt'),
 PosixPath('../StudentProjectData/TrainData2.txt'),
 PosixPath('../StudentProjectData/TrainLabel4.txt'),
 PosixPath('../StudentProjectData/TrainData1.txt'),
 PosixPath('../StudentProjectData/TestData4.txt'),
 PosixPath('../StudentProjectData/TrainLabel3.txt'),
 PosixPath('../StudentProjectData/.DS_Store'),
 PosixPath('../StudentProjectData/TrainData4.txt'),
 PosixPath('../StudentProjectData/TestData1.txt'),
 PosixPath('../StudentProjectData/TrainLabel2.txt'),
 PosixPath('../StudentProjectData/TestData3.txt'),
 PosixPath('../StudentProjectData/TestData2.txt'),
 PosixPath('../StudentProjectData/TrainLabel1.txt'),
 PosixPath('../StudentProjectData/MissingData2.txt'),
 PosixPath('../StudentProjectData/MissingData1.txt')]

Data loaders to explore the data

In [53]:
def load_txt_matrix(path):
    text = open(path).read()

    if "," in text.splitlines()[0]:   # if the first line contains commas
        return np.loadtxt(path, delimiter=",")
    else:
        return np.loadtxt(path)

Load Classification sets:

In [54]:
def load_classification(i):
    X_train = load_txt_matrix(DATA / f"TrainData{i}.txt")
    y_train = load_txt_matrix(DATA / f"TrainLabel{i}.txt")
    X_test = load_txt_matrix(DATA / f"TestData{i}.txt" )
    return X_train, y_train, X_test

Load Missing Value Data Sets:

In [55]:
def load_missing(i):
    return load_txt_matrix(DATA / f"MissingData{i}.txt")

Inspect Classification Dataset:

In [58]:
print("–"*40)
for i in range(1,5):
    X_train, y_train, X_test = load_classification(i)
    print(f"Dataset {i}")
    print("Train shape: ", X_train.shape)
    print("Label shape: ", y_train.shape)
    print("Test shape: ", X_test.shape)
    print("–"*40)

––––––––––––––––––––––––––––––––––––––––
Dataset 1
Train shape:  (150, 3312)
Label shape:  (150,)
Test shape:  (53, 3312)
––––––––––––––––––––––––––––––––––––––––
Dataset 2
Train shape:  (100, 9182)
Label shape:  (100,)
Test shape:  (74, 9182)
––––––––––––––––––––––––––––––––––––––––
Dataset 3
Train shape:  (6300, 13)
Label shape:  (6300,)
Test shape:  (2693, 13)
––––––––––––––––––––––––––––––––––––––––
Dataset 4
Train shape:  (2547, 112)
Label shape:  (2547,)
Test shape:  (1092, 112)
––––––––––––––––––––––––––––––––––––––––


Identify Missing Values: 
- Missing values are encoded as 1e99.

In [60]:
for i in range(1,5):
    X_train, y_train, X_test = load_classification(i)
    missing_train = np.sum(X_train == 1e99)
    missing_test = np.sum(X_test == 1e99)
    print(f"Dataset {i}: missing train = {missing_train}, missing_test = {missing_test}")

Dataset 1: missing train = 9936, missing_test = 7021
Dataset 2: missing train = 0, missing_test = 0
Dataset 3: missing train = 1886, missing_test = 0
Dataset 4: missing train = 0, missing_test = 0


For missing datasets

In [61]:
for i in [1,2]:
    M = load_missing(i)
    missing = np.sum(M == 1e99)
    total = M.size 
    pct = missing / total * 100
    print(f"MissingData{i}: {missing} missing ({pct:.2f}%)")

MissingData1: 118 missing (3.48%)
MissingData2: 3762 missing (9.93%)


Basic statistics (feature-level):
- We will use masked arrays so missing values don’t distort stats:

In [62]:
def masked_stats(X):
    m = np.ma.masked_where(X == 1e99, X)
    return m.mean(), m.std(), m.min(), m.max()

In [65]:
for i in range(1,5):
    X_train, y_train, X_test = load_classification(i)
    mean, std, mn, mx = masked_stats(X_train)
    print(f"Dataset {i} | mean={mean:.2f}, std={std:.2f}, range=({mn:.2f}, {mx:.2f})")


Dataset 1 | mean=2.19, std=0.56, range=(1.00, 3.97)
Dataset 2 | mean=2.17, std=0.61, range=(1.30, 4.83)
Dataset 3 | mean=2.74, std=2.00, range=(0.00, 9.00)
Dataset 4 | mean=0.26, std=123.60, range=(-359.12, 356.42)


Class distribution:
- check whether the datasets are imbalanced or not

In [67]:
for i in range(1,5):
    _, y, _ = load_classification(i)
    uniques, counts = np.unique(y, return_counts=True)
    print(f"Dataset {i}:")
    for u, c in zip(uniques, counts):
        print(f"    class {u}: {c} samples")
    print()

Dataset 1:
    class 1.0: 108 samples
    class 2.0: 14 samples
    class 3.0: 11 samples
    class 4.0: 14 samples
    class 5.0: 3 samples

Dataset 2:
    class 1.0: 10 samples
    class 2.0: 8 samples
    class 3.0: 12 samples
    class 4.0: 11 samples
    class 5.0: 11 samples
    class 6.0: 10 samples
    class 7.0: 6 samples
    class 8.0: 9 samples
    class 9.0: 6 samples
    class 10.0: 9 samples
    class 11.0: 8 samples

Dataset 3:
    class 1.0: 1235 samples
    class 2.0: 554 samples
    class 3.0: 488 samples
    class 4.0: 566 samples
    class 5.0: 495 samples
    class 6.0: 777 samples
    class 7.0: 677 samples
    class 8.0: 912 samples
    class 9.0: 596 samples

Dataset 4:
    class 1.0: 288 samples
    class 2.0: 275 samples
    class 3.0: 270 samples
    class 4.0: 292 samples
    class 5.0: 278 samples
    class 6.0: 287 samples
    class 7.0: 289 samples
    class 8.0: 298 samples
    class 9.0: 270 samples



#### Conclusions:

Dataset 1:

Extremely imbalanced: 108 samples in class 1, only 3 samples in class 5.

High percentage of missing values: 9,936 missing in Train and 7,021 in Test.

Mean and standard deviation values suggest feature scaling is required.

Conclusion: 
- Prioritize robust imputation and feature scaling. Consider weighted classification or using KNN with careful tuning.

Dataset 2:

Large feature set: 9,182 features with no missing values.
Classes are more balanced compared to Dataset 1.

Conclusion: 
- Use dimensionality reduction or simple classification methods to avoid overfitting and computational issues.

Dataset 3:

Moderate missing values in Train (1,886 missing)
13 features only.

Healthy class distribution.

Conclusion: 
- Straightforward imputation and classification can be applied.

Dataset 4

No missing values, but feature distribution is problematic: very large standard deviation caused by outliers or variation in the data.

Classes are fairly balanced.

Conclusion: 
- Strong normalization or feature scaling is necessary.
