# Performing needed imports and boilder plate code.

In [11]:
import random
import numpy as np
from tqdm.notebook import tqdm
import pandas as pd
from torch.utils.data import DataLoader

# set this variable to a number to be used as the random seed
# or to None if you don't want to set a random seed
seed = 1234

if seed is not None:
    random.seed(seed)
    np.random.seed(seed)

# Copied over data cleaning steps from our data cleaning notebook.

In [5]:
education_data = pd.read_csv('students_clean.csv')


education_data.drop('Parent_Education_Level', axis=1, inplace=True) 


education_data['Gender'] = education_data['Gender'].replace({'Male': 1, 'Female': 0}).astype(int)
education_data['Internet_Access_at_Home'] = education_data['Internet_Access_at_Home'].replace({'Yes': 1, 'No': 0}).astype(int)
education_data['Extracurricular_Activities'] = education_data['Extracurricular_Activities'].replace({'Yes': 1, 'No': 0}).astype(int)


# Low = 1, Medium = 2, High = 3
mapper = {'low': 1, 'medium': 2, 'high': 3}

education_data['Family_Income_Level'] = (
    education_data['Family_Income_Level']
      .astype(str)                  # works even if the value is already 1/2/3 or NaN
      .str.strip().str.lower()
      .map(mapper)                  # returns NaN where no mapping found
      .fillna(education_data['Family_Income_Level'])  # keepin the original numeric/blank entries
      .astype('Int64')              #  nullable integer dtype
)

labels = open('departments.txt').read().splitlines()
department_mapping = {name: index for index, name in enumerate(labels)}
department_indices = education_data['Department'].map(department_mapping)
education_data.insert(3, 'department index', department_indices)

mapper = {'A': 0, 'B': 1, 'C': 2, 'D':3,'F':4}

education_data['Grade'] = (
    education_data['Grade']
      .astype(str)              # convert everything to string
      .str.strip().str.upper()  # remove spaces and standardize to uppercase
      .map(mapper)              # map letters to numbers
)

education_data.head()

  education_data['Gender'] = education_data['Gender'].replace({'Male': 1, 'Female': 0}).astype(int)
  education_data['Internet_Access_at_Home'] = education_data['Internet_Access_at_Home'].replace({'Yes': 1, 'No': 0}).astype(int)
  education_data['Extracurricular_Activities'] = education_data['Extracurricular_Activities'].replace({'Yes': 1, 'No': 0}).astype(int)


Unnamed: 0,Gender,Age,Department,department index,Attendance (%),Midterm_Score,Final_Score,Assignments_Avg,Quizzes_Avg,Participation_Score,Projects_Score,Total_Score,Grade,Study_Hours_per_Week,Extracurricular_Activities,Internet_Access_at_Home,Family_Income_Level,Stress_Level (1-10),Sleep_Hours_per_Night
0,0,22,Mathematics,0,97.36,40.61,59.61,73.69,53.17,73.4,62.84,59.8865,4,10.3,1,0,2,1,5.9
1,1,18,Business,1,97.71,57.27,74.0,74.23,98.23,88.0,98.23,81.917,1,27.1,0,0,1,4,4.3
2,1,24,Engineering,2,99.52,41.84,63.85,85.85,50.0,4.7,91.22,67.717,3,12.4,1,0,1,9,6.1
3,0,24,Engineering,2,90.38,45.65,44.44,68.1,66.27,4.2,55.48,51.6535,4,25.5,0,1,1,8,4.9
4,0,23,CS,3,59.41,53.13,61.77,67.66,83.98,64.3,87.43,71.403,2,13.3,1,0,2,6,4.5


# Now defining our data loader and perceptron

In [6]:
from torch.utils.data import Dataset
import torch

class MyDataset(Dataset):
    def __init__(self, df, feature_cols, target_col):
        self.df = df
        self.feature_cols = feature_cols
        self.target_col = target_col

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        row = self.df.iloc[index]
        x = torch.tensor(row[self.feature_cols].to_numpy(dtype=np.float32), dtype=torch.float32)
        y = torch.tensor(row[self.target_col], dtype=torch.long)  # long for classification
        return x, y

In [7]:
def transform_data_for_perc(dataset_df, target_col, features_lst, pos_class):
    
    # Copy over the data frame.
    dataset_df = dataset_df.copy(deep=True)
    
    # Builld the list of features to select.
    features_lst = features_lst.copy()
    features_lst.append(target_col)

    
    # Now actually select the columns in the desired order.
    dataset_df = dataset_df[features_lst]
    

    # Now what we are going to do is apply a function on the target col s.t all instances
    # where the target is the given class number are set to 1 and all others to 0.
    dataset_df[target_col] = dataset_df[target_col].apply(lambda x: 1 if x==pos_class else 0)


    return dataset_df

In [13]:
def train_perceptron(train_dl, n_features, pos_class):
    # First initialize the model.
    w = np.zeros(n_features)
    b = 0
    n_errors = 0

    # Now loop through each batch.
    for batch_idx, (x, y) in tqdm(enumerate(train_dl), total=len(train_dl)):
        #print(x)
        x_curr_np = x.numpy()
        y_curr_np = y.numpy()

        # Now perform the training/classification loop.
        scores = x_curr_np @ w + b
        #print(type(score))

        y_pred = (scores > 0).astype(int)

        # My hopethisis is for the A class barley any positives are being predicted.
        #if y_pred.sum(axis=0) > 0:
            #print(f"For current batch predicted {y_pred.sum(axis=0)} positives")

        # Now we vectorize the update to make this more efficient.
        pred_error = y_curr_np - y_pred
        n_errors += np.sum(pred_error != 0) # If the pred error is zero then it is correct.

        w += (pred_error[:,None]*x_curr_np).sum(axis=0) # Re-shape pred errors to update and only add
                                                        # inccorect preds, axis=0 for rows.
        b += pred_error.sum()
        

    # Now once we are done training the result is the weights and biases.
    return (w,b,n_errors)

# Create the training and testing partitions.

In [9]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(education_data, train_size=0.8)
train_df,dev_df = train_test_split(train_df, train_size=0.8)

train_df.reset_index(inplace=True,drop=True)
dev_df.reset_index(inplace=True,drop=True)
test_df.reset_index(inplace=True,drop=True)


print(f'train rows: {len(train_df.index):,}')
print(f'dev rows: {len(dev_df.index):,}')
print(f'test rows: {len(test_df.index):,}')

train rows: 3,200
dev rows: 800
test rows: 1,000


# Now actually perform the training.

In [16]:
batch_size = 5
shuffle = True

features_lst = ['department index','Gender', 'Age', 'Attendance (%)', \
       'Midterm_Score', 'Final_Score', 'Assignments_Avg', 'Quizzes_Avg', \
       'Participation_Score', 'Projects_Score', 'Total_Score', \
       'Study_Hours_per_Week', \
       'Family_Income_Level', 'Stress_Level (1-10)', \
       'Sleep_Hours_per_Night']
num_feat = len(features_lst)


# first create a list of the weight vectors and biases for all the classes.
weight_vecs = []
bias_vecs = []

# The class indexes for all the different grade classes.
class_nums = [0,1,2,3,4]

# Now loop through each class and record it's performance on the test set.
for class_n in class_nums:
    # First use the above class to create a data loader with the appropriete current positive class labels.
    train_df = transform_data_for_perc(train_df,'Grade',features_lst,class_n)
    train_ds = MyDataset( train_df,features_lst,'Grade')
    train_dl = DataLoader(train_ds,batch_size=batch_size,shuffle=shuffle)

    # Now run it through the perceptron to get the weight vector and bias for the current class.
    w_curr,b_curr,error_curr = train_perceptron(train_dl,num_feat,class_n)

    # Append current weights to overall weights.
    weight_vecs.append(w_curr)
    bias_vecs.append(b_curr)

    print(f"-------------------Report for class {class_n}------------------------------\n")
    print(f"The current class {class_n} had {error_curr} pred errors in training, the weight vector:\n")
    print(w_curr)
    print(f"With bias value {b_curr}")

    # Test on dev.
    dev_df_a = transform_data_for_perc(dev_df,'Grade',features_lst,class_n)
    X_dev_a = dev_df_a[features_lst]

    dev_y_true = dev_df_a['Grade'].to_numpy()
    dev_y_pred = ((X_dev_a @ w_curr + b_curr) > 0).astype(int)
    n_correct_dev = (dev_y_true==dev_y_pred).sum(axis=0)

    print(f"The number of correct preds was {n_correct_dev} for acc of {(n_correct_dev/dev_y_true.shape[0])*100}%")
    print(f"The number of pos preds was {(dev_y_pred==1).sum(axis=0)} and neg num was {(dev_y_pred==0).sum(axis=0)}")
    

  0%|          | 0/640 [00:00<?, ?it/s]

-------------------Report for class 0------------------------------

The current class 0 had 26 pred errors in training, the weight vector:

[ -14.           -4.          -77.         -318.439991      6.41999054
   85.          -79.04001999 -183.89999771   30.1999836   -93.43998718
  -34.55500031  -81.60000086   -5.          -29.          -19.19999838]
With bias value -4
The number of correct preds was 796 for acc of 99.5%
The number of pos preds was 0 and neg num was 800


  0%|          | 0/640 [00:00<?, ?it/s]

-------------------Report for class 1------------------------------

The current class 1 had 26 pred errors in training, the weight vector:

[  -2.           -2.          -57.         -136.439991    -11.19001007
 -115.56999207 -131.81001282 -192.56997681  181.60000229 -104.88999939
  -91.98649597  -32.70000219   -2.          -15.          -16.19999886]
With bias value -4
The number of correct preds was 705 for acc of 88.125%
The number of pos preds was 0 and neg num was 800


  0%|          | 0/640 [00:00<?, ?it/s]

-------------------Report for class 2------------------------------

The current class 2 had 0 pred errors in training, the weight vector:

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
With bias value 0
The number of correct preds was 437 for acc of 54.625%
The number of pos preds was 0 and neg num was 800


  0%|          | 0/640 [00:00<?, ?it/s]

-------------------Report for class 3------------------------------

The current class 3 had 0 pred errors in training, the weight vector:

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
With bias value 0
The number of correct preds was 502 for acc of 62.74999999999999%
The number of pos preds was 0 and neg num was 800


  0%|          | 0/640 [00:00<?, ?it/s]

-------------------Report for class 4------------------------------

The current class 4 had 0 pred errors in training, the weight vector:

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
With bias value 0
The number of correct preds was 760 for acc of 95.0%
The number of pos preds was 0 and neg num was 800
