# Performing needed imports and boilder plate code.

In [82]:
import random
import numpy as np
from tqdm.notebook import tqdm
import pandas as pd
from torch.utils.data import DataLoader

# set this variable to a number to be used as the random seed
# or to None if you don't want to set a random seed
seed = 1234

if seed is not None:
    random.seed(seed)
    np.random.seed(seed)

# Copied over data cleaning steps from our data cleaning notebook.

In [90]:
education_data = pd.read_csv('students_clean.csv')


education_data.drop('Parent_Education_Level', axis=1, inplace=True) 


education_data['Gender'] = education_data['Gender'].replace({'Male': 1, 'Female': 0}).astype(int)
education_data['Internet_Access_at_Home'] = education_data['Internet_Access_at_Home'].replace({'Yes': 1, 'No': 0}).astype(int)
education_data['Extracurricular_Activities'] = education_data['Extracurricular_Activities'].replace({'Yes': 1, 'No': 0}).astype(int)


# Low = 1, Medium = 2, High = 3
mapper = {'low': 1, 'medium': 2, 'high': 3}

education_data['Family_Income_Level'] = (
    education_data['Family_Income_Level']
      .astype(str)                  # works even if the value is already 1/2/3 or NaN
      .str.strip().str.lower()
      .map(mapper)                  # returns NaN where no mapping found
      .fillna(education_data['Family_Income_Level'])  # keepin the original numeric/blank entries
      .astype('Int64')              #  nullable integer dtype
)

labels = open('departments.txt').read().splitlines()
department_mapping = {name: index for index, name in enumerate(labels)}
department_indices = education_data['Department'].map(department_mapping)
education_data.insert(3, 'department index', department_indices)

mapper = {'A': 4, 'B': 3, 'C': 2, 'D':1,'F':0}

education_data['Grade'] = (
    education_data['Grade']
      .astype(str)              # convert everything to string
      .str.strip().str.upper()  # remove spaces and standardize to uppercase
      .map(mapper)              # map letters to numbers
)

education_data.head()

  education_data['Gender'] = education_data['Gender'].replace({'Male': 1, 'Female': 0}).astype(int)
  education_data['Internet_Access_at_Home'] = education_data['Internet_Access_at_Home'].replace({'Yes': 1, 'No': 0}).astype(int)
  education_data['Extracurricular_Activities'] = education_data['Extracurricular_Activities'].replace({'Yes': 1, 'No': 0}).astype(int)


Unnamed: 0,Gender,Age,Department,department index,Attendance (%),Midterm_Score,Final_Score,Assignments_Avg,Quizzes_Avg,Participation_Score,Projects_Score,Total_Score,Grade,Study_Hours_per_Week,Extracurricular_Activities,Internet_Access_at_Home,Family_Income_Level,Stress_Level (1-10),Sleep_Hours_per_Night
0,0,22,Mathematics,0,97.36,40.61,59.61,73.69,53.17,73.4,62.84,59.8865,0,10.3,1,0,2,1,5.9
1,1,18,Business,1,97.71,57.27,74.0,74.23,98.23,88.0,98.23,81.917,3,27.1,0,0,1,4,4.3
2,1,24,Engineering,2,99.52,41.84,63.85,85.85,50.0,4.7,91.22,67.717,1,12.4,1,0,1,9,6.1
3,0,24,Engineering,2,90.38,45.65,44.44,68.1,66.27,4.2,55.48,51.6535,0,25.5,0,1,1,8,4.9
4,0,23,CS,3,59.41,53.13,61.77,67.66,83.98,64.3,87.43,71.403,2,13.3,1,0,2,6,4.5


# Now defining our data loader and perceptron

In [91]:
from torch.utils.data import Dataset
import torch

class MyDataset(Dataset):
    def __init__(self, df, feature_cols, target_col):
        self.df = df
        self.feature_cols = feature_cols
        self.target_col = target_col

    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, index):
        row = self.df.iloc[index]
        x = torch.tensor(row[self.feature_cols].to_numpy(dtype=np.float32), dtype=torch.float32)
        y = torch.tensor(row[self.target_col], dtype=torch.long)  # long for classification
        return x, y

In [92]:
def transform_data_for_perc(dataset_df, target_col, features_lst, pos_class):
    
    # Copy over the data frame.
    dataset_df = dataset_df.copy(deep=True)
    
    # Builld the list of features to select.
    features_lst = features_lst.copy()
    features_lst.append(target_col)

    
    # Now actually select the columns in the desired order.
    dataset_df = dataset_df[features_lst]
    

    # Now what we are going to do is apply a function on the target col s.t all instances
    # where the target is the given class number are set to 1 and all others to 0.
    dataset_df[target_col] = dataset_df[target_col].apply(lambda x: 1 if x==pos_class else 0)


    return dataset_df

In [109]:
def train_perceptron(train_dl, n_features, pos_class):
    # First initialize the model.
    w = np.zeros(n_features)
    b = 0
    n_errors = 0
    weight_steps = []
    total_pos_in_train = 0

    # Adding this in for debug purposes to track the changes to the weight vectors on each
    # round.

    
    # Now loop through each batch.
    for batch_idx, (x, y) in tqdm(enumerate(train_dl), total=len(train_dl),):
        
        x_curr_np = x.numpy()
        y_curr_np = y.numpy()

        total_pos_in_train += (y_curr_np == 1).sum(axis=0)
        

        # Now perform the training/classification loop.
        scores = x_curr_np @ w + b
       
        
        y_pred = (scores > 0).astype(int)


        # Now we vectorize the update to make this more efficient.
        pred_error = y_curr_np - y_pred
        n_errors += np.sum(np.abs(pred_error) != 0) # If the pred error is zero then it is correct.

        # First append the previous weights to weight steps which will be used for debuging puprposes.
        weight_steps.append((pred_error[:,None]*x_curr_np).sum(axis=0).copy())
        
        w += (pred_error[:,None]*x_curr_np).sum(axis=0) # Re-shape pred errors to update and only add
                                                        # inccorect preds, axis=0 for rows.
        b += pred_error.sum()

        # Now print out the weights and bias updates every update if we are in debug mode.
        

    # Now once we are done training the result is the weights and biases.
    return (w,b,n_errors,weight_steps.copy(),total_pos_in_train) # I am just copying to avoid weird cases due to mutability of list.

# Create the training and testing partitions.

In [110]:
from sklearn.model_selection import train_test_split
train_df, test_df = train_test_split(education_data, train_size=0.9,random_state=42)
train_df,dev_df = train_test_split(train_df, train_size=0.8,random_state=42)

train_df.reset_index(inplace=True,drop=True)
dev_df.reset_index(inplace=True,drop=True)
test_df.reset_index(inplace=True,drop=True)


print(f'train rows: {len(train_df.index):,}')
print(f'dev rows: {len(dev_df.index):,}')
print(f'test rows: {len(test_df.index):,}')

train rows: 3,600
dev rows: 900
test rows: 500


# Now actually perform the training.

In [111]:
batch_size = 5
shuffle = True

features_lst = ['department index','Gender', 'Age', 'Attendance (%)', \
       'Midterm_Score', 'Final_Score', 'Assignments_Avg', 'Quizzes_Avg', \
       'Participation_Score', 'Projects_Score', 'Total_Score', \
       'Study_Hours_per_Week', \
       'Family_Income_Level', 'Stress_Level (1-10)', \
       'Sleep_Hours_per_Night']
num_feat = len(features_lst)


# first create a list of the weight vectors and biases for all the classes.
weight_vecs = []
bias_vecs = []
weight_vecs_hist = []

# The class indexes for all the different grade classes.
class_nums = [0,1,2,3,4]

# Now loop through each class and record it's performance on the test set.
for class_n in class_nums:
    # First use the above class to create a data loader with the appropriete current positive class labels.
    train_df_trans = transform_data_for_perc(train_df,'Grade',features_lst,class_n)
    train_ds = MyDataset(train_df_trans,features_lst,'Grade')
    train_dl = DataLoader(train_ds,batch_size=batch_size,shuffle=shuffle)

    # Now run it through the perceptron to get the weight vector and bias for the current class.
    w_curr,b_curr,error_curr,weight_hist_curr,tot_train_pos_curr = train_perceptron(train_dl,num_feat,class_n)
    weight_vecs_hist.append(weight_hist_curr)

    # Append current weights to overall weights.
    weight_vecs.append(w_curr)
    bias_vecs.append(b_curr)

    print(f"-------------------Report for class {class_n}------------------------------\n")
    print(f"The current class {class_n} saw {tot_train_pos_curr} positive examples in training")
    print(f"The current class {class_n} had {error_curr} pred errors in training, the weight vector:")
    print(w_curr)
    print(f"With bias value {b_curr}\n")

    # Test on dev.
    dev_df_trans = transform_data_for_perc(dev_df,'Grade',features_lst,class_n)
    X_dev = dev_df_trans[features_lst]

    dev_y_true = dev_df_trans['Grade'].to_numpy()
    dev_y_pred = ((X_dev_a @ w_curr + b_curr) > 0).astype(int)
    n_correct_dev = (dev_y_true==dev_y_pred).sum(axis=0)

    print(f"The number of correct preds was {n_correct_dev} for acc of {(n_correct_dev/dev_y_true.shape[0])*100}%")
    print(f"The number of pos preds was {(dev_y_pred==1).sum(axis=0)} and neg num was {(dev_y_pred==0).sum(axis=0)}")
    

  0%|          | 0/720 [00:00<?, ?it/s]

-------------------Report for class 0------------------------------

The current class 0 saw 220 positive examples in training
The current class 0 had 348 pred errors in training, the weight vector:
[  57.           22.          559.          777.67002869 -236.33995438
 -730.60004425 -148.45997238   40.76989746 -139.29999945 -593.00008774
 -421.15800858  495.69999933   61.           96.          166.599998  ]
With bias value 26

The number of correct preds was 860 for acc of 95.55555555555556%
The number of pos preds was 0 and neg num was 900


  0%|          | 0/720 [00:00<?, ?it/s]

-------------------Report for class 1------------------------------

The current class 1 saw 1260 positive examples in training
The current class 1 had 1483 pred errors in training, the weight vector:
[   62.            27.          1160.           756.71998978
  -322.37002945 -1247.40000153   -21.1399498    459.7500267
  -356.49999199 -1161.62998962  -683.71543121   636.59998703
   125.           161.           395.39999819]
With bias value 63

The number of correct preds was 576 for acc of 64.0%
The number of pos preds was 0 and neg num was 900


  0%|          | 0/720 [00:00<?, ?it/s]

-------------------Report for class 2------------------------------

The current class 2 saw 1654 positive examples in training
The current class 2 had 1711 pred errors in training, the weight vector:
[  -36.           -20.         -1017.          -939.99999237
   -53.27988815  1054.21002579  -107.15007401  -272.15006256
   -19.70002647   284.11007309   296.52091217  -406.59998035
  -123.          -226.          -301.60000277]
With bias value -45

The number of correct preds was 502 for acc of 55.77777777777778%
The number of pos preds was 191 and neg num was 709


  0%|          | 0/720 [00:00<?, ?it/s]

-------------------Report for class 3------------------------------

The current class 3 saw 454 positive examples in training
The current class 3 had 757 pred errors in training, the weight vector:
[  -99.            -5.         -1030.         -1277.13998795
   -40.71998215   728.11000824  -452.35998917  -848.72990417
    31.00004113   978.41011429   318.26541519  -977.69998837
  -103.          -293.          -282.30000019]
With bias value -51

The number of correct preds was 789 for acc of 87.66666666666667%
The number of pos preds was 1 and neg num was 899


  0%|          | 0/720 [00:00<?, ?it/s]

-------------------Report for class 4------------------------------

The current class 4 saw 12 positive examples in training
The current class 4 had 27 pred errors in training, the weight vector:
[   7.           -3.          -51.         -132.28998184    9.45998001
   67.11000443  -97.44001007  -63.8500061    41.49999523  -12.62000656
   -4.51550293  -47.30000019    3.          -18.          -23.09999943]
With bias value -3

The number of correct preds was 897 for acc of 99.66666666666667%
The number of pos preds was 0 and neg num was 900


# I am wondering now if we might have better luck splitting this into pass fail and making this into a binary classification problem.


In [116]:
# I am first gonna convert the grade col in a binary pass fail col.
educt_pfail = education_data.copy()
educt_pfail['Grade'] = educt_pfail['Grade'].apply(lambda x: 1 if x > 1 else 0)


train_df, test_df = train_test_split(educt_pfail, train_size=0.9,random_state=42)
train_df,dev_df = train_test_split(train_df, train_size=0.8,random_state=42)

train_df.reset_index(inplace=True,drop=True)
dev_df.reset_index(inplace=True,drop=True)
test_df.reset_index(inplace=True,drop=True)


# First use the above class to create a data loader with the appropriete current positive class labels.
train_ds = MyDataset(train_df,features_lst,'Grade')
train_dl = DataLoader(train_ds,batch_size=batch_size,shuffle=shuffle)

In [117]:
w_curr,b_curr,error_curr,weight_hist_curr,tot_train_pos_curr = train_perceptron(train_dl,num_feat,1)

print(f"-------------------Report for class {1}------------------------------\n")
print(f"The current class {1} saw {tot_train_pos_curr} positive examples in training")
print(f"The current class {1} had {error_curr} pred errors in training, the weight vector:")
print(w_curr)
print(f"With bias value {b_curr}\n")

# Test on dev.
dev_df_trans = transform_data_for_perc(dev_df,'Grade',features_lst,1)
X_dev_a = dev_df_trans[features_lst]

dev_y_true = dev_df_trans['Grade'].to_numpy()
dev_y_pred = ((X_dev_a @ w_curr + b_curr) > 0).astype(int)
n_correct_dev = (dev_y_true==dev_y_pred).sum(axis=0)

print(f"The number of correct preds was {n_correct_dev} for acc of {(n_correct_dev/dev_y_true.shape[0])*100}%")
print(f"The number of pos preds was {(dev_y_pred==1).sum(axis=0)} and neg num was {(dev_y_pred==0).sum(axis=0)}")

  0%|          | 0/720 [00:00<?, ?it/s]

-------------------Report for class 1------------------------------

The current class 1 saw 2120 positive examples in training
The current class 1 had 1377 pred errors in training, the weight vector:
[ -151.           -42.         -1726.         -1323.25997543
   463.39006805  1485.63002014   -83.71996689  -526.15988922
   235.10004844  1387.97010803   803.8879509   -940.00001383
  -136.          -297.          -542.400002  ]
With bias value -87

The number of correct preds was 540 for acc of 60.0%
The number of pos preds was 896 and neg num was 4


In [118]:
weight_hist_curr

[array([  6.        ,   0.        ,  61.        , 228.53000259,
        281.45000458, 204.37999344, 203.54999542, 210.8900032 ,
        186.39999676, 223.86999512, 221.41500092,  67.10000038,
          7.        ,  18.        ,  16.5       ]),
 array([  -5.        ,   -2.        ,  -60.        , -227.15000153,
        -175.27000046, -168.08000183, -195.92999649, -222.75000763,
        -152.10000134, -174.75      , -180.00500107,  -70.69999886,
          -7.        ,  -20.        ,  -20.30000019]),
 array([  -3.        ,   -1.        ,  -37.        , -182.62999725,
        -153.76000214, -106.66999817, -114.82999802, -139.66000366,
        -106.5       , -143.45000076, -129.2820015 ,  -39.70000076,
          -4.        ,  -10.        ,  -14.69999981]),
 array([  6.        ,   2.        ,  59.        , 249.91999817,
        251.33999634, 237.35000229, 222.91999435, 260.77000427,
        247.29999542, 198.32999802, 228.41749573,  41.60000038,
          3.        ,  14.        ,  19.5     

In [119]:
print((X_dev_a @ w_curr + b_curr))

0       99753.589225
1      127697.088412
2       99136.180293
3       149733.28236
4       67268.134839
           ...      
895    149504.422384
896    106447.816225
897    150520.157618
898    119111.906332
899    128435.458665
Length: 900, dtype: object
