### Data Preprocessing (See other Notebook for more details)

In [1]:
import os
import numpy as np
import pandas as pd
idx=pd.IndexSlice
import time
import random
import matplotlib
#%matplotlib notebook
import matplotlib.pyplot as plt
import scipy.stats
#from pandas.plotting import autocorrelation_plot
import matplotlib.offsetbox as offsetbox
from matplotlib.ticker import StrMethodFormatter
import sklearn.linear_model

In [None]:
#for some reason, this needs to be in a separate cell
params={
    "font.size":15,
    "lines.linewidth":5
}
plt.rcParams.update(params)

In [None]:
### Do all the cleaning and preprocessing here (Copied from the other notebook)
### SKIP THIS IF YOU ALREADY HAVE THE PICKLED X AND Y FILES

# Read in working_dataset.pkl
data = pd.read_pickle("working_dataset.pkl")

# Label Encoding Term and dropping YearTerm
from sklearn import preprocessing
  
# Label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()
df=data

# Encode labels in column 'species'.
df['Term']= label_encoder.fit_transform(df['Term'])
df = df.drop('YearTerm', axis=1)

#Creatinig a New Column CourseCode by combining Subject, Number and dropping the Course Title, Subject and Number Columns
df["CourseCode"] = (data["Subject"] + " " + df["Number"].astype('str'))
df.drop(['Course Title','Subject','Number'],axis=1,inplace=True)

## Need to Convert the Grades to a Overall Gpa Value
# First we will Drop the 'W' Column and define the Grades as well as their corresponding Weightage
df.drop(['W'],axis=1,inplace=True)
grades = ['A+', 'A', 'A-', 'B+', 'B', 'B-', 'C+', 'C', 'C-', 'D+', 'D', 'D-', 'F']
WEIGHT = [4.00, 4.00, 3.67, 3.33, 3, 2.67, 2.33, 2, 1.67, 1.33, 1, 0.67, 0]
df['GPA'] = df[grades].mul(WEIGHT).sum(1)/df[grades].sum(axis=1)

# Drop the Grades Column since we do not really need it
df_new=df.drop(grades,axis=1)

# Dropping all the rows with Null values
df_notna = df_new.dropna()

# Group the data
df_final = df_notna.groupby(['Year','Term','Sched Type','Primary Instructor', 'CourseCode'])['GPA'].mean().reset_index()

# Get one hot encoding of columns Sched Type
one_hot_encoded_data = pd.get_dummies(df_final, columns = ['Sched Type'])

# Remove redundant date columns
one_hot_encoded_data['Year'] = one_hot_encoded_data['Year'].apply(lambda x: int(str(x)[-2:]))

def calc_smooth_mean(df, by, on, m):
    # Compute the global mean
    mean = df[on].mean()

    # Compute the number of values and the mean of each group
    agg = df.groupby(by)[on].agg(['count', 'mean'])
    counts = agg['count']
    means = agg['mean']

    # Compute the "smoothed" means
    smooth = (counts * means + m * mean) / (counts + m)

    # Replace each value by the according smoothed mean
    return df[by].map(smooth)

one_hot_encoded_data['CourseCode'] = calc_smooth_mean(one_hot_encoded_data, by='CourseCode', on='GPA', m=10)
one_hot_encoded_data['Primary Instructor'] = calc_smooth_mean(one_hot_encoded_data, by='Primary Instructor', on='GPA', m=10)

Y = one_hot_encoded_data['GPA'].round(2)
X = one_hot_encoded_data.drop('GPA',axis=1)

# Save X and Y to a pickle file
X.to_pickle("X_Final.pkl")
Y.to_pickle("Y_Final.pkl")

### Load Pickled Preprocessed X Y Data

In [None]:
import pandas as pd

In [2]:
# Load preprocessed X and Y data
X = pd.read_pickle("X_Final.pkl")
Y = pd.read_pickle("Y_Final.pkl")

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=104, shuffle=True)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=104, shuffle=True) # 0.25 x 0.8 = 0.2

print("Training set size:", str(len(X_train)), "\n", "Test set size:", str(len(X_test)), "\n", "Validation set size:", str(len(X_val)))

Training set size: 23188 
 Test set size: 7730 
 Validation set size: 7730


### Get Linear Regression Baseline

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test,y_train, y_test = train_test_split(X,Y ,
                                   random_state=104, 
                                   test_size=0.25, 
                                   shuffle=True)

from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_test)

from sklearn.metrics import mean_absolute_error, mean_squared_error
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'Mean absolute error: {mae:.2f}')
print(f'Mean squared error: {mse:.2f}')
print(f'Root mean squared error: {rmse:.2f}')

Mean absolute error: 0.18
Mean squared error: 0.05
Root mean squared error: 0.23


# Building the Neural Networks

## Imports and Dataset Class

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import average_precision_score, roc_auc_score

print(torch.cuda.is_available())
if torch.cuda.is_available():
    torch.cuda.set_device(0)
    torch.cuda.manual_seed_all(42)

True


In [None]:
class GPADataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X.values, dtype=torch.float32)
        self.y = torch.tensor(y.values, dtype=torch.float32)
        self.len = self.X.shape[0]

    def __getitem__(self, index):
        return self.X[index], self.y[index]

    def __len__(self):
        return self.len

## Build a Shallow NN (2 Hidden Layers)

In [None]:
class ShallowNetwork():
    def __init__(self, input_size, hidden1_size, hidden2_size, output_size, learning_rate, optimizer):
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.input_size = input_size
        self.hidden1_size = hidden1_size
        self.hidden2_size = hidden2_size
        self.output_size = output_size
        self.learning_rate = learning_rate

        self.model = nn.Sequential(
            nn.Linear(self.input_size, self.hidden1_size),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(self.hidden1_size, self.hidden2_size),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(self.hidden2_size, self.output_size)
        )
        self.model.to(self.device)

        self.loss_function = nn.MSELoss()
        if optimizer == "Adam":
          self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate, weight_decay=1e-4)
        elif optimizer == "SGD":
          self.optimizer = torch.optim.SGD(self.model.parameters(), lr = self.learning_rate, weight_decay=1e-4)
        elif optimizer == "RMSprop":
          self.optimizer = torch.optim.RMSprop(self.model.parameters(), lr=self.learning_rate, weight_decay=1e-4)

    def train(self, X_train, y_train, X_val, y_val, epochs, batch_size):
        self.model.train()
        train_dataset = GPADataset(X_train, y_train)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
        val_dataset = GPADataset(X_val, y_val)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        train_loss = []
        val_loss = []
        num_batches = len(X_train)/batch_size
        for epoch in range(epochs):
            for i, (X, y) in enumerate(train_loader):
                X = X.to(self.device)
                y = y.to(self.device)
                self.optimizer.zero_grad()
                y_pred = self.model(X)
                y_pred = y_pred.squeeze(-1)
                y = y.squeeze(-1)
                loss = self.loss_function(y_pred, y)
                loss.backward()
                self.optimizer.step()
                train_loss.append(loss.item())

            # Validation
            with torch.no_grad():
                for i, (X, y) in enumerate(val_loader):
                    X = X.to(self.device)
                    y = y.to(self.device)
                    y_pred = self.model(X)
                    y_pred = y_pred.squeeze(-1)
                    y = y.squeeze(-1)
                    loss = self.loss_function(y_pred, y)
                    val_loss.append(loss.item())

            # Only print every 10 epochs
            if epoch % 10 == 0:
                print(f"{epoch}/{epochs} - Training Loss: {train_loss[-1]:.4f} Validation Loss: {val_loss[-1]:.4f}")

        return train_loss, val_loss

    def predict(self, X_test):
        self.model.eval()
        with torch.no_grad():
            X_test = torch.tensor(X_test.values, dtype=torch.float32).to(self.device)
            output = self.model(X_test)
        return output.cpu().numpy()

    def evaluate(self, X_test, y_test):
        self.model.eval()
        with torch.no_grad():
            X_test = torch.tensor(X_test.values, dtype=torch.float32).to(self.device)
            output = self.model(X_test)
            loss = self.loss_function(output, y_test)
        return loss

    def save(self, path):
        torch.save(self.model.state_dict(), path)

    def load(self, path):
        self.model.load_state_dict(torch.load(path))

## Build a Deep NN (6 Hidden Layers)

In [None]:
class DeepNetwork():
    def __init__(self, input_size, hidden1_size, hidden2_size, hidden3_size, hidden4_size, hidden5_size, hidden6_size, output_size, learning_rate, optimizer):
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.input_size = input_size
        self.hidden1_size = hidden1_size
        self.hidden2_size = hidden2_size
        self.hidden3_size = hidden3_size
        self.hidden4_size = hidden4_size
        self.hidden5_size = hidden5_size
        self.hidden6_size = hidden6_size
        self.output_size = output_size
        self.learning_rate = learning_rate

        self.model = nn.Sequential(
            nn.Linear(self.input_size, self.hidden1_size),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(self.hidden1_size, self.hidden2_size),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(self.hidden2_size, self.hidden3_size),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(self.hidden3_size, self.hidden4_size),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(self.hidden4_size, self.hidden5_size),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(self.hidden5_size, self.hidden6_size),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(self.hidden6_size, self.output_size)
        )
        self.model.to(self.device)

        self.loss_function = nn.MSELoss()
        if optimizer == "Adam":
          self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate, weight_decay=1e-4)
        elif optimizer == "SGD":
          self.optimizer = torch.optim.SGD(self.model.parameters(), lr = self.learning_rate, weight_decay=1e-4)
        elif optimizer == "RMSprop":
          self.optimizer = torch.optim.RMSprop(self.model.parameters(), lr=self.learning_rate, weight_decay=1e-4)

    def train(self, X_train, y_train, X_val, y_val, epochs, batch_size):
        self.model.train()
        train_dataset = GPADataset(X_train, y_train)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
        val_dataset = GPADataset(X_val, y_val)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

        train_loss = []
        val_loss = []
        num_batches = len(X_train)/batch_size
        for epoch in range(epochs):
            for i, (X, y) in enumerate(train_loader):
                X = X.to(self.device)
                y = y.to(self.device)
                self.optimizer.zero_grad()
                y_pred = self.model(X)
                y_pred = y_pred.squeeze(-1)
                y = y.squeeze(-1)
                loss = self.loss_function(y_pred, y)
                loss.backward()
                self.optimizer.step()
                train_loss.append(loss.item())

            # Validation
            with torch.no_grad():
                for i, (X, y) in enumerate(val_loader):
                    X = X.to(self.device)
                    y = y.to(self.device)
                    y_pred = self.model(X)
                    y_pred = y_pred.squeeze(-1)
                    y = y.squeeze(-1)
                    loss = self.loss_function(y_pred, y)
                    val_loss.append(loss.item())

            # Only print every 10 epochs
            if epoch % 10 == 0:
                print(f"{epoch}/{epochs} - Training Loss: {train_loss[-1]:.4f} Validation Loss: {val_loss[-1]:.4f}")

        return train_loss, val_loss

    def predict(self, X_test):
        self.model.eval()
        with torch.no_grad():
            X_test = torch.tensor(X_test.values, dtype=torch.float32).to(self.device)
            output = self.model(X_test)
        return output.cpu().numpy()

    def evaluate(self, X_test, y_test):
        self.model.eval()
        with torch.no_grad():
            X_test = torch.tensor(X_test.values, dtype=torch.float32).to(self.device)
            output = self.model(X_test)
            loss = self.loss_function(output, y_test)
        return loss

    def save(self, path):
        torch.save(self.model.state_dict(), path)

    def load(self, path):
        self.model.load_state_dict(torch.load(path))

# Construct and Train

In [None]:
### Set parameters for the modeltype, optimizer, learning rate, batch size, and epochs
#opt_types = ["Adam", "SGD", "RMSprop"]
mtype, opt, lr, bs, e = "SN", "Adam", .05, 1000, 100

In [None]:
### Change the variables in the cell above, do not manually change the code below ###
if (mtype == "SN"):
    net = ShallowNetwork(X_train.shape[1], 8, 4, 1, lr, opt)
elif (mtype == "DN"):
    net = DeepNetwork(X_train.shape[1], 32, 64, 128, 64, 32, 16, 1, lr, opt)


In [None]:
# Check if the model has been trained before
force_train = False
if (os.path.exists(f"./models/{mtype}_{opt}_lr{lr}_bs{bs}_e{e}.pth") and not force_train):
    print("Trained model with same parameters found, loading...")
    net.load(f"./models/{mtype}_{opt}_lr{lr}_bs{bs}_e{e}.pth")
else:
    print("Model not found, training...")
    train_losses, val_losses = net.train(X_train, y_train, X_val, y_val, e, bs)
    net.save(f"./models/{mtype}_{opt}_lr{lr}_bs{bs}_e{e}.pth")
    print(f"Model Saved: ./models/{mtype}_{opt}_lr{lr}_bs{bs}_e{e}.pth")


Model not found, training...
0/100 - Training Loss: 11.3098 Validation Loss: 11.1158
10/100 - Training Loss: 3.0167 Validation Loss: 2.8824
20/100 - Training Loss: 1.5566 Validation Loss: 1.3666
30/100 - Training Loss: 0.9735 Validation Loss: 0.8731
40/100 - Training Loss: 0.6191 Validation Loss: 0.5157
50/100 - Training Loss: 0.3594 Validation Loss: 0.3276
60/100 - Training Loss: 0.2127 Validation Loss: 0.2280
70/100 - Training Loss: 0.1936 Validation Loss: 0.1790
80/100 - Training Loss: 0.1632 Validation Loss: 0.1549
90/100 - Training Loss: 0.1546 Validation Loss: 0.1474
Model Saved: ./models/SN_Adam_lr0.005_bs1000_e100.pth


In [None]:
def Get_Scores(net, X_test, y_test):
    pred = net.predict(X_test)
    mae = np.mean(np.abs(pred - y_test.values))
    print(f"Mean Absolute Error: {mae}")

    # Get mean squared error on the validation data
    mse = np.mean((pred - y_test.values)**2)
    print(f"Mean Squared Error: {mse}")

    # get root mean squared error on the validation data
    rmse = np.sqrt(mse)
    print(f"Root Mean Squared Error: {rmse}")

    return mae, mse, rmse


In [None]:
mae, mse, rmse = Get_Scores(net, X_test, y_test)

Mean Absolute Error: 0.31466599384269534
Mean Squared Error: 0.1522082006778982
Root Mean Squared Error: 0.39013869415619135


In [None]:
### Save the model parameters and scores to a csv file for later comparison
with open("model_scores.csv", "a") as f:
    f.write(f"{mtype},{opt},{lr},{bs},{e},{mae:.4f},{mse:.4f},{rmse:.4f}\n")