In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [9]:
# Reads data
features = pd.read_csv("data/features.csv")
target = pd.read_csv("data/target.csv")

In [10]:
# Bins target values
from sklearn.preprocessing import KBinsDiscretizer

discretizer = KBinsDiscretizer(n_bins=4, encode = "onehot-dense", strategy = "quantile")

target_discretized = discretizer.fit_transform(target["G3"].values.reshape(-1, 1))
target_new = []
for row in target_discretized:
    target_new.append(list(row).index(1))

In [11]:
# Splits data into training, testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state = 1001)

In [12]:
# Gets final grade for target
t_train = y_train["G3"]
t_test = y_test["G3"]

In [13]:
# Imports Ridge and accuracy metrics
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# Performs Ridge Regression
model = Ridge().fit(X=X_train, y=t_train)

# Measures accuracy
print("R-squared value for training set: ", r2_score(t_train, model.predict(X_train)))
print("R-squared value for testing set: ", r2_score(t_test, model.predict(X_test)))

R-squared value for training set:  0.34042660318538653
R-squared value for testing set:  0.22771777942292692


In [14]:
# Splits data for classification into training, testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target_new, random_state = 1001)

In [43]:
# Imports SVC and accuracy metrics
from sklearn.svm import SVC
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# Recommended using kernel

# Performs SVC
model = SVC(gamma = "scale").fit(X=X_train, y=y_train)

# Measures accuracy
accuracy_train = model.score(X_train, y_train)
accuracy_test = model.score(X_test, y_test)
print("Prediction accuracy on the train data:", f"{accuracy_train:.2%}")
print("Prediction accuracy on the test data:", f"{accuracy_test:.2%}")

Prediction accuracy on the train data: 37.42%
Prediction accuracy on the test data: 33.72%


In [15]:
# Imports Decision Tree and accuracy metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# Performs Decision Tree Classifier
model = DecisionTreeClassifier().fit(X=X_train, y=y_train)

# Recommended using kernel -> we want to be able to model nonlinear things!
# Want to use feature expansion to classify them
# To choose the kernel: brute force/ try out a bunch of different ones (look at data and see if there's anything interesting you can pick up on, for example see if there's any 2 or more things that you can take the product of and it seems interesting)
# kernel: considering n features at the same time

# Measures accuracy
accuracy_train = model.score(X_train, y_train)
accuracy_test = model.score(X_test, y_test)
print("Prediction accuracy on the train data:", f"{accuracy_train:.2%}")
print("Prediction accuracy on the test data:", f"{accuracy_test:.2%}")

Prediction accuracy on the train data: 100.00%
Prediction accuracy on the test data: 38.70%


In [6]:
import torch

class NeuralNetwork(torch.nn.Module):
    def __init__(self, inputDim, outputDim, layerDim):
        super(NeuralNetwork, self).__init__()
        self.inputDim = inputDim
        self.outputDim = outputDim
        self.layerDim = layerDim


        self.l1 = torch.nn.Linear(self.inputDim, self.layerDim)
        self.relu = torch.nn.ReLU()
        self.l2 = torch.nn.Linear(self.layerDim, self.outputDim)
        self.relu = torch.nn.ReLU()

        
    def forward(self, x):
        hidden = self.l1(x)
        relu = self.relu(hidden)
        output = self.l2(relu)
        # output = self.relu(output)
        return output



In [78]:
X_test.iloc[3:6].values.shape

(3, 96)

In [74]:
model = NeuralNetwork(96, 20, 48)

In [88]:
torch.tensor(t_train.values[0]).reshape(-1).shape

torch.Size([1])

In [79]:
x_test_tensor = torch.FloatTensor(X_test.iloc[3:5].values)
y_pred = model(x_test_tensor)
y_pred.shape

torch.Size([2, 20])

In [96]:
# Now have to loop through all my data points to optimize my model - for loop looping through groups of (5) data points at a time using indexing

criterion = torch.nn.CrossEntropyLoss()
loss = criterion(y_pred, torch.tensor(t_train.values[0:2]).reshape(-1))

# model.eval()
# x_test_tensor = torch.FloatTensor(X_test.iloc[3].values[1:])
# y_pred = model(x_test_tensor)
# before_train = criterion(y_pred.squeeze(), torch.FloatTensor(t_test.values[0]))
# print('Test loss before training' , before_train.item())

# training mode vs. evaluation mode
# different architecture @ training/eval times
# model.train()
# y_pred = model(torch.FloatTensor(X_train.iloc[3].values[1:]))
# y_pred

learning_rate = 1e-4
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# 2 inputs, first is model.parameters() - includes all parameters in the model you want to optimize, make sure model.parameters() includes everything you want to optimize

for t in range(500):
    # Forward pass: compute predicted y by passing x to the model.
    x_test_tensor = torch.FloatTensor(X_test.iloc[3:5].values)
    y_pred = model(x_test_tensor)

    # Compute and print loss.
    loss = criterion(y_pred, torch.tensor(t_train.values[0:2]).reshape(-1))
    if t % 100 == 99:
        print(t, loss.item())

    # Before the backward pass, use the optimizer object to zero all of the
    # gradients for the variables it will update (which are the learnable
    # weights of the model). This is because by default, gradients are
    # accumulated in buffers( i.e, not overwritten) whenever .backward()
    # is called. Checkout docs of torch.autograd.backward for more details.
    optimizer.zero_grad()

    # Backward pass: compute gradient of the loss with respect to model
    # parameters
    loss.backward()

    # Calling the step function on an Optimizer makes an update to its
    # parameters
    optimizer.step()

99 1.6315577030181885
199 1.5646674633026123
299 1.5326740741729736
399 1.5194493532180786
499 1.5106877088546753


In [65]:
t_train.values

array([16, 15,  5,  8, 15, 11,  7, 10, 14, 16,  9,  8,  5, 13,  8, 10, 11,
       13, 12, 16, 14,  7, 15, 11,  8, 11, 11, 10, 12, 13,  9, 10, 12, 10,
       12, 17,  9, 15, 16, 11, 16, 12, 16, 16, 16, 10, 13, 13, 13, 11, 11,
       18, 16, 17,  8, 13,  8, 18, 16, 18, 16, 13, 12, 16, 10, 11, 10, 10,
       12,  6, 16, 18,  8, 16, 13,  9, 12,  8, 11, 12, 11,  6,  9, 14,  0,
       11,  9, 11, 10, 16, 10,  6, 12, 13, 10, 14, 10,  8,  6,  8, 15, 13,
       10, 10, 12, 10, 11, 11, 11, 10, 10,  9, 15, 11, 19, 14,  0, 12, 13,
       12, 10, 10, 12,  9,  9, 14, 11, 10, 10, 14, 11, 14, 11,  0, 14, 13,
       14,  8, 12,  0,  7, 10, 10,  0, 12, 15, 12, 10, 14, 17, 13, 12,  6,
       10, 15, 15,  8,  9, 11, 13, 11, 13, 13, 13, 10, 10, 11, 10, 12, 11,
        8,  8,  0, 11,  7, 13, 12,  0, 11, 16, 13,  9,  8,  0, 18, 14, 14,
       14, 11, 19, 11, 12, 10, 14, 11, 11, 11,  8,  9,  5, 11, 15, 10, 12,
       10, 11, 16, 10, 10, 13, 14, 11, 14, 14, 15, 17, 10, 18, 17, 12, 12,
        8, 10, 11, 11, 13