In [1]:
import torch

# A simple neuron network example with regression

In [8]:
class LinearRegression(torch.nn.Module):
    #__init__() is a special method (constructor) that Python calls when we create an object of the class.
    def __init__(self, input_dim, output_dim):
        super().__init__()
        # super() refers to the parent class of LinearRegression. In this case, the parent is torch.nn.Module, a fundamental PyTorch class for building neural networks.
        #By calling super().__init__(), we're ensuring that any necessary initialization from the parent class (torch.nn.Module) is properly executed before we add our own specific components to our LinearRegression model.
        
        
        self.linear = torch.nn.Linear(input_dim, output_dim)
        # self.linear creates an attribute named linear within our LinearRegression object. This attribute will hold our linear layer.
        #torch.nn.Linear(input_dim, output_dim) creates a PyTorch linear layer. This layer takes an input of input_dim dimensions and transforms it into an output of output_dim dimensions. It does this using a weight matrix and a bias vector, which are learned during training.
        

    def forward(self, x):
        x = self.linear(x)
        
        return self.linear(x) # This line applies the linear transformation stored in the self.linear attribute to the input x
        # Remember that self.linear is a torch.nn.Linear object, which represents a fully connected layer with weights and biases.
        # This operation performs a matrix multiplication of the input x with the weight matrix of the linear layer and then adds the bias vector. This is the core computation of a linear regression model. 
        # The method then returns the result of this linear transformation as the output of the model.
    
        # In essence, this forward() method does the following:

        # 1. Takes an input x.
        # 2. Applies a linear transformation to x using the weights and biases stored in self.linear.
        # 3. Returns the result of the linear transformation as the output. 

In [9]:
#Generate sample data
X = torch.randn(100, 1) * 10

y = 2 * X + 1 + torch.randn(100, 1)

#torch.randn(100, 1) creates a PyTorch tensor with the shape 100x1. It's filled with random numbers sampled from a standard normal distribution (mean=0, standard deviation=1).  Essentially, you get a column vector with 100 random numbers. 

In [10]:
# Instantiate the model, loss function, and optimizer
model = LinearRegression(1, 1)
criterion = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.0001)

In [11]:
#Training loop
epochs = 1000
for epoch in range(epochs):
    #Forward pass
    outputs = model(X)
    loss = criterion(outputs, y)
    
    #Backward pass and optimization
    optimizer.zero_grad()
    #If you don't reset the gradients to zero before each backward pass, the gradients from the previous iteration will be added to the gradients of the current iteration. This leads to incorrect gradient values and ultimately disrupts the optimization process.
    #optimizer.zero_grad() sets the gradients of all the model parameters to zero. By doing this before each loss.backward(), you ensure that the gradients calculated are solely based on the current batch of data. This prevents the unwanted accumulation of gradients from previous iterations.
    #Imagine you're walking down a hill. The gradient represents the slope of the hill at your current location. If you keep adding the previous slope to your current slope, you'll end up with an inaccurate sense of the direction you should move in. Resetting the slope to zero each time allows you to accurately assess the current direction of descent.
    
    loss.backward()
    #loss.backward() figures out the direction and magnitude of the parameter updates needed to reduce the loss.
    #When you call loss.backward(), PyTorch performs automatic differentiation (autograd). This process traces the computations that led to the loss value and calculates the gradients of that loss with respect to every parameter in your model that requires gradients (those with requires_grad=True).
    #These gradients are stored in the .grad attribute of each parameter. Think of it like attaching a little note to each parameter saying, "This is how much you need to change to reduce the loss."
    
    
    optimizer.step()
    #optimizer.step() actually applies those updates to the model's parameters based on the chosen optimization strategy.
    #When you call optimizer.step(), the optimizer uses the gradients (calculated by loss.backward()) to update the values of the model's parameters.
    #The specific way it updates them depends on the optimization algorithm (SGD, Adam, etc.). SGD, for example, subtracts a fraction of the gradient (determined by the learning rate) from each parameter.
    
    
    if (epoch+1) %100 == 0:
        print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, epochs, loss.item()))

# Print the learned parameters
print("Weight:", model.linear.weight.item())
print("Bias:", model.linear.bias.item())

Epoch [100/1000], Loss: 2.1888
Epoch [200/1000], Loss: 2.1794
Epoch [300/1000], Loss: 2.1701
Epoch [400/1000], Loss: 2.1609
Epoch [500/1000], Loss: 2.1517
Epoch [600/1000], Loss: 2.1426
Epoch [700/1000], Loss: 2.1335
Epoch [800/1000], Loss: 2.1246
Epoch [900/1000], Loss: 2.1156
Epoch [1000/1000], Loss: 2.1068
Weight: -1.4160994291305542
Bias: 0.15423449873924255


# Dropout prediction model example

#### This is an example scenario where we will use a simple neural network to predict if students will drop out or not, based on their engagement in a course.

In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

### Synthetic Data Generation

In [13]:
# Generate synthetic data for student dropout prediction

torch.manual_seed(0) # For reproducibility

num_students = 1000 # we will generate data for 1000 students.

# Assume three indicators (e.g., attendances, grades, engagement)
features = torch.randn(num_students, 3)

#features is a tensor of shape (1000, 3) representing the 3 indicators for each of the 1000 students.
features # assume that 1st column: attendances, 2nd col: grades, 3rd col: engagement

tensor([[-1.1258, -1.1524, -0.2506],
        [-0.4339,  0.8487,  0.6920],
        [-0.3160, -2.1152,  0.3223],
        ...,
        [ 1.2965, -0.1078,  0.7482],
        [-0.7423,  0.3447,  1.6422],
        [-0.3266,  0.3669, -0.7245]])

In [14]:
random_weights = torch.rand(3, 1)
random_bias = torch.randn(1)
dropout_probs = torch.sigmoid(features @ random_weights + random_bias)
#@ is the matrix multiplication operator in PyTorch. This operation essentially performs a weighted sum of the features for each student. The result is a tensor of shape (1000, 1)

# Generate probabilities
# It first calculates a weighted sum of the features, where the weights are random.
# It adds a random bias to this sum.
# Finally, it uses the sigmoid function to transform the result into a probability between 0 and 1.

# This approach simulates a scenario where the likelihood of a student dropping out is influenced by their attendance, grades, and engagement, but with some randomness introduced by the random weights and bias. This randomness reflects the complexity of real-world factors affecting dropout rates.

dropout_probs

tensor([[0.3934],
        [0.8380],
        [0.6049],
        [0.5903],
        [0.9330],
        [0.3462],
        [0.9281],
        [0.7110],
        [0.8458],
        [0.9561],
        [0.8171],
        [0.7878],
        [0.9014],
        [0.9040],
        [0.7990],
        [0.8750],
        [0.8411],
        [0.7515],
        [0.7980],
        [0.9930],
        [0.6402],
        [0.7630],
        [0.8622],
        [0.9804],
        [0.2938],
        [0.7643],
        [0.7979],
        [0.8414],
        [0.9374],
        [0.9593],
        [0.2072],
        [0.6979],
        [0.9356],
        [0.4188],
        [0.7911],
        [0.8584],
        [0.8779],
        [0.1949],
        [0.8667],
        [0.7791],
        [0.8624],
        [0.8508],
        [0.8906],
        [0.7249],
        [0.9685],
        [0.5687],
        [0.5421],
        [0.5770],
        [0.7755],
        [0.9440],
        [0.7642],
        [0.4786],
        [0.6913],
        [0.7910],
        [0.9042],
        [0

In [15]:
labels = (dropout_probs > 0.5).float().squeeze()  # Dropout (1) or not (0)
#squaeze  transforms the tensor from shape (1000, 1) to shape (1000). This results in a 1-dimensional tensor where each element corresponds to a student's dropout status (1.0 for dropout, 0.0 for no dropout).
labels

tensor([0., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 1.,
        1., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 0., 1., 1., 0., 1., 1., 0., 1., 1., 1., 1., 1., 1., 0., 1., 0.,
        1., 1., 0., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 0., 0., 1., 1., 1.,
        1., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 0., 1., 1.,
        1., 1., 1., 1., 0., 1., 0., 0., 1., 1., 1., 1., 1., 1., 0., 0., 1., 1.,
        1., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 1.,
        1., 1., 1., 1., 0., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 0., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 

In [16]:
# Split data into training, validation, and test sets
# We have to use a separate set of data for training, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(features, labels, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

### Defining the Neural Network

In [38]:
#Define the model with multiple hidden layers
class DropoutPredictionModel(nn.Module):
    def __init__(self, dropout_rate=0.3):
        super().__init__()
        #!!! Define three layers fc1, fc2, and fc3. The parameter values should derived from the explanations below:
        
        # Input layer: 3 indicators
        # Hidden layer 1: The values 16 and 8 are random.
        # Output layer: dropout probability
        #  <<Your code goes here>>
        self.fc1 = nn.Linear(3,16)
        self.fc2 = nn.Linear(16,8)
        self.fc3 = nn.Linear(8,1)
        self.dropout = nn.Dropout(p=dropout_rate)
        
    
    def forward(self, x):
        
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x)) # ReLU introduces a non-linear transformation by setting all negative values to zero. This simple operation allows the network to approximate non-linear functions, which are essential for modeling real-world data and relationships.
        # Vanishing gradients hinder learning: In deep networks with sigmoid or tanh activations, gradients can become very small during backpropagation. This "vanishing gradient" problem makes it difficult for the network to learn effectively.  
        # ReLU helps: ReLU doesn't suffer from vanishing gradients as much as sigmoid or tanh, especially for positive values. The gradient of ReLU is 1 for positive inputs, allowing gradients to flow more easily through the network.
        #Zeroing out negative values: ReLU introduces sparsity by setting negative activations to zero. This can make the network more computationally efficient and can also lead to better generalization by preventing overfitting.
        x = self.dropout(x)
        
        x = torch.sigmoid(self.fc3(x)) #For binary classification problems, you need an activation function in your output layer that produces a probability-like value between 0 and 1
        return x

### Training the Model

In [39]:
# Instantiate the model, loss function, and optimizer with momentum


model = DropoutPredictionModel()
model.fc = nn.Sequential(
    nn.Linear(3,16),
    nn.ReLU(),
    nn.Linear(16,8),
    nn.ReLU(),
    nn.Linear(8,1),
    nn.Dropout(p=0.3)
)
model.to("cpu")
criterion = nn.BCELoss() # Binary Cross Entropy Loss for binary classification
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

In [40]:
epochs = 100
for epoch in range(epochs):
    # !!! Switch the model to the training mode 
    model.train()
    # <<Your code goes here>>
    
    #!!! Get the predictions and store them inside outputs variable
    outputs = model(X_train)
    # <<Your code goes here>> 
    # Neural networks have different behaviors during training and evaluation (inference). model.train() sets the model to training mode
    
    #!!! Compute the loss. Remember to apply unsqueeze(1) to y_train
    loss = criterion(outputs, y_train.unsqueeze(1))
    # <<Your code goes here>>
    
    # Do the back propagation
    loss.backward() 
    # <<Your code goes here>>
    # Every time you process a batch of inputs with .backward(), you accumulate "suggestions" of where to step.
    
    #!!! Apply the optimization
    # <<Your code goes here>> 
    optimizer.step()
    # Notice that a "suggestion" is much weaker than a "decision". When you call optimizer.step(), the optimizer uses these suggestions to make actual decisions of where to actually step. The optimizer reads the suggestions and then steps in a direction that it hopes will minimize future losses.

    #!!! Reset the gradients
    # <<Your code goes here>>
    optimizer.zero_grad() 
     # Once you've completed a step, you don't really need to keep track of your previous suggestion (i.e. gradients) of where to step. By zeroing the gradients, you are throwing away this information. 

### Testing the model

In [42]:
model.eval() # Switches the mode to the evaluation. 

with torch.no_grad(): # This code creates a context within which PyTorch will not track gradients. This ensures that no gradients are calculated during validation, saving resources and preventing unintended changes to the model's parameters.
    
    #!!! Obtain the predictions on the test data: X_test and store the results into test_outputs
    # <<Your code goes here>> 
    # Produces the outputs: predictions on dropouts
    test_outputs = model(X_test)
    
    test_preds = (test_outputs > 0.5).float().squeeze() # converts the probabilities into 0 and 1
    
    #!!! Compute the accuracy by calling accuracy_score with parameters of y_test and test_preds
    # <<Your code goes here>> 
    # computes the accuracy
    test_acc = accuracy_score(y_test, test_preds)
    print(f'Test Accuracy: {test_acc:.4f}')

Test Accuracy: 0.8733
