# Setup

The following cells are involved in all sorts of setup. 

### Import Statements

The cell below imports a number of libraries / functions that're essential for the remainder of the notebook.

In [13]:
# Some import statements
import torch, json, random, time
import numpy as np
import matplotlib.pyplot as plt
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data.sampler import SubsetRandomSampler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
import copy

# Setting up Pytorch's use of CUDA; if your computer isn't CUDA-enabled, 
# you could replace "cuda" with "cpu"
device = torch.device("cuda")

### Loading Data

The following cell will load both kinds of account embeddings (network & text) into a Dataset.

In [29]:
# This class will be used to load in the account data
class AccountDataset(torch.utils.data.Dataset):
    
    # The __init__ method defines how we'll input the data
    def __init__(self, networkEmbPath, textEmbPath):  
        
        # First, open the network embedding file and extract the embeddings
        networkEmbDict = {}
        with open(networkEmbPath, "r") as networkEmbFile:
            networkEmbDict = json.load(networkEmbFile)
                
        # Next, open the text embedding file, and extract the embeddings
        textEmbDict = {}
        with open(textEmbPath, "r") as textEmbFile:
            textEmbDict = json.load(textEmbFile)
                    
        # Open the "Account Labels.json" file to indicate which accounts
        # are Friends / Blocked
        account_type_mapping = {"blocked": "Blocked", "not-blocked": "Friends"}
        acctTypeDict = {}
        with open("../Data/Account Labels.json", "r") as acctTypeJSON:
            account_types = json.load(acctTypeJSON)
            for account_type, account_list in account_types.items():
                for account in account_list:
                    acctTypeDict[str(account)] = account_type_mapping[account_type]
                
        # Join together the embeddings of the Friends / Blocked accounts
        # that have both a text embedding and a social network embedding
        accountDict = {}
        blockCt = 0
        friendCt = 0
        for acct, acctType in acctTypeDict.items():
            inBoth = (acct in networkEmbDict and acct in textEmbDict)
            if (acctType != "Adjacent" and inBoth):
                networkEmb = torch.tensor(networkEmbDict[acct], dtype=torch.float32, device=device)
                textEmb = torch.tensor(textEmbDict[acct], dtype=torch.float32, device=device)
                blocked = 0
                if (acctType == "Blocked"):
                    blocked = 1
                    blockCt += 1
                else:
                    friendCt += 1
                blocked = torch.tensor(blocked, dtype=torch.float32, device=device)
                accountDict[acct] = {"networkEmb": networkEmb, "textEmb": textEmb, "blocked": blocked}
        
        # Now, create a list version of the accountDict, and shuffle it
        shuffledAccountDict = list(accountDict.keys())
        np.random.shuffle(shuffledAccountDict)
        self.accountList = []
        for userID in shuffledAccountDict:
            account = accountDict[userID]
            account["userID"] = userID
            self.accountList.append(account)
            
        # Equalize the number of examples in the dataset
        # print("There are %d friends and %d blocked" % (friendCt, blockCt))
        typeToRemove = 0
        amtToRemove = friendCt - blockCt
        if (blockCt > friendCt): 
            amtToRemove = blockCt - friendCt
            typeToRemove = 1
        idxToRemove = []
        startPoint = 0
        for idx, account in enumerate(self.accountList):
            if (amtToRemove == 0): break
            if (account["blocked"] == typeToRemove): 
                idxToRemove.append(idx)
                amtToRemove -= 1
        for idx in reversed(idxToRemove):
            del self.accountList[idx]
            
        # Check the amount of examples in the dataset again
        newBlockCt = 0
        newFriendCt = 0
        for acct in self.accountList:
            if (acct["blocked"] == 0): newFriendCt += 1
            elif (acct["blocked"] == 1): newBlockCt += 1
        # print("There are %d friends and %d blocked" % (newFriendCt, newBlockCt))
            
        
    # The "len" method will return the number of data points in this Dataset
    def __len__(self):
        return len(self.accountList)
        
    # The "getitem" method will specify how to return the item
    # at a particular index 
    def __getitem__(self, idx):
        if (torch.is_tensor(idx)):
            idx = idx.tolist()
        account = self.accountList[idx]
        userID = account["userID"]
        networkEmb = account["networkEmb"]
        textEmb = account["textEmb"]
        blocked = account["blocked"]
        return (userID, networkEmb, textEmb, blocked)

### Training / Test Splits

Since we've defined the AccountDataset class, we're now able to create a new AccountDataset object, and then create some DataLoaders for training / test data. 

In [30]:
# These cells will actually load the data into the AccountDataset
networkEmbeddingPath = "../Data/Embeddings/node2vec embeddings.json"
textEmbeddingPath = "../Data/Embeddings/word2vec embeddings.json"
accounts = AccountDataset(networkEmbeddingPath, textEmbeddingPath)

In [32]:
# Now, this will split up the accounts into "training" and "test" sets
testSplit = .3
accountAmt = len(accounts)
splitIdx = int(np.floor(accountAmt * testSplit))
indices = list(range(accountAmt))
np.random.shuffle(indices)
trainIndices = indices[splitIdx:]
testIndices = indices[:splitIdx]

In [33]:
# Create DataLoaders for both the training and test sets
batchSize = 8
sampler_train = SubsetRandomSampler(trainIndices)
sampler_test = SubsetRandomSampler(testIndices)
loader_train = torch.utils.data.DataLoader(accounts, batch_size=batchSize, sampler=sampler_train)
loader_test = torch.utils.data.DataLoader(accounts, batch_size=batchSize, sampler=sampler_test)

### Defining CheckAccuracy( ) 

The following method, CheckAccuracy(), will be used to evaluate the accuracy of a given model. 

In [34]:
# This method will check the accuracy of the model using data from the loader
def CheckAccuracy(loader, model, modelType, confusion=False, returnAll=False):
    num_correct = 0
    num_samples = 0
    if (modelType != "split"):
        model.eval()
    yTrue = []
    yPred = []
    with torch.no_grad():
        for (userID, networkEmb, textEmb, blocked) in loader:
            binaryScores = []
            if (modelType == "network"):
                binaryScores = torch.round(torch.sigmoid(model(networkEmb))).reshape(blocked.shape)
            if (modelType == "text"):
                binaryScores = torch.round(torch.sigmoid(model(textEmb))).reshape(blocked.shape)
            if (modelType == "split"):
                networkEmb_Model_cutoff, textEmb_Model_cutoff, splitModel = model
                networkEmb_Model_cutoff.eval()
                textEmb_Model_cutoff.eval()
                splitModel.eval()
                networkScores = networkEmb_Model_cutoff(networkEmb)
                textScores = textEmb_Model_cutoff(textEmb)
                inputFeatures = torch.cat((networkScores, textScores), dim=1)
                binaryScores = torch.round(torch.sigmoid(splitModel(inputFeatures))).reshape(blocked.shape)
            num_correct += (blocked == binaryScores).sum().float()
            num_samples += len(blocked)
            blockedVector = list(blocked.cpu())
            scoresVector = list(binaryScores.cpu())
            yTrue = yTrue + blockedVector
            yPred = yPred + scoresVector
        if (confusion):
            f1Score = f1_score(yTrue, yPred)
            tn, fp, fn, tp = (confusion_matrix(yTrue, yPred).ravel())

        if (not returnAll):
            return (num_correct/num_samples)
        return ((num_correct/num_samples), f1Score, (tn, fp, fn, tp))

# Models

Below, you'll find a variety of cells meant to define the models and train them. There are three main models that're defined: 

- Social network model
- Textual model
- Mixed model

By understanding the performance of each of the models, we'll be able to better understand which data type (social network embeddings vs. textual data) contains more useful predictive information. 

### Social Network Model

This model is meant to make predictions based on the social network embeddings. 

In [35]:
# This method will return the best network model it found from the given 
# trainData and testData
def getNetworkModel(trainData, testData, epochAmt=150): 

    # Declare the drouput strength 
    dropoutStrength = 0.4

    # Declare a couple of layer sizes
    networkEmbDim = len(accounts[0][1])
    networkEmb_hidden1 = 512
    networkEmb_hidden2 = 256
    networkEmb_hidden3 = 128
    networkEmb_hidden4 = 64

    # Declare the model
    networkEmb_model = nn.Sequential(nn.Linear(networkEmbDim, networkEmb_hidden1),
                          nn.LeakyReLU(),
                          #nn.BatchNorm1d(networkEmb_hidden1),
                          nn.Linear(networkEmb_hidden1, networkEmb_hidden2),
                          nn.LeakyReLU(),
                          nn.Dropout(dropoutStrength),
                          nn.Linear(networkEmb_hidden2, networkEmb_hidden3),
                          nn.LeakyReLU(),
                          #nn.BatchNorm1d(networkEmb_hidden3),
                          nn.Linear(networkEmb_hidden3, networkEmb_hidden4),
                          nn.LeakyReLU(),
                          nn.Linear(networkEmb_hidden4, 1))

    # Define a number of parameters for training
    networkEmb_epochs = epochAmt
    networkEmb_LR = 0.00004
    networkEmb_model = networkEmb_model.to(device)
    networkEmb_opt = optim.Adam(networkEmb_model.parameters(), lr=networkEmb_LR)
    networkEmb_testAccList = []
    networkEmb_trainAccList = []
    bestModel = None
    bestTestAcc = 0

    # Iterate through each training epoch
    for e in range(networkEmb_epochs):
        for idx, (userID, networkEmb, textEmb, blocked) in enumerate(trainData):

            # Indicate that we're in training mode
            networkEmb_model.train() 

            # Declaring a loss function
            loss_fn = nn.BCEWithLogitsLoss()

            # Performing a training step for the network embedding model
            scores = networkEmb_model(networkEmb)
            scores = scores.reshape(blocked.shape)
            loss = loss_fn(scores, blocked)
            networkEmb_opt.zero_grad()
            loss.backward()
            networkEmb_opt.step()

        # Print the accuracy of the network embedding model
        testAcc = CheckAccuracy(testData, networkEmb_model, "network")
        trainAcc = CheckAccuracy(trainData, networkEmb_model, "network")

        if (testAcc > bestTestAcc):
            bestModel = copy.deepcopy(networkEmb_model)
            bestTestAcc = testAcc
            
    return bestModel, bestTestAcc

### Text Embedding Model

This model is meant to make predictions based on the account text embeddings. 

In [36]:
# This method will return the best network model it found from the given 
# trainData and testData
def getTextModel(trainData, testData, epochAmt=150): 

    # Declare the drouput strength
    dropoutStrength = 0.5

    # Declare a couple of layer sizes
    textEmbDim = len(accounts[0][2])
    textEmb_hidden1 = 2048
    textEmb_hidden2 = 1024
    textEmb_hidden3 = 512
    textEmb_hidden4 = 256

    # Declare the model
    textEmb_model = nn.Sequential(nn.Linear(textEmbDim, textEmb_hidden1),
                          nn.LeakyReLU(),
                          #nn.BatchNorm1d(textEmb_hidden1),
                          nn.Linear(textEmb_hidden1, textEmb_hidden2),
                          nn.LeakyReLU(),
                          nn.Dropout(dropoutStrength),
                          #nn.BatchNorm1d(textEmb_hidden2),
                          nn.Linear(textEmb_hidden2, textEmb_hidden3),
                          nn.LeakyReLU(),
                          nn.Dropout(dropoutStrength),
                          #nn.BatchNorm1d(textEmb_hidden3),
                          nn.Linear(textEmb_hidden3, textEmb_hidden4),
                          nn.LeakyReLU(),
                          #nn.BatchNorm1d(textEmb_hidden4),
                          nn.Linear(textEmb_hidden4, 1))

    # Define a number of parameters for training
    textEmb_epochs = epochAmt
    textEmb_LR = 0.0001
    textEmb_model = textEmb_model.to(device)
    textEmb_opt = optim.Adam(textEmb_model.parameters(), lr=textEmb_LR)
    textEmb_testAccList = []
    textEmb_trainAccList = []

    bestModel = None
    bestTestAcc = 0

    # Iterate through each training epoch
    for e in range(textEmb_epochs):
        for idx, (userID, textEmb, textEmb, blocked) in enumerate(trainData):

            # Indicate that we're in training mode
            textEmb_model.train() 

            # Declaring a loss function
            loss_fn = nn.BCEWithLogitsLoss()

            # Performing a training step for the text embedding model
            scores = textEmb_model(textEmb)
            scores = scores.reshape(blocked.shape)
            loss = loss_fn(scores, blocked)
            textEmb_opt.zero_grad()
            loss.backward()
            textEmb_opt.step()

        # Print the accuracy of the text embedding model
        testAcc = CheckAccuracy(testData, textEmb_model, "text")
        trainAcc = CheckAccuracy(trainData, textEmb_model, "text")

        if (testAcc > bestTestAcc):
            bestModel = copy.deepcopy(textEmb_model)
            bestTestAcc = testAcc
            
    return bestModel, bestTestAcc

### Mixed Model #1: Bagging

The following cell will make predictions based on *both* data types! 

In [11]:
# This bagging "meta-model" doesn't require any training, since we're just
# averaging the output of two pre-trained models
# Store the songIDs for each of the correct songs in bothRightGuesses.
bothRightGuesses = []
yTrue = []
yPred = []
networkEmb_model.eval()
textEmb_model.eval()
num_correct = 0
num_samples = 0
with torch.no_grad():
    for (userID, networkEmb, textEmb, blocked) in loader_test:
        networkEmb_Scores = (torch.sigmoid(networkEmb_model(networkEmb))).reshape(blocked.shape)
        textEmb_Scores = (torch.sigmoid(textEmb_model(textEmb))).reshape(blocked.shape)
        bothScores = (networkEmb_Scores + textEmb_Scores)/2
        binaryScores = torch.round(bothScores)
        mask = (binaryScores == blocked)
        bothRightGuesses += ([int(x) for x in (userID[mask])])
        yTrue = yTrue + list(blocked.cpu())
        yPred = yPred + list(binaryScores.cpu())

# Print the confusion matrix + F1 score for the bagging model
f1Score = f1_score(yTrue, yPred)
tn, fp, fn, tp = (confusion_matrix(yTrue, yPred).ravel())
print("True negative: %d" % tn)
print("False positive: %d" % fp)
print("False negative: %d" % fn)
print("True positive: %d" % tp)
print("F1 score: %.5f" % f1Score)

NameError: name 'networkEmb_model' is not defined

In [None]:
# Figure out which accounts the networkEmb model guesses right on 
networkEmb_RightGuesses = []
networkEmb_model.eval()
with torch.no_grad():
    for (userID, networkEmb, textEmb, blocked) in loader_test:
        binaryScores = torch.round(torch.sigmoid(networkEmb_model(networkEmb))).reshape(blocked.shape)
        mask = (binaryScores == blocked)
        networkEmb_RightGuesses += ([int(x) for x in (userID[mask])])
        
# Figure out which accounts the textEmb model guesses right on 
textEmb_RightGuesses = []
textEmb_model.eval()
with torch.no_grad():
    for (userID, networkEmb, textEmb, blocked) in loader_test:
        binaryScores = torch.round(torch.sigmoid(textEmb_model(textEmb))).reshape(blocked.shape)
        mask = (binaryScores == blocked)
        textEmb_RightGuesses += ([int(x) for x in (userID[mask])])

In [None]:
networkEmbRightGuessesSet = set(networkEmb_RightGuesses)
textEmbRightGuessesSet = set(textEmb_RightGuesses)
bothRightGuessesSet = set(bothRightGuesses)
print("There were %d accounts in the test set..." % len(testIndices))
print("The networkEmb model got %d accounts correct (%.2f%% accuracy)" % (len(networkEmbRightGuessesSet), 100 * (len(networkEmbRightGuessesSet)/len(testIndices))))
print("The textEmb model got %d accounts correct (%.2f%% accuracy)" % (len(textEmbRightGuessesSet), 100 * (len(textEmbRightGuessesSet)/len(testIndices))))
print("The bagged model got %d accounts correct (%.2f%% accuracy)" % (len(bothRightGuessesSet), 100 * (len(bothRightGuessesSet)/len(testIndices))))

### Mixed Model #2: Split

The following cell will make predictions based on *both* data types! 

In [37]:
# This method will return the splitModel
def getSplitModel(trainData, testData, epochAmt=150):

    dropoutStrength = 0.5

    networkEmb_DimCount = len(accounts[0][1])
    networkEmb_Hidden1 = 512
    networkEmb_Hidden2 = 256
    networkEmb_Hidden3 = 128
    networkEmb_CutoffAmt = 64
    networkEmb_Model_cutoff = nn.Sequential(nn.Linear(networkEmb_DimCount, networkEmb_Hidden1),
                          nn.LeakyReLU(),
                          #nn.BatchNorm1d(networkEmb_Hidden1),
                          nn.Linear(networkEmb_Hidden1, networkEmb_Hidden2),
                          nn.LeakyReLU(),
                          nn.Linear(networkEmb_Hidden2, networkEmb_Hidden3),
                          nn.LeakyReLU(),
                          #nn.BatchNorm1d(networkEmb_Hidden3),
                          nn.Linear(networkEmb_Hidden3, networkEmb_CutoffAmt),
                          nn.Dropout(dropoutStrength))


    textEmb_DimCount = len(accounts[0][2])
    textEmb_Hidden1 = 2048 
    textEmb_Hidden2 = 1024
    textEmb_Hidden3 = 512
    textEmb_CutoffAmt = 64 
    textEmb_Model_cutoff = nn.Sequential(nn.BatchNorm1d(textEmb_DimCount),
                               nn.Linear(textEmb_DimCount, textEmb_Hidden1),
                               nn.LeakyReLU(),
                               nn.Linear(textEmb_Hidden1, textEmb_Hidden2),
                               nn.LeakyReLU(),
                               nn.Linear(textEmb_Hidden2, textEmb_Hidden3),
                               nn.LeakyReLU(),
                               #nn.BatchNorm1d(textEmb_Hidden3),
                               nn.Linear(textEmb_Hidden3, textEmb_CutoffAmt),
                               nn.Dropout(dropoutStrength))


    splitInput = textEmb_CutoffAmt+networkEmb_CutoffAmt
    splitHidden1 = 1024
    splitHidden2 = 512 
    splitHidden3 = 256
    splitHidden4 = 128
    splitModel = nn.Sequential(nn.Linear(splitInput, splitHidden1),
                               nn.LeakyReLU(),
                               nn.Dropout(dropoutStrength),
                               nn.Linear(splitHidden1, splitHidden2),
                               nn.LeakyReLU(),
                               nn.Linear(splitHidden2, splitHidden3),
                               nn.LeakyReLU(),
                               # nn.Dropout(dropoutStrength),
                               nn.Linear(splitHidden3, splitHidden4),
                               nn.LeakyReLU(),
                               nn.Linear(splitHidden4, 1))


    splitModel = splitModel.to(device)
    textEmb_Model_cutoff = textEmb_Model_cutoff.to(device)
    networkEmb_Model_cutoff = networkEmb_Model_cutoff.to(device)

    splitEpochs = epochAmt
    splitLR = 0.0001
    textEmb_LR = 0.00001
    networkEmb_LR = 0.00001

    splitOptimizer = optim.Adam(splitModel.parameters(), lr=splitLR)
    networkEmb_CutoffOptimizer = optim.Adam(networkEmb_Model_cutoff.parameters(), lr=networkEmb_LR)
    textEmb_CutoffOptimizer = optim.Adam(textEmb_Model_cutoff.parameters(), lr=textEmb_LR)

    bestModel = None
    bestTestAcc = 0

    for e in range(splitEpochs):
        for idx, (userID, networkEmb, textEmb, blocked) in enumerate(trainData):

            # Indicate that we're in training mode
            networkEmb_Model_cutoff.train()
            textEmb_Model_cutoff.train()
            splitModel.train()

            # Declaring a loss function
            loss_fn = nn.BCEWithLogitsLoss()

            # Perform a training step for the networkEmb_edding model 
            networkEmb_Scores = networkEmb_Model_cutoff(networkEmb)

            # Perform a training step for the textEmb_ model
            textEmb_Scores = textEmb_Model_cutoff(textEmb)

            # Now, use these scores to perform a training step for the split model 
            inputScores = torch.cat((networkEmb_Scores, textEmb_Scores), dim=1)

            splitScores = splitModel(inputScores)
            splitScores = splitScores.reshape(blocked.shape)
            splitLoss = loss_fn(splitScores, blocked)
            splitOptimizer.zero_grad()
            textEmb_CutoffOptimizer.zero_grad()
            networkEmb_CutoffOptimizer.zero_grad()
            splitLoss.backward()
            splitOptimizer.step()
            textEmb_CutoffOptimizer.step()
            networkEmb_CutoffOptimizer.step()


        # Print the accuracy of the split model
        splitTestAcc = CheckAccuracy(testData, (networkEmb_Model_cutoff, textEmb_Model_cutoff, splitModel), "split", False)
        splitTrainAcc = CheckAccuracy(trainData, (networkEmb_Model_cutoff, textEmb_Model_cutoff, splitModel), "split")

        if (splitTestAcc > bestTestAcc):
            bestModel = (copy.deepcopy(networkEmb_Model_cutoff), copy.deepcopy(textEmb_Model_cutoff), copy.deepcopy(splitModel))
            bestTestAcc = splitTestAcc
            
    return (bestModel, bestTestAcc)

## Visualizations & Results

Below, I used the best models that I'd been able to train, and then tested them on various different splits of the data. 

In [38]:
splitTestAcc = CheckAccuracy(loader_test, splitModel, "split", True)

NameError: name 'splitModel' is not defined

In [None]:
networkTestAcc = CheckAccuracy(loader_test, networkEmb_model, "network", True)

In [None]:
textTestAcc = CheckAccuracy(loader_test, textEmb_model, "text", True)

## Cross-Validation

After running the models a couple of times, I realized that their performance was varying a fair amount depending on what data was being used for training/testing. In order to ensure I had a solid idea of how each model was performing, I defined the following setup to perform cross-validation. 

In [39]:
# Defining the folds, and a couple of data structures that we'll use
# to store the results of the cross validation
foldAmt = 4
kfold = KFold(n_splits=foldAmt, shuffle=True)
modelDict = {"network": [], "text": [], "split": []}

In [40]:
# Run through cross-validation for each of the different models
for fold, (train_ids, test_ids) in enumerate(kfold.split(accounts)):
    
    # Print the fold you're on
    print("\n\n\nCROSS VALIDATION - FOLD %d\n" % (fold+1))
    print(len(train_ids))
    print(len(test_ids))
    
    # Declare the DataLoaders with this split of training / testing data
    batchSize=4
    trainSampler = SubsetRandomSampler(train_ids)
    testSampler = SubsetRandomSampler(test_ids)
    trainLoader = torch.utils.data.DataLoader(accounts, batch_size=batchSize, sampler=trainSampler)
    testLoader = torch.utils.data.DataLoader(accounts, batch_size=batchSize, sampler=testSampler)
    
    # Get the social network model 
    print("Training the social network model...")
    networkModel, testAcc = getNetworkModel(trainLoader, testLoader, epochAmt=200)
    modelDict["network"].append((networkModel, testAcc))
    print("Model recieved a test accuracy of %.4f\n" % testAcc)
    
    # Get the text model 
    print("Training the text model...")
    textModel, testAcc = getTextModel(trainLoader, testLoader, epochAmt=200)
    modelDict["text"].append((textModel, testAcc))
    print("Model recieved a test accuracy of %.4f\n" % testAcc)
    
    # Get the split model 
    print("Training the split model...")
    splitModel, testAcc = getSplitModel(trainLoader, testLoader, epochAmt=300)
    modelDict["split"].append((splitModel, testAcc))
    print("Model recieved a test accuracy of %.4f\n" % testAcc)




CROSS VALIDATION - FOLD 1

127
43
Training the social network model...
Model recieved a test accuracy of 0.9302

Training the text model...


KeyboardInterrupt: 

In [30]:
# Run CheckAccuracy for each of the folds to get the average of each model accuracy
runningAccDict = {"network": [], "text": [], "split": [], "bagging": []}
runningF1Dict = {"network": [], "text": [], "split": [], "bagging": []}
for fold, (train_ids, test_ids) in enumerate(kfold.split(accounts)): 
    
    print("\n\nRUNNING FOLD %d" % (fold+1))
    
    # Declare the DataLoaders with this split of training / testing data
    batchSize=4
    trainSampler = SubsetRandomSampler(train_ids)
    testSampler = SubsetRandomSampler(test_ids)
    trainLoader = torch.utils.data.DataLoader(accounts, batch_size=batchSize, sampler=trainSampler)
    testLoader = torch.utils.data.DataLoader(accounts, batch_size=batchSize, sampler=testSampler)
    
    # Retrieving each of the models
    networkModel = modelDict["network"][fold][0]
    textModel = modelDict["text"][fold][0]
    splitModel = modelDict["split"][fold][0]
    
    # Getting the accuracies of each model 
    print("Running CheckAccuracy() for the network model...")
    testAcc_network, f1_network, confusionMatrix_network = CheckAccuracy(testLoader, networkModel, "network", True, True)
    print("Running CheckAccuracy() for the text model...")
    testAcc_text, f1_text, confusionMatrix_text = CheckAccuracy(testLoader, textModel, "text", True, True)
    print("Running CheckAccuracy() for the split model...")
    testAcc_split, f1_split, confusionMatrix_split = CheckAccuracy(testLoader, splitModel, "split", True, True)
    
    # Getting the accuracies of the "bagged" model
    print("Getting the accuracy of the bagging model...")
    yTrue = []
    yPred = []
    networkModel.eval()
    textModel.eval()
    num_correct = 0
    num_samples = 0
    with torch.no_grad():
        for (userID, networkEmb, textEmb, blocked) in testLoader:
            networkEmb_Scores = (torch.sigmoid(networkModel(networkEmb))).reshape(blocked.shape)
            textEmb_Scores = (torch.sigmoid(textModel(textEmb))).reshape(blocked.shape)
            bothScores = (networkEmb_Scores + textEmb_Scores)/2
            binaryScores = torch.round(bothScores)
            mask = (binaryScores == blocked)
            yTrue = yTrue + list(blocked.cpu())
            yPred = yPred + list(binaryScores.cpu())

    # Grab some of the results from the bagging model 
    tn, fp, fn, tp = (confusion_matrix(yTrue, yPred).ravel())
    testAcc_bagging = (tn + tp) / + (tn + fp + fn + tp)
    f1_bagging = f1_score(yTrue, yPred)
    confusionMatrix_bagging = (tn, fp, fn, tp)
    
    # Storing the results
    runningAccDict["network"].append(testAcc_network)
    runningF1Dict["network"].append(f1_network)
    
    runningAccDict["text"].append(testAcc_text)
    runningF1Dict["text"].append(f1_text)
    
    runningAccDict["split"].append(testAcc_split)
    runningF1Dict["split"].append(f1_split)
    
    runningAccDict["bagging"].append(testAcc_bagging)
    runningF1Dict["bagging"].append(f1_bagging)



RUNNING FOLD 1
Running CheckAccuracy() for the network model...
Running CheckAccuracy() for the text model...
Running CheckAccuracy() for the split model...
Getting the accuracy of the bagging model...


RUNNING FOLD 2
Running CheckAccuracy() for the network model...
Running CheckAccuracy() for the text model...
Running CheckAccuracy() for the split model...
Getting the accuracy of the bagging model...


RUNNING FOLD 3
Running CheckAccuracy() for the network model...
Running CheckAccuracy() for the text model...
Running CheckAccuracy() for the split model...
Getting the accuracy of the bagging model...


RUNNING FOLD 4
Running CheckAccuracy() for the network model...
Running CheckAccuracy() for the text model...
Running CheckAccuracy() for the split model...
Getting the accuracy of the bagging model...


In [31]:
# Calculating averages for each model 
for modelType in ["network", "text", "split", "bagging"]:
    print("\n\nCalculating averages for %s..." % modelType)
    print("Average accuracy: %.3f" % (sum(runningAccDict[modelType]).item()/len(runningAccDict[modelType])))
    print("Average F1 score: %.3f" % (sum(runningF1Dict[modelType]).item()/len(runningF1Dict[modelType])))



Calculating averages for network...
Average accuracy: 0.888
Average F1 score: 0.887


Calculating averages for text...
Average accuracy: 0.948
Average F1 score: 0.951


Calculating averages for split...
Average accuracy: 0.971
Average F1 score: 0.968


Calculating averages for bagging...
Average accuracy: 0.953
Average F1 score: 0.955
