In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

import time, os
import torch
from torch.utils import data

RANDOM_STATE = 42
N_KEYWORDS= 500
N_AUTHORS = 2302

# The following pytorch code is modified based on workshop 7's jupyter notebook by group 24

### Read train data set

In [4]:
train_df = json.load(open('../data/train.json'))
train_df = pd.DataFrame(train_df).T

### Pre-process train data

In [14]:
def preprocess_dataset(df, is_test_data= False):
    if is_test_data:
        pass
    else:
        df1 = pd.DataFrame(df.author)
        df1.columns=['coauthor']
        df2 = df.explode('author')
        df3 = df1.join(df2)
        df3['author_set'] = df3['author'].apply(lambda x: {x})
        df3['coauthor_set'] = df3['coauthor'].apply(lambda x: set(x))
        df3['coauthor'] = df3['coauthor_set'] - df3['author_set']
        df3['coauthor'] = df3['coauthor'].apply(lambda x: list(x))
        df3 = df3[['keywords', 'coauthor', 'author']]
        mlb_keywords = MultiLabelBinarizer(sparse_output=True, classes=list(range(500)))
        keywords_oh = mlb_keywords.fit_transform(df3.keywords).toarray()

        mlb_coauthor = MultiLabelBinarizer(sparse_output=True, classes=list(range(2302)))
        coauthor_oh = mlb_coauthor.fit_transform(df3.coauthor).toarray()

        features = np.hstack((keywords_oh, coauthor_oh))
        labels = df3.author.to_list()
    
    return features, labels, mlb_keywords, mlb_coauthor

In [None]:
features, labels, mlb_keywords, mlb_coauthor = preprocess_dataset(train_df)
feature_processed = torch.from_numpy(np.array(features)).float()
labels_processed = torch.from_numpy(np.array(labels)).long()

In [17]:
# split raw train data into train and validation set
X_train, X_val, y_train, y_val = train_test_split(feature_processed, labels_processed, test_size=0.2, random_state=RANDOM_STATE)

print(len(X_train), len(y_train))
print(len(X_val), len(y_val))

32160 32160
15840 15840


In [18]:
train_set, valid_set = [], []

for i in range(len(X_train)):
    train_set.append((X_train[i], y_train[i]))
    
for i in range(len(X_val)):
    valid_set.append((X_val[i], y_val[i]))

In [19]:
train_loader = torch.utils.data.DataLoader(train_set, batch_size=128, shuffle=True)
test_loader = torch.utils.data.DataLoader(valid_set, batch_size=128, shuffle=False)

torch.Size([128, 2802])
torch.Size([128])


In [20]:
import torch.nn as nn
import torch.nn.functional as F

class MulticlassLogisticRegression(nn.Module):
    
    def __init__(self, n_features, n_classes):
        super(MulticlassLogisticRegression, self).__init__()
        
        # Define weight and bias
        self.W = torch.nn.Parameter(torch.nn.init.xavier_uniform_(torch.empty([n_features, n_classes]))) # Weights 
        self.b = torch.nn.Parameter(torch.zeros([n_classes])) # Biases
        
    def forward(self, x):
        """
        Forward pass for logistic regression.
        Input: Tensor x of shape []
        Output: Logits W @ x + b
        """
        batch_size = x.shape[0]
        
        x = x.view(batch_size, -1) # Flatten data, retaining batch size
        out = torch.matmul(x, self.W) + self.b # compute scores 
        return out
    

In [21]:
n_features, n_classes = N_KEYWORDS + N_AUTHORS, N_AUTHORS
logistic_regression_model = MulticlassLogisticRegression(n_features, n_classes)

for p in logistic_regression_model.parameters():
    print(p.shape)

torch.Size([2802, 2302])
torch.Size([2302])


In [38]:
def test(model, criterion, test_loader):
    
    test_loss = 0.0
    test_preds = []
    test_labels = []
    for i, data in enumerate(test_loader):
        x, labels = data
        
        with torch.no_grad():
            logits = model(x)    # Compute scores
            predictions = torch.argmax(logits, dim=1)
            test_loss += criterion(input=logits, target=labels).item()
            test_preds.append(predictions)
            test_labels.append(labels)
            
    test_preds = torch.cat(test_preds)
    test_labels = torch.cat(test_labels)
    
    test_accuracy = torch.eq(test_preds, test_labels).float().mean().item()
    
    print(f'[TEST]  Mean loss {round(test_loss/len(test_loader), 4)} | Accuracy {round(test_accuracy, 4)}')

In [None]:
def train(model, train_loader, test_loader, optimizer, n_epochs=25):
    """
    Generic training loop for supervised multiclass learning
    """
    LOG_INTERVAL = 256
    running_loss, running_accuracy = list(), list()
    start_time = time.time()
    criterion = torch.nn.CrossEntropyLoss()
    
    # Iterate by number of epochs
    for epoch in range(n_epochs):
        epoch_loss = 0.0
        
        for i, data in enumerate(train_loader):
            x, labels = data
            logits = model(x)
            predictions = torch.argmax(logits, dim=1)
            train_acc = torch.mean(torch.eq(predictions, labels).float()).item()
            loss = criterion(input=logits, target=labels)
            
            loss.backward()          # Backward pass (compute parameter gradients)
            optimizer.step()         # Update weight parameter using SGD
            optimizer.zero_grad()    # Reset gradients to zero for next iteration

            running_loss.append(loss.item())
            running_accuracy.append(train_acc)

            epoch_loss += loss.item()

            if i % LOG_INTERVAL == 0:  # Log training stats
                deltaT = time.time() - start_time
                mean_loss = epoch_loss / (i+1)
                print(f'[TRAIN] Epoch {epoch} [{i}/{len(train_loader)}]| Mean loss {round(mean_loss, 4)} | Accuracy {round(train_acc, 4)} | Time {round(deltaT, 4)} s'.format(epoch, 
                    i, len(train_loader), mean_loss, train_acc, deltaT))
        
        print(f'>>>>>>  Epoch {epoch} Done| Mean loss {round(epoch_loss/len(train_loader), 4)}')

        test(model, criterion, test_loader)

    return running_loss, running_accuracy


In [None]:
optimizer = torch.optim.SGD(logistic_regression_model.parameters(), lr=0.2, momentum=0.9)

lr_loss, lr_acc = train(logistic_regression_model, train_loader, test_loader, optimizer, n_epochs=40)

### Transform test data

In [5]:
test_df = json.load(open('../data/test.json'))
test_df = pd.DataFrame(test_df).T
test_df.head()

Unnamed: 0,venue,keywords,year,coauthor,target
0,,"[260, 6, 390, 136, 7, 11, 17, 285, 288, 162, 4...",2017,[],988
1,94.0,"[260, 454, 137, 14, 400, 274, 339, 213, 280, 2...",2019,[1001],2123
2,31.0,"[390, 198, 7, 461, 462, 14, 404, 277, 24, 473,...",2014,[],1578
3,6.0,"[195, 6, 390, 10, 459, 464, 338, 146, 276, 466...",2010,[1347],2072
4,162.0,"[64, 1, 260, 457, 73, 147, 282, 27, 156, 43, 3...",2016,[1107],995


In [32]:
keywords_oh_test = mlb_keywords.transform(test_df.keywords).toarray()
coauthor_oh_test = mlb_coauthor.transform(test_df.coauthor).toarray()
features_test = np.hstack((keywords_oh_test, coauthor_oh_test))

features_test.shape

(2000, 2802)

### Predict Kaggle test data

In [33]:
tensor_test = torch.tensor(np.array(features_test)).float()

predictions = torch.matmul(tensor_test, logistic_regression_model.W) + logistic_regression_model.b

print(predictions)

tensor([[ 1.1056,  0.8017, -0.3196,  ..., -1.3770,  0.0972,  3.1753],
        [ 1.8976,  0.1104, -0.4413,  ..., -0.2809,  1.4967,  0.4374],
        [-2.3686, -1.3717, -0.4697,  ..., -1.3212, -1.4522, -0.4854],
        ...,
        [ 0.7753, -3.1361,  2.4074,  ...,  0.1092,  0.1472, -1.6790],
        [ 1.6290, -0.9589,  1.6027,  ..., -0.7128,  2.3460,  1.6931],
        [-0.9993, -0.0970,  0.6584,  ...,  0.3550, -0.1605, -1.2075]],
       grad_fn=<AddBackward0>)


In [34]:
predictions_arr = predictions.tolist()

print(predictions.shape)
predictions

torch.Size([2000, 2302])


tensor([[ 1.1056,  0.8017, -0.3196,  ..., -1.3770,  0.0972,  3.1753],
        [ 1.8976,  0.1104, -0.4413,  ..., -0.2809,  1.4967,  0.4374],
        [-2.3686, -1.3717, -0.4697,  ..., -1.3212, -1.4522, -0.4854],
        ...,
        [ 0.7753, -3.1361,  2.4074,  ...,  0.1092,  0.1472, -1.6790],
        [ 1.6290, -0.9589,  1.6027,  ..., -0.7128,  2.3460,  1.6931],
        [-0.9993, -0.0970,  0.6584,  ...,  0.3550, -0.1605, -1.2075]],
       grad_fn=<AddBackward0>)

In [35]:
prob_score_list = []

for index, target in test_df.target.iteritems():
    prob = predictions_arr[index][target]
    prob_score = 1 / (1 + np.exp(-prob))  # sigmoid function
    prob_score_list.append(prob_score) 

### Output result to CSV for Kaggle submission

In [352]:
import csv
with open('5_submission.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Id', 'Predicted'])
    test_id = 0
    for i in prob_score_list:
        writer.writerow([test_id, i])
        test_id += 1