# Authorship Identification

## Section 4.1: Method 1 - Basic Features + FNN

### 1. Dataset & Data Loader

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Raw records, stored as JSON Lines (one record per line).
data_tran = pd.read_json("../data/data2/data_tran.json", orient="records", lines=True)
data_test = pd.read_json("../data/data2/data_test.json", orient="records", lines=True)

# Number of rows in each raw split.
n_tran, n_test = len(data_tran), len(data_test)

# Pre-computed feature matrices for the training split, one array per feature group.
x_tran_coauthors = np.load("../data/data2/x_tran_coauthors.npy")
x_tran_venue_a = np.load("../data/data2/x_tran_venue_a.npy")
x_tran_venue_b = np.load("../data/data2/x_tran_venue_b.npy")
x_tran_text_a = np.load("../data/data2/x_tran_text_a.npy")
x_tran_text_b = np.load("../data/data2/x_tran_text_b.npy")

# The same feature groups for the test split.
x_test_coauthors = np.load("../data/data2/x_test_coauthors.npy")
x_test_venue_a = np.load("../data/data2/x_test_venue_a.npy")
x_test_venue_b = np.load("../data/data2/x_test_venue_b.npy")
x_test_text_a = np.load("../data/data2/x_test_text_a.npy")
x_test_text_b = np.load("../data/data2/x_test_text_b.npy")

# Column order must be identical for both splits: the model is fitted on the
# training layout and later applied to the test layout.
_tran_groups = [x_tran_coauthors, x_tran_venue_a, x_tran_venue_b, x_tran_text_a, x_tran_text_b]
_test_groups = [x_test_coauthors, x_test_venue_a, x_test_venue_b, x_test_text_a, x_test_text_b]

y_tran = np.load("../data/data2/y_tran.npy")
x_tran = np.concatenate(_tran_groups, axis=1)

# Hold out 1000 training rows for validation; the fixed seed keeps the split reproducible.
x_tran, x_vald, y_tran, y_vald = train_test_split(
    x_tran,
    y_tran,
    test_size=1000,
    random_state=42,
)
x_test = np.concatenate(_test_groups, axis=1)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset


# Prefer the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def _to_float_tensor(array):
    """Copy *array* into a float32 tensor and place it on ``device``."""
    return torch.tensor(array, dtype=torch.float32).to(device)


# Every split is converted once up front; the training loop feeds whole
# tensors to the model rather than mini-batches.
y_tran = _to_float_tensor(y_tran)
x_tran = _to_float_tensor(x_tran)
y_vald = _to_float_tensor(y_vald)
x_vald = _to_float_tensor(x_vald)
x_test = _to_float_tensor(x_test)

### 2. Model Structure

In [None]:
class FNN(nn.Module):
    """Feed-forward network with three hidden layers and a sigmoid output.

    Layer widths are ``input_dim -> 512 -> 256 -> 128 -> output_dim``. Each
    hidden layer is followed by ReLU and dropout (p=0.1); the output layer is
    followed by a sigmoid, so every output lies in (0, 1).

    Args:
        input_dim: Number of input features per sample.
        output_dim: Number of output units per sample.
    """

    def __init__(self, input_dim, output_dim):
        super().__init__()
        widths = (input_dim, 512, 256, 128, output_dim)
        # Registers fc1..fc4 in order, so parameter names and the order in
        # which the layers are initialised stay fixed.
        for position, (n_in, n_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, f"fc{position}", nn.Linear(n_in, n_out))
        self.tanh = nn.Tanh()  # not used by forward()
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        self.dropout = nn.Dropout(0.1)

    def forward(self, x):
        """Return per-unit sigmoid activations for the batch ``x``."""
        for hidden in (self.fc1, self.fc2, self.fc3):
            x = self.dropout(self.relu(hidden(x)))
        return self.sigmoid(self.fc4(x))

### 3. Model Training

In [None]:
# Network dimensions follow the data: one input per feature column and one
# output per label column.
n_features = x_tran.shape[1]
n_labels = y_tran.shape[1]

model = FNN(input_dim=n_features, output_dim=n_labels).to(device)

# BCELoss expects probabilities, which matches the sigmoid at the end of FNN.forward.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-5, weight_decay=1e-5)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

def calculate_metrics(pred_label, true_label):
    """Return macro-averaged precision, recall and F1 for the predictions.

    Args:
        pred_label: Tensor of predicted labels.
        true_label: Tensor of reference labels.

    Returns:
        Tuple ``(precision, recall, f1)``. Undefined scores are reported as 0
        (``zero_division=0``).
    """
    # scikit-learn needs host-side integer data; cast and transfer once per tensor.
    y_pred = pred_label.int().cpu()
    y_true = true_label.int().cpu()

    scorers = (precision_score, recall_score, f1_score)
    pc, rc, f1 = (
        scorer(y_true, y_pred, average='macro', zero_division=0)
        for scorer in scorers
    )
    return pc, rc, f1

In [None]:
class EarlyStopping:
    """Signal that training should stop once the monitored loss stops improving.

    Call the instance with the latest loss after each check. A value counts as
    an improvement only when it is lower than the best value seen so far by
    more than ``delta``. After ``patience`` consecutive checks without an
    improvement, ``early_stop`` becomes ``True``.

    Args:
        patience: Number of consecutive non-improving checks tolerated.
        delta: Minimum decrease of the loss that counts as an improvement.

    Attributes:
        best_loss: Lowest loss accepted so far as a plain float, or ``None``
            before the first call.
        counter: Consecutive non-improving checks since the last improvement.
        early_stop: ``True`` once ``counter`` has reached ``patience``.
    """

    def __init__(self, patience=10, delta=0.001):
        self.patience = patience
        self.delta = delta
        self.best_loss = None
        self.counter = 0
        self.early_stop = False

    def __call__(self, train_loss):
        """Record ``train_loss`` (a number or a one-element tensor)."""
        # Store a plain float: keeping the tensor itself would hold on to
        # device memory and to the autograd graph the loss is attached to.
        current = float(train_loss)

        if self.best_loss is None or current < self.best_loss - self.delta:
            self.best_loss = current
            self.counter = 0
            return

        self.counter += 1
        if self.counter >= self.patience:
            self.early_stop = True


early_stopping = EarlyStopping(patience=10, delta=0.001)

In [None]:
epochs = 50000

# Full-batch training: every epoch is one optimisation step over the whole
# training tensor.
for epoch in range(epochs):

    model.train()

    optimizer.zero_grad()

    y_tran_pred_prob = model(x_tran)
    loss = criterion(y_tran_pred_prob, y_tran.float())

    loss.backward()
    optimizer.step()

    # Every 1000 epochs: report metrics and run the early-stopping check.
    if (epoch + 1) % 1000 == 0:
        model.eval()

        with torch.no_grad():
            y_tran_pred_prob = model(x_tran)
            y_tran_pred_labl = (y_tran_pred_prob > 0.5).int()
            tran_pc, tran_rc, tran_f1 = calculate_metrics(y_tran_pred_labl, y_tran)

            y_vald_pred_prob = model(x_vald)
            y_vald_pred_labl = (y_vald_pred_prob > 0.5).int()
            vald_pc, vald_rc, vald_f1 = calculate_metrics(y_vald_pred_labl, y_vald)

        # Reported loss is the training loss of this epoch's optimisation step
        # (computed in train mode, before the weight update).
        loss_value = loss.item()

        print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss_value:.4f}")
        print(f"Train - Precision: {tran_pc:.4f}, Recall: {tran_rc:.4f}, F1 Score: {tran_f1:.4f}")
        print(f"Val   - Precision: {vald_pc:.4f}, Recall: {vald_rc:.4f}, F1 Score: {vald_f1:.4f}")
        print()

        # Pass a plain float so the stopper never keeps a graph-attached
        # tensor alive between checks.
        early_stopping(loss_value)
        if early_stopping.early_stop:
            print("Early Stop !")
            break

# Switch to eval mode explicitly so dropout is disabled for the test
# predictions regardless of how the loop above ended.
model.eval()
with torch.no_grad():
    y_test_pred_prob = model(x_test)
    y_test_pred_labl = (y_test_pred_prob > 0.5).int()


### 4. Model Prediction

In [None]:
def generate_output_csv(x_test, y_test_pred_labl):
    """Build the submission table from test features and predicted labels.

    A row's prediction is the string ``"-1"`` when any of these holds:

    * every value in feature columns 0-99 is below 1, or every value in one
      of the column ranges 100-199, 200-299, 300-399, 400-499 is exactly 0;
    * no label is predicted for the row;
    * label column 100 is predicted.

    Otherwise the prediction is the space-separated list of label indices
    whose predicted value equals 1, in ascending order.

    Args:
        x_test: 2-D feature matrix (torch tensor or array-like), one row per
            test sample.
        y_test_pred_labl: 2-D matrix of 0/1 predicted labels (torch tensor or
            array-like), one row per test sample.

    Returns:
        ``pandas.DataFrame`` with columns ``ID`` (0-based row number) and
        ``Predict`` (the prediction string).
    """

    def _as_numpy(values):
        # Torch tensors are moved to host memory once, instead of being
        # indexed element by element (each access on a CUDA tensor would
        # otherwise force a device synchronisation).
        if hasattr(values, "detach"):
            return values.detach().cpu().numpy()
        return np.asarray(values)

    features = _as_numpy(x_test)
    labels = _as_numpy(y_test_pred_labl)

    # NOTE(review): the five 100-column ranges presumably correspond to the
    # coauthor / venue / text feature groups concatenated upstream - confirm
    # the group widths against the feature-extraction code.
    unusable = (features[:, :100] < 1).all(axis=1)
    for start in (100, 200, 300, 400):
        unusable |= (features[:, start:start + 100] == 0).all(axis=1)

    result = []
    for i, row in enumerate(labels):
        # NOTE(review): label column 100 looks like a dedicated "no author"
        # class - confirm against the label encoding.
        if unusable[i] or row.sum() == 0 or row[100] == 1:
            result.append("-1")
        else:
            predicted = np.flatnonzero(row == 1).tolist()
            result.append(" ".join(map(str, predicted)))

    result_df = pd.DataFrame({"ID": range(len(result)), "Predict": result})

    return result_df

# Build the prediction table and write it without the DataFrame index column.
result_df = generate_output_csv(x_test, y_test_pred_labl)
result_df.to_csv("../data/data3/result_method1.csv", index=False)