In [1]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/gdrive')


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


## Preprocessing

In [2]:
import json
from pathlib import Path

def flatten(list_of_list):
    return [item for sublist in list_of_list for item in sublist]

path_to_training = Path("/content/gdrive/MyDrive/inf554-extractive-summarization-2023/training")
path_to_test = Path("/content/gdrive/MyDrive/inf554-extractive-summarization-2023/test")

#####
# training and test sets of transcription ids
#####
training_set = ['ES2002', 'ES2005', 'ES2006', 'ES2007', 'ES2008', 'ES2009', 'ES2010', 'ES2012', 'ES2013', 'ES2015', 'ES2016', 'IS1000', 'IS1001', 'IS1002', 'IS1003', 'IS1004', 'IS1005', 'IS1006', 'IS1007', 'TS3005', 'TS3008', 'TS3009', 'TS3010', 'TS3011', 'TS3012']
training_set = flatten([[m_id+s_id for s_id in 'abcd'] for m_id in training_set])
training_set.remove('IS1002a')
training_set.remove('IS1005d')
training_set.remove('TS3012c')

test_set = ['ES2003', 'ES2004', 'ES2011', 'ES2014', 'IS1008', 'IS1009', 'TS3003', 'TS3004', 'TS3006', 'TS3007']
test_set = flatten([[m_id+s_id for s_id in 'abcd'] for m_id in test_set])

#####
# naive_baseline: all utterances are predicted important (label 1)
#####
test_labels = {}
for transcription_id in test_set:
    with open(path_to_test / f"{transcription_id}.json", "r") as file:
        transcription = json.load(file)

    test_labels[transcription_id] = [1] * len(transcription)

with open("test_labels_naive_baseline.json", "w") as file:
    json.dump(test_labels, file, indent=4)

## Text embedding

In [3]:
y_training = []
with open("/content/gdrive/MyDrive/inf554-extractive-summarization-2023/training_labels.json", "r") as file:
    training_labels = json.load(file)
word_training = []
for transcription_id in training_set:
    with open(path_to_training / f"{transcription_id}.json", "r") as file:
        transcription = json.load(file)

    for utterance in transcription:
        word_training.append(utterance["speaker"] + ": " + utterance["text"])

    y_training += training_labels[transcription_id]

#word_training = e5.encode(word_training, show_progress_bar=True, normalize_embeddings=True)
#word_training.shape

# RNN

In [4]:
!pip install datasets transformers



In [5]:
import time
time_start = time.time()

In [6]:
from datasets import load_dataset
import torch.utils.data as data
from transformers import AutoTokenizer, AutoModel
import torch
from torch.utils.data import DataLoader
import torch.optim as optim

from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict



# Split the data into train, validation, and test sets
X_train, X_temp, y_train, y_temp = train_test_split(word_training, y_training, test_size=0.2, stratify=y_training, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Create separate datasets for train, validation, and test
train_data = Dataset.from_dict({"text": X_train, "label": y_train})
val_data = Dataset.from_dict({"text": X_val, "label": y_val})
test_data = Dataset.from_dict({"text": X_test, "label": y_test})


# Print the sizes of the three sets
print("Train set size:", len(train_data))
print("Validation set size:", len(val_data))
print("Test set size:", len(test_data))


Train set size: 58098
Validation set size: 7262
Test set size: 7263


In [7]:
print(train_data[0])

{'text': "ID: and di I think they're quite easily printed on by machine ,", 'label': 0}


In [8]:
tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-base')

def collote_fn(batch_samples):
  batch_text = []
  batch_label = []
  for sample in batch_samples:
    batch_text.append(sample['text'])
    batch_label.append(int(sample['label']))
  X = tokenizer(
      batch_text,
      padding=True,
      truncation=True,
      return_tensors='pt'
  )
  y = torch.tensor(batch_label)
  return X,y

train_dataloader = DataLoader(train_data, batch_size=64, shuffle=True, collate_fn=collote_fn)
val_dataloader = DataLoader(val_data, batch_size=64, shuffle=True, collate_fn=collote_fn)
test_dataloader = DataLoader(test_data, batch_size=64, shuffle=True, collate_fn=collote_fn)

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [12]:
from torch import nn

class LSTM(nn.Module):
  def __init__(self, hidden_size, output_size):
    super(LSTM, self).__init__()
    self.bert_encoder = AutoModel.from_pretrained('intfloat/multilingual-e5-base')
    self.hidden_size = hidden_size
    self.lstm = nn.LSTM(self.bert_encoder.config.hidden_size, self.hidden_size, batch_first=True)
    self.fc = nn.Linear(self.hidden_size, output_size)
  def forward(self, x):
    bert_outputs = self.bert_encoder(**x)
    last_hidden_states = bert_outputs.last_hidden_state
    outputs, hidden = self.lstm(last_hidden_states)
    out = self.fc(outputs[:, -1, :])
    return out

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTM(256,2).to(device)
print(model)

LSTM(
  (bert_encoder): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              

In [13]:
from tqdm.auto import tqdm

def train(dataloader, model, loss_fun, optimizer, epoch, total_loss):
  process_bar = tqdm(range(len(dataloader)))
  process_bar.set_description(f'loss: {0:>7f}')
  finish_batch_num = (epoch-1)*len(dataloader)

  model.train()
  for batch, (X,y) in enumerate(dataloader, start=1):
    X,y = X.to(device),y.to(device)
    pred = model(X)
    loss = loss_fun(pred,y)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    total_loss += loss.item()
    process_bar.set_description(f'loss: {total_loss/(finish_batch_num + batch):>7f}')
    process_bar.update(1)
  return total_loss


def test(dataloader, model, mode='Test'):
    assert mode in ['Valid', 'Test']
    target_num = torch.zeros((1, 2)) # n_classes
    predict_num = torch.zeros((1, 2))
    acc_num = torch.zeros((1, 2))
    total_val_loss = 0
    avg_val_loss = 0

    model.eval()
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            outputs = model(X)
            loss = loss_fun(outputs,y)
            total_val_loss += loss.item()
            pred = outputs.argmax(1)
            # correct += (pred.argmax(1) == y).type(torch.float).sum().item()
            pre_mask = torch.zeros(outputs.size()).scatter_(1, pred.cpu().view(-1, 1), 1.)
            predict_num += pre_mask.sum(0)
            tar_mask = torch.zeros(outputs.size()).scatter_(1, y.data.cpu().view(-1, 1), 1.)
            target_num += tar_mask.sum(0)
            acc_mask = pre_mask * tar_mask
            acc_num += acc_mask.sum(0)
        # calculate accuracy
        avg_val_loss = total_val_loss / len(dataloader)
        recall = torch.nan_to_num(acc_num / target_num, nan=0.0)
        precision = torch.nan_to_num(acc_num / predict_num, nan=0.0)
        F1 = torch.nan_to_num(2 * recall * precision / (recall + precision), nan=0.0)
        accuracy = 100. * acc_num.sum(1) / target_num.sum(1)

        avg_precision = torch.mean(precision)
        avg_recall = torch.mean(recall)
        avg_f1 = torch.mean(F1)

        print('{},loss{}, Acc {}, recal {}, precision {}, F1-score {}'.format(mode, avg_val_loss,accuracy.tolist(), avg_recall.tolist(), avg_precision.tolist(), avg_f1.tolist()))
    return avg_val_loss,accuracy,avg_precision,avg_recall,avg_f1

In [14]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
# Calculez les poids inverses de la fréquence des classes
class_weights = compute_class_weight('balanced', classes=np.unique(y_training), y=y_training)
class_weights

array([0.61201564, 2.73183118])

In [15]:
learn_rate = 1e-5
num_epochs = 10

weights = torch.FloatTensor([0.61201564, 2.73183118]).to(device)
loss_fun = nn.CrossEntropyLoss(weight = weights)
optimizer = optim.Adam(model.parameters(), lr=learn_rate)

total_loss = 0
best_f1 = 0

import csv
with open('result.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Epoch','vail_loss','vail_accuracy','vail_precision', 'vail_recall', 'vail_F1-score'])

    for t in range(num_epochs):
      total_loss = train(train_dataloader, model, loss_fun, optimizer, t+1, total_loss)
      avg_val_loss,vaild_acc,vaild_pre,vaild_recall,vaild_f1 = test(val_dataloader,model,mode='Valid')
      writer.writerow([t+1,avg_val_loss,vaild_acc,vaild_pre, vaild_recall, vaild_f1])
      if vaild_f1 > best_f1:
        best_f1 = vaild_f1
        torch.save(model.state_dict(), '/content/gdrive/MyDrive/inf554-extractive-summarization-2023/best_model.pt')

  0%|          | 0/908 [00:00<?, ?it/s]

Valid,loss0.39853296507346003, Acc [79.45469665527344], recal 0.8133207559585571, precision 0.7123843431472778, F1-score 0.7317326068878174


  0%|          | 0/908 [00:00<?, ?it/s]

Valid,loss0.39515608104697447, Acc [78.60093688964844], recal 0.811269998550415, precision 0.70747971534729, F1-score 0.7245626449584961


  0%|          | 0/908 [00:00<?, ?it/s]

Valid,loss0.40468945955498176, Acc [76.94850158691406], recal 0.8072167634963989, precision 0.6990508437156677, F1-score 0.7109484672546387


  0%|          | 0/908 [00:00<?, ?it/s]

Valid,loss0.4543679088615535, Acc [79.27568054199219], recal 0.8006519079208374, precision 0.7067692875862122, F1-score 0.7261722087860107


  0%|          | 0/908 [00:00<?, ?it/s]

Valid,loss0.4859993012066473, Acc [78.47700500488281], recal 0.7972030639648438, precision 0.7014169096946716, F1-score 0.71901535987854


  0%|          | 0/908 [00:00<?, ?it/s]

Valid,loss0.574782557785511, Acc [80.25337219238281], recal 0.7855263948440552, precision 0.7072665095329285, F1-score 0.7281855940818787


  0%|          | 0/908 [00:00<?, ?it/s]

Valid,loss0.6435105728737095, Acc [81.13467407226562], recal 0.7741492986679077, precision 0.7100538015365601, F1-score 0.730469822883606


  0%|          | 0/908 [00:00<?, ?it/s]

Valid,loss0.7981188242372713, Acc [81.16221618652344], recal 0.7636144161224365, precision 0.7072281837463379, F1-score 0.7263376712799072


  0%|          | 0/908 [00:00<?, ?it/s]

Valid,loss0.8159146763776478, Acc [82.12613677978516], recal 0.7486937046051025, precision 0.712335467338562, F1-score 0.7269452214241028


  0%|          | 0/908 [00:00<?, ?it/s]

Valid,loss0.9304363385104296, Acc [81.9471206665039], recal 0.736892819404602, precision 0.7077347040176392, F1-score 0.7199562191963196


In [16]:
model.load_state_dict(torch.load('best_model.pt'))
avg_test_loss,test_acc,test_pre,test_recall,test_f1 = test(test_dataloader,model,mode='Test')

Test,loss0.407616644052037, Acc [79.2509994506836], recal 0.7992537021636963, precision 0.7044773101806641, F1-score 0.7240293025970459


## Model Classifier  

## Test

In [17]:
test_labels = {}
for transcription_id in test_set:
    with open(path_to_test / f"{transcription_id}.json", "r") as file:
        transcription = json.load(file)

    X_test = []
    for utterance in transcription:
        X_test.append(utterance["speaker"] + ": " + utterance["text"])
        # Tokenize the text data
    X_test_encoded = tokenizer(X_test, padding=True, truncation=True, return_tensors="pt")

    # Move the encoded text data to the device
    X_test_encoded = {key: val.to(device) for key, val in X_test_encoded.items()}

    # Use the model to get predictions
    with torch.no_grad():
        model.eval()
        outputs = model(X_test_encoded)

    # Get the predicted labels
    predicted_labels = outputs.argmax(1)

    # Convert the tensor of predicted labels to a list
    predicted_labels = predicted_labels.tolist()

    test_labels[transcription_id] = predicted_labels

with open("test_labels_text_submission3.json", "w") as file:
    json.dump(test_labels, file, indent=4)

## Evaluation

In [18]:
len(test_labels)

40

## Submission

In [19]:
!pip install jsonargparse

Collecting jsonargparse
  Downloading jsonargparse-4.27.1-py3-none-any.whl (189 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/189.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m184.3/189.7 kB[0m [31m5.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m189.7/189.7 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jsonargparse
Successfully installed jsonargparse-4.27.1


In [21]:
"""
This script converts test_labels.json into submission.csv
python make_submission.py --json_path test_labels_naive_baseline.json
"""
import json
from pathlib import Path


def make_submission(json_path: Path = Path("test_labels_text_submission3.json")):
    with open(json_path, "r") as file:
        test_labels = json.load(file)

    file = open("submission_11.csv", "w")
    file.write("id,target_feature\n")
    for key, value in test_labels.items():
        u_id = [key + "_" + str(i) for i in range(len(value))]
        target = map(str, value)
        for row in zip(u_id, target):
            file.write(",".join(row))
            file.write("\n")
    file.close()

from jsonargparse import CLI

make_submission(Path("test_labels_text_submission3.json"))