In [None]:
# Installing and importing required libraries

!pip install sentence_transformers networkx torch pandas numpy matplotlib tqdm

import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
import json
from pathlib import Path
import networkx as nx
from collections import defaultdict


## Preprocessing

In [5]:
# Copilot built the descriptions everywhere


# Directories holding the training and test data, relative to the working directory.
# Later cells read "<transcription_id>.json" and "<transcription_id>.txt" from them.
path_to_training = Path("training")
path_to_test = Path("test")

def flatten(list_of_list):
    """
    Concatenate the items of every inner iterable into one flat list.

    The order of the outer iterable, and of the items inside each inner
    iterable, is preserved. Only one level of nesting is removed.
    """
    flat = []
    for sublist in list_of_list:
        flat.extend(sublist)
    return flat


# Meeting identifiers of the training split; every meeting is expanded below
# into four session ids by appending the suffixes 'a', 'b', 'c' and 'd'.
training_set = [
    'ES2002', 'ES2005', 'ES2006', 'ES2007', 'ES2008', 'ES2009', 'ES2010',
    'ES2012', 'ES2013', 'ES2015', 'ES2016',
    'IS1000', 'IS1001', 'IS1002', 'IS1003', 'IS1004', 'IS1005', 'IS1006', 'IS1007',
    'TS3005', 'TS3008', 'TS3009', 'TS3010', 'TS3011', 'TS3012',
]
# These three sessions are excluded from the training split.
training_set = [
    m_id + s_id
    for m_id in training_set
    for s_id in 'abcd'
    if m_id + s_id not in ('IS1002a', 'IS1005d', 'TS3012c')
]

# Meeting identifiers of the test split, expanded to session ids the same way.
test_set = [
    'ES2003', 'ES2004', 'ES2011', 'ES2014',
    'IS1008', 'IS1009',
    'TS3003', 'TS3004', 'TS3006', 'TS3007',
]
test_set = [m_id + s_id for m_id in test_set for s_id in 'abcd']


def read_discourse_graph(file_path):
    """
    Read a discourse graph from a file and return a list of edges.

    Each non-blank line is split on whitespace and read as
    ``<source> <relation> <target>``; any further tokens are ignored.

    Parameters
    ----------
    file_path : str or path-like
        Path of the text file to read.

    Returns
    -------
    list of tuple
        One ``(source, target, {'relation': relation})`` triple per edge, in
        file order, with ``source`` and ``target`` converted to ``int``. This
        is the shape accepted by ``networkx`` ``add_edges_from``.

    Raises
    ------
    ValueError
        If a source or target token is not an integer.
    IndexError
        If a non-blank line has fewer than three tokens.
    """
    edges = []
    with open(file_path, 'r') as file:
        for line in file:
            parts = line.split()
            # Blank or whitespace-only lines (e.g. a trailing newline at the
            # end of the file) carry no edge; skip them instead of crashing.
            if not parts:
                continue
            source = int(parts[0])
            relation = parts[1]
            target = int(parts[2])
            edges.append((source, target, {'relation': relation}))

    return edges

def create_graph(edges):
    """
    Build a directed networkx graph (``nx.DiGraph``) from a list of edges.

    ``edges`` is passed unchanged to ``add_edges_from``; the new graph is
    returned.
    """
    discourse_graph = nx.DiGraph()
    discourse_graph.add_edges_from(edges)
    return discourse_graph


# Creating the training set
y_training = []      # labels of all utterances, concatenated over the transcriptions
word_training = []   # "<speaker>: <text>" string for every utterance
graph_training = []  # one discourse graph per transcription

# Relation counters keyed by *global* utterance index (offset n + local index).
# A sparse mapping is used instead of a fixed-size pre-allocated list so that
# there is no hard cap on the number of utterances.
relation_counts = defaultdict(lambda: defaultdict(int))
n = 0  # global index of the first utterance of the current transcription

with open("training_labels.json", "r") as file:
    training_labels = json.load(file)

for transcription_id in training_set:
    # Read the text file for the discourse graph and count, for every
    # utterance, how many edges of each relation type touch it (as source or
    # as target).
    graph_file_path = path_to_training / f"{transcription_id}.txt"
    edges = read_discourse_graph(graph_file_path)
    graph = create_graph(edges)
    graph_training.append(graph)
    for source, target, relation in edges:
        rel = relation['relation']
        relation_counts[n + source][rel] += 1
        relation_counts[n + target][rel] += 1

    # Read the JSON file for the transcription
    with open(path_to_training / f"{transcription_id}.json", "r") as file:
        transcription = json.load(file)

    for utterance in transcription:
        word_training.append(utterance["speaker"] + ": " + utterance["text"])

    y_training += training_labels[transcription_id]
    # The offset advances by the number of labels of this transcription.
    # NOTE(review): this assumes one label per utterance - confirm on the data.
    n += len(training_labels[transcription_id])

# One counter dict per labelled utterance, in global order (utterances that no
# edge touches get an empty counter).
tab_train = [relation_counts[index] for index in range(n)]

# dataframe of utterances relations counts

# Collect every relation name seen in training. Sorting makes the feature
# column order deterministic: iterating a set of strings can yield a different
# order in every interpreter session because of string-hash randomisation.
ensemble_cles = set()
for dictionnaire in tab_train:
    ensemble_cles.update(dictionnaire.keys())
ensemble_cles = sorted(ensemble_cles)

# One row per utterance, one column per relation; 0 where a relation is absent.
matrice = [
    [dictionnaire.get(cle, 0) for cle in ensemble_cles]
    for dictionnaire in tab_train
]

# Creating the dataframe
tab_train = pd.DataFrame(matrice, columns=ensemble_cles)


In [None]:
# Data augmentation by pairwise merging.
# Walking through the labels in order, each utterance labelled 1 is merged
# with the next utterance whose label is not 0: the two texts are joined with
# a space, their relation counts are added, and the result is appended as a new
# example labelled 1. After a merge the scan resumes just past the second
# utterance, so an utterance takes part in at most one merged pair.
# NOTE(review): pairs are formed over the flat list, so a pair can span two
# consecutive transcriptions - confirm this is intended.
word_training_augmented = list(word_training)
y_training_augmented = list(y_training)

# Feature rows of the merged examples. They are collected here and
# concatenated once at the end: DataFrame.append was removed in pandas 2.0,
# and growing a frame row by row copies the whole frame at every step.
merged_rows = []

i = 0
while i < len(y_training) - 1:
    if y_training[i] == 1:
        j = i+1
        while j < len(y_training) and y_training[j] == 0:
            j+=1
        if j < len(y_training):
            new_utterance = word_training[i] + ' ' + word_training[j]
            word_training_augmented.append(new_utterance)
            y_training_augmented.append(1)
            merged_rows.append(tab_train.iloc[i] + tab_train.iloc[j])

        i = j
    i+=1

if merged_rows:
    tab_training_augmented = pd.concat(
        [tab_train, pd.DataFrame(merged_rows)], ignore_index=True
    )
else:
    # Nothing was merged: keep an independent copy of the original features.
    tab_training_augmented = tab_train.copy()

print(tab_training_augmented[-10:])


# Embedding

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using device : {device}')
# Load the 'intfloat/e5-large-v2' sentence-embedding model and move it to the selected device.
e5 = SentenceTransformer('intfloat/e5-large-v2').to(device)

In [None]:
# Embed every training utterance (original and merged) with the e5 model.
X_training = e5.encode(word_training_augmented, show_progress_bar=True, normalize_embeddings=True)
df_X_training = pd.DataFrame(X_training)

# Concatenate the relation-count features (tab_training_augmented) and the
# embeddings along the columns; rows are matched on the DataFrame index.
X_training = pd.concat([tab_training_augmented, df_X_training], axis=1)

# Model Building

In [None]:
from scipy.stats import uniform, randint
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from tqdm import tqdm


# Define the parameter distribution
# scipy's uniform(loc, scale) samples from [loc, loc + scale] and
# randint(low, high) samples integers from [low, high).
param_dist = {
    'n_estimators': randint(2000, 3000),
    'max_depth': randint(1, 6),
    'learning_rate': uniform(0.01, 0.02),
    'gamma': uniform(0, 1),
    'subsample': uniform(0.5, 0.5),
    'colsample_bytree': uniform(0.5, 0.5),
    'lambda': uniform(1, 3),
    'alpha': uniform(0, 1),
    'scale_pos_weight': uniform(1, 10)
}

# Initialize the model
model = XGBClassifier(tree_method='hist', device = device)

# 3-fold stratified cross-validation used inside the hyper-parameter search
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Score candidates with f1_score using its default arguments
scorer = make_scorer(f1_score)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    scoring=scorer,
    cv=cv,
    n_jobs=-1,
    n_iter=20,  # Number of parameter settings that are sampled
    random_state=40,
    verbose = 1
)


# The progress bar is cosmetic: it is only updated once, after fit() has
# returned, so it stays at 0/20 for the whole search.
with tqdm(total=20, desc="RandomizedSearchCV") as pbar:
    # Fit RandomizedSearchCV
    random_search.fit(X_training, y_training_augmented)
    pbar.update(20)  # Manually update the progress bar to its completion


# Get the best parameters
best_params = random_search.best_params_

# Print the best parameters
print(f'Best parameters: {best_params}')

RandomizedSearchCV:   0%|          | 0/20 [00:00<?, ?it/s]

Fitting 3 folds for each of 20 candidates, totalling 60 fits


In [None]:

# Initialize the model with the best parameters found by the randomized search
model = XGBClassifier(tree_method='hist', device = device, **best_params )

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform 5-fold stratified cross-validation on the augmented training data,
# scored with the F1 scorer defined for the search
cv_scores = cross_val_score(model, X_training, y_training_augmented, cv=cv, scoring=scorer, n_jobs=-1)


# Print the mean cross-validated F1-Score over the 5 folds
print(f'Cross-validated F1-Score: {cv_scores.mean()}')


In [None]:

# Fit the model to the full (augmented) training data
model.fit(X_training, y_training_augmented)

# Predict on the same data the model was just fitted on (not a held-out set)
y_pred = model.predict(X_training)

# F1-score on the training data: a training score, not a validation score
score = f1_score(y_training_augmented, y_pred)

print(f'F1-Score: {score}')

In [None]:
# Score on the original (non-augmented) training utterances.
# These utterances are part of the data the model was fitted on, so this is
# still a training score.

X_training_0 = e5.encode(word_training, show_progress_bar=True, normalize_embeddings=True)
df_X_training_0 = pd.DataFrame(X_training_0)

# Concatenate the relation-count features (tab_train) and the embeddings along the columns

X_training_0 = pd.concat([tab_train, df_X_training_0], axis=1)

# Predict on the original training utterances
y_pred_0 = model.predict(X_training_0)

# Evaluate the model using f1-score
score = f1_score(y_training, y_pred_0)

print(f'F1-Score: {score}')

# Testing

In [None]:
# Predict the labels of every test transcription, one transcription at a time,
# and write all predictions to a single JSON file.
n = 0            # running count of test utterances read so far
graph_test = []  # one discourse graph per test transcription
test_labels = {} # transcription_id -> list of predicted labels

for transcription_id in test_set:
    X_test = []

    # Read the JSON file for the transcription
    with open(path_to_test / f"{transcription_id}.json", "r") as file:
        transcription = json.load(file)

    for utterance in transcription:
        X_test.append(utterance["speaker"] + ": " + utterance["text"])
        n+=1

    # Read the text file for the discourse graph and count, per utterance, the
    # edges of each relation type that touch it. The counters are local to the
    # current transcription, so edge indices are used without any offset.
    tab_test = [defaultdict(int) for _ in range(len(transcription))]
    graph_file_path_t = path_to_test / f"{transcription_id}.txt"
    edges = read_discourse_graph(graph_file_path_t)
    graph = create_graph(edges)
    graph_test.append(graph)
    for source , target , relation in edges:
        rel  =  relation['relation']
        tab_test[source][rel] +=1
        tab_test[target][rel] +=1

    X_test = e5.encode(X_test, show_progress_bar=True, normalize_embeddings=True)

    df_X_test = pd.DataFrame(X_test)

    # Use exactly the relation columns built from the training data
    # (ensemble_cles), in the same order; relations that were not seen in
    # training are therefore ignored here.
    matrice = []
    for dictionnaire in tab_test:
        ligne = [dictionnaire.get(cle, 0) for cle in ensemble_cles]
        matrice.append(ligne)

    # Creating the dataframe
    tab_test = pd.DataFrame(matrice, columns=ensemble_cles)

    # Concatenate tab_test and X_test along the columns
    X_testing = pd.concat([tab_test, df_X_test], axis=1)

    y_pred = model.predict(X_testing)
    test_labels[transcription_id] = y_pred.tolist()


with open("test_labels_text_submission25.json", "w") as file:
    json.dump(test_labels, file, indent=4)

In [None]:
!pip install jsonargparse

Collecting jsonargparse
  Downloading jsonargparse-4.27.1-py3-none-any.whl (189 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m189.7/189.7 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jsonargparse
Successfully installed jsonargparse-4.27.1


In [None]:
"""
This script converts test_labels.json into submission.csv
python make_submission.py --json_path test_labels_naive_baseline.json
"""
import json
from pathlib import Path


def make_submission(
    json_path: Path = Path("test_labels_text_submission25.json"),
    output_path: Path = Path("submission_with_augmentation_1.csv"),
):
    """
    Convert a JSON file of predicted labels into a submission CSV.

    Parameters
    ----------
    json_path : Path
        JSON file mapping each key to a list of values.
    output_path : Path
        CSV file to write. Defaults to the file name this notebook has always
        used, so existing calls behave as before.

    The CSV starts with the header ``id,target_feature`` followed by one row
    ``<key>_<position>,<value>`` per list element, where ``position`` is the
    0-based index of the value in its list.
    """
    with open(json_path, "r") as file:
        test_labels = json.load(file)

    # The context manager closes the output file even if a write fails.
    with open(output_path, "w") as file:
        file.write("id,target_feature\n")
        for key, value in test_labels.items():
            for position, label in enumerate(value):
                file.write(f"{key}_{position},{label}\n")

from jsonargparse import CLI

# Build the submission CSV from the predictions written by the testing cell.
make_submission(Path("test_labels_text_submission25.json"))