In [1]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/gdrive')


Mounted at /content/gdrive


## Preprocessing

In [2]:
import json
from pathlib import Path

def flatten(list_of_list):
    return [item for sublist in list_of_list for item in sublist]

path_to_training = Path("/content/gdrive/MyDrive/inf554-extractive-summarization-2023/training")
path_to_test = Path("/content/gdrive/MyDrive/inf554-extractive-summarization-2023/test")

#####
# training and test sets of transcription ids
#####
training_set = ['ES2002', 'ES2005', 'ES2006', 'ES2007', 'ES2008', 'ES2009', 'ES2010', 'ES2012', 'ES2013', 'ES2015', 'ES2016', 'IS1000', 'IS1001', 'IS1002', 'IS1003', 'IS1004', 'IS1005', 'IS1006', 'IS1007', 'TS3005', 'TS3008', 'TS3009', 'TS3010', 'TS3011', 'TS3012']
training_set = flatten([[m_id+s_id for s_id in 'abcd'] for m_id in training_set])
training_set.remove('IS1002a')
training_set.remove('IS1005d')
training_set.remove('TS3012c')

test_set = ['ES2003', 'ES2004', 'ES2011', 'ES2014', 'IS1008', 'IS1009', 'TS3003', 'TS3004', 'TS3006', 'TS3007']
test_set = flatten([[m_id+s_id for s_id in 'abcd'] for m_id in test_set])

#####
# naive_baseline: all utterances are predicted important (label 1)
#####
test_labels = {}
for transcription_id in test_set:
    with open(path_to_test / f"{transcription_id}.json", "r") as file:
        transcription = json.load(file)

    test_labels[transcription_id] = [1] * len(transcription)

with open("test_labels_naive_baseline.json", "w") as file:
    json.dump(test_labels, file, indent=4)

## Text embedding

In [3]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence-transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence-transformers
  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125923 sha256=6b43c301a04784f61ca764b2fcc7ed0e641edb5cba917e87a5fb600eb18138c9
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence-tr

In [4]:
#####
# text_baseline: utterances are embedded with SentenceTransformer, then train a classifier.
#####
import torch
from sentence_transformers import SentenceTransformer
# Check if a GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load SentenceTransformer model
e5 = SentenceTransformer('intfloat/e5-large-v2').to(device)

Using device: cpu


.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/201 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/67.8k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

handler.py:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

In [None]:
y_training = []
with open("/content/gdrive/MyDrive/inf554-extractive-summarization-2023/training_labels.json", "r") as file:
    training_labels = json.load(file)
X_training = []
for transcription_id in training_set:
    with open(path_to_training / f"{transcription_id}.json", "r") as file:
        transcription = json.load(file)

    for utterance in transcription:
        X_training.append(utterance["speaker"] + ": " + utterance["text"])

    y_training += training_labels[transcription_id]

X_training = bert.encode(X_training, show_progress_bar=True, normalize_embeddings=True)

In [5]:
!pip install node2vec

Collecting node2vec
  Downloading node2vec-0.4.6-py3-none-any.whl (7.0 kB)
Collecting networkx<3.0,>=2.5 (from node2vec)
  Downloading networkx-2.8.8-py3-none-any.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: networkx, node2vec
  Attempting uninstall: networkx
    Found existing installation: networkx 3.2.1
    Uninstalling networkx-3.2.1:
      Successfully uninstalled networkx-3.2.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
lida 0.0.10 requires fastapi, which is not installed.
lida 0.0.10 requires kaleido, which is not installed.
lida 0.0.10 requires python-multipart, which is not installed.
lida 0.0.10 requires uvicorn, which is not installed.[0m[31m
[0mSuccessfully installed networkx-2.8.8 node2vec-0.4.6


There are several ways to embed graphs, and the best method often depends on the specific requirements of your project. Here are a few methods you might consider:

1. **Node2Vec**: This is a popular method for embedding graphs. It treats walks through the graph as sentences and nodes as words, and applies the Word2Vec algorithm to these sentences to create embeddings¹.

2. **Graph Convolutional Networks (GCNs)**: These are a type of neural network designed specifically for graphs. They can create embeddings for nodes in a graph based on their local neighborhood¹.

3. **Spectral Embedding**: This method uses the eigenvectors of the graph Laplacian to create embeddings. It's particularly effective for certain types of graphs, such as community-based graphs where you want to identify clusters of nodes¹.

4. **Laplacian Eigenmaps**: This is another method that uses the graph Laplacian. It's similar to spectral embedding, but has some differences in how it constructs the embeddings¹.

5. **Graph Factorization**: This method factorizes the adjacency matrix of the graph to create embeddings. It's a simpler method than some of the others, but can still be effective¹.

6. **DeepWalk**: This algorithm learns latent representations of vertices in a network. These representations can be used as features for a range of tasks¹.

7. **LINE (Large-scale Information Network Embedding)**: This is a method for embedding very large information networks. It preserves both the local and global network structures¹.

8. **GraphSAGE (Graph Sample and Aggregated)**: This method generates embeddings by sampling and aggregating features from a node's local neighborhood¹.

Remember, each of these methods has its own strengths and weaknesses, and the best choice often depends on the specific characteristics of your graph and the problem you're trying to solve. You might need to experiment with several methods to see which one works best for your project.

(1) Drawing — NetworkX 3.2.1 documentation. https://networkx.org/documentation/stable/reference/drawing.html.
(2) python - Combine (join) networkx Graphs - Stack Overflow. https://stackoverflow.com/questions/32652149/combine-join-networkx-graphs.
(3) Python | Visualize graphs generated in NetworkX using Matplotlib. https://www.geeksforgeeks.org/python-visualize-graphs-generated-in-networkx-using-matplotlib/.
(4) Creating a graph — NetworkX 1.7 documentation. https://networkx.org/documentation/networkx-1.7/tutorial/tutorial.html.
(5) undefined. http://matplotlib.org/.
(6) undefined. http://pygraphviz.github.io/.
(7) undefined. https://www.graphviz.org.
(8) undefined. http://www.graphviz.org/doc/info/lang.html.

In [6]:
import networkx as nx
from sklearn.preprocessing import normalize
from node2vec import Node2Vec
import numpy as np
# Initialize an empty directed graph
X_training = []

for transcription_id in training_set:
    G = nx.DiGraph()

    # Open the transcription file and read each utterance
    with open(path_to_training / f"{transcription_id}.json", "r") as file:
        transcription = json.load(file)

    # Add each utterance to the graph
    for utterance in transcription:
        G.add_node(utterance["index"], text=utterance["text"])

    # Open the edge list file and read each line
    with open(path_to_training / f"{transcription_id}.txt", 'r') as f:
        for line in f:
            # Split the line into source, target, and edge attribute

            source, attribute, target = line.strip().split()

            # Convert source and target to integers
            source = int(source)
            target = int(target)

            # Add the edge to the graph
            G.add_edge(source, target, attribute=attribute)
    node2vec = Node2Vec(G, dimensions=64, walk_length=30, num_walks=200, workers=4)

    # Train Node2Vec model
    model = node2vec.fit(window=10, min_count=1, batch_words=4)

    # Embed nodes

    graph_embeddings = normalize(np.mean(model.wv.vectors, axis=0).reshape(-1,1), axis=0, norm='l2')
    print(graph_embeddings.shape)


Computing transition probabilities:   0%|          | 0/396 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/897 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/924 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/1207 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/126 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/815 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/1057 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/669 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/403 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/806 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/869 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/1047 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/415 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/547 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/765 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/518 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/339 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/856 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/860 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/1364 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/791 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/593 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/785 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/1082 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/229 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/648 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/722 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/563 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/345 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/738 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/915 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/450 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/248 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/672 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/718 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/755 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/357 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/973 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/960 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/1143 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/481 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/713 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/557 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/647 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/585 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/838 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/722 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/1377 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/254 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/772 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/515 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/379 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/782 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/754 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/497 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/358 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/641 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/869 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/1095 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/196 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/672 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/839 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/886 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/249 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/614 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/572 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/357 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/838 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/870 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/1148 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/336 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/422 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/667 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/717 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/542 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/1318 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/1156 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/2160 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/471 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/901 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/934 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/1328 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/714 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/1207 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/1275 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/1215 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/212 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/524 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/689 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/649 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/486 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/985 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/1017 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/911 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/373 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/1487 [00:00<?, ?it/s]

(64, 1)


Computing transition probabilities:   0%|          | 0/1283 [00:00<?, ?it/s]

(64, 1)


In [7]:
X_training = np.array(X_training)
np.save('/content/gdrive/MyDrive/inf554-extractive-summarization-2023/X_training_e5_node2vec_text_graph.npy', X_training)

## Model Classifier  

Best parameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}
Best score for XGBoost: 0.41430686322433263


In [None]:
from scipy.stats import uniform, randint
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV


# Define the parameter distribution
param_dist = {
    'n_estimators': randint(2000, 3000),
    'max_depth': randint(1, 6),
    'learning_rate': uniform(0.01, 0.02),
    'gamma': uniform(0, 1),
    'subsample': uniform(0.5, 0.5),
    'colsample_bytree': uniform(0.5, 0.5),
    'lambda': uniform(1, 3),
    'alpha': uniform(0, 1),
    'scale_pos_weight': uniform(1, 10)
}

# Initialize the model
model = XGBClassifier(tree_method='gpu_hist')

# Perform 5-fold cross-validation
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Define your custom scorer
scorer = make_scorer(f1_score)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    scoring=scorer,
    cv=cv,
    n_jobs=-1,
    n_iter=30,  # Number of parameter settings that are sampled
    random_state=42
)

# Fit RandomizedSearchCV
random_search.fit(X_training, y_training)

# Get the best parameters
best_params = random_search.best_params_

# Print the best parameters
print(f'Best parameters: {best_params}')


    E.g. tree_method = "hist", device = "cuda"



Best parameters: {'alpha': 0.28571208628186073, 'colsample_bytree': 0.9342995640947301, 'gamma': 0.22359583851945264, 'lambda': 3.889667618321834, 'learning_rate': 0.010243089493796327, 'max_depth': 5, 'n_estimators': 2828, 'scale_pos_weight': 4.226863882775836, 'subsample': 0.5217003916490863}


Best parameters: {'alpha': 0.8392001248555798, 'colsample_bytree': 0.6238654947505787, 'gamma': 0.7119453573025232, 'lambda': 1.5156922209287382, 'learning_rate': 0.012878697725951174, 'max_depth': 5, 'n_estimators': 2008, 'scale_pos_weight': 4.114133093912942, 'subsample': 0.9897552643107543}
f1 = 0.5819510885085816

In [None]:
from scipy.stats import uniform, randint
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV


# Define the parameter distribution
param_dist = {'alpha': 0.28571208628186073, 'colsample_bytree': 0.9342995640947301, 'gamma': 0.22359583851945264, 'lambda': 3.889667618321834, 'learning_rate': 0.010243089493796327, 'max_depth': 5, 'n_estimators': 2828, 'scale_pos_weight': 4.226863882775836, 'subsample': 0.5217003916490863}

# Initialize the model
model = XGBClassifier(tree_method='gpu_hist')

# Perform 5-fold cross-validation
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# Define your custom scorer
scorer = make_scorer(f1_score)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    scoring=scorer,
    cv=cv,
    n_jobs=-1,
    n_iter=30,  # Number of parameter settings that are sampled
    random_state=42
)

# Fit RandomizedSearchCV
random_search.fit(X_training, y_training)

# Get the best parameters
best_params = random_search.best_params_

# Print the best parameters
print(f'Best parameters: {best_params}')

TypeError: ignored

In [None]:

from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, make_scorer

# Define your custom scorer
scorer = make_scorer(f1_score)

# Initialize the model with the best parameters
best_params = {'alpha': 0.8392001248555798, 'colsample_bytree': 0.6238654947505787,
 'gamma': 0.7119453573025232, 'lambda': 1.5156922209287382,
 'learning_rate': 0.012878697725951174, 'max_depth': 5, 'n_estimators': 2008,
 'scale_pos_weight': 4.114133093912942, 'subsample': 0.9897552643107543}
# Initialize the model with the best parameters
model = XGBClassifier(
    tree_method='hist',
    device = device,
    n_estimators=best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    learning_rate=best_params['learning_rate'],
    gamma=best_params['gamma'],
    subsample=best_params['subsample'],
    colsample_bytree=best_params['colsample_bytree'],
    reg_lambda=best_params['lambda'],
    alpha=best_params['alpha'],
    scale_pos_weight=best_params['scale_pos_weight']
)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Perform 5-fold cross-validation
cv_scores = cross_val_score(model, X_training, y_training, cv=cv, scoring=scorer, n_jobs=-1)


# Print cross-validated F1-Score
print(f'Cross-validated F1-Score: {cv_scores.mean()}')


In [None]:

from xgboost import XGBClassifier
from sklearn.metrics import f1_score, make_scorer

# Define your custom scorer
scorer = make_scorer(f1_score)

# Initialize the model with the best parameters
# Initialize the model with the best parameters
model = XGBClassifier(
    tree_method='gpu_hist',
    n_estimators=best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    learning_rate=best_params['learning_rate'],
    gamma=best_params['gamma'],
    subsample=best_params['subsample'],
    colsample_bytree=best_params['colsample_bytree'],
    reg_lambda=best_params['lambda'],
    alpha=best_params['alpha'],
    scale_pos_weight=best_params['scale_pos_weight']
)

# Fit the model to your training data
model.fit(X_training, y_training)

# Predict on your validation set
y_pred = model.predict(X_training)

# Evaluate the model using f1-score
score = f1_score(y_training, y_pred)

print(f'F1-Score: {score}')



    E.g. tree_method = "hist", device = "cuda"


    E.g. tree_method = "hist", device = "cuda"

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




F1-Score: 0.7073607612650434


In [None]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from torch.utils.data import DataLoader
from transformers import AdamW
import torch

# Load the DistilBERT tokenizer and model
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')

# Assuming that `texts` is your list of documents and `labels` is your list of labels
texts = ["your text data here..."]
labels = ["your labels here..."]

# Tokenize the texts
inputs = tokenizer(texts, truncation=True, padding=True, return_tensors="pt")

# Convert the labels to tensors
labels = torch.tensor(labels)

# Create a DataLoader
dataset = torch.utils.data.TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels)
dataloader = DataLoader(dataset, batch_size=16)

# Fine-tune the model
optimizer = AdamW(model.parameters(), lr=1e-5)
for epoch in range(epochs):
    for batch in dataloader:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = batch
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

# Now the model is fine-tuned and ready to be used on your task
from sklearn.metrics import f1_score, confusion_matrix
import numpy as np

# Make predictions on the validation set
inputs = tokenizer(texts_val, truncation=True, padding=True, return_tensors="pt")
labels_val = torch.tensor(labels_val)
predictions = model(**inputs).logits
predictions = np.argmax(predictions.detach().numpy(), axis=1)

# Compute the F1 score
f1 = f1_score(labels_val, predictions, average='weighted')
print(f'F1 Score: {f1}')

# Compute and plot the confusion matrix
cm = confusion_matrix(labels_val, predictions)
plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
plt.title('Confusion matrix')
plt.colorbar()
plt.show()


## Test

In [None]:
test_labels = {}
for transcription_id in test_set:
    with open(path_to_test / f"{transcription_id}.json", "r") as file:
        transcription = json.load(file)

    X_test = []
    for utterance in transcription:
        X_test.append(utterance["speaker"] + ": " + utterance["text"])

    X_test = bert.encode(X_test)

    y_test = model.predict(X_test)
    test_labels[transcription_id] = y_test.tolist()

with open("test_labels_text_submission2.json", "w") as file:
    json.dump(test_labels, file, indent=4)

## Evaluation

## Submission

In [None]:
!pip install jsonargparse

Collecting jsonargparse
  Downloading jsonargparse-4.27.0-py3-none-any.whl (188 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/188.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m184.3/188.9 kB[0m [31m5.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.9/188.9 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: jsonargparse
Successfully installed jsonargparse-4.27.0


In [None]:
"""
This script converts test_labels.json into submission.csv
python make_submission.py --json_path test_labels_naive_baseline.json
"""
import json
from pathlib import Path


def make_submission(json_path: Path = Path("test_labels_text_submission2.json")):
    with open(json_path, "r") as file:
        test_labels = json.load(file)

    file = open("submission_2.csv", "w")
    file.write("id,target_feature\n")
    for key, value in test_labels.items():
        u_id = [key + "_" + str(i) for i in range(len(value))]
        target = map(str, value)
        for row in zip(u_id, target):
            file.write(",".join(row))
            file.write("\n")
    file.close()

from jsonargparse import CLI

make_submission(Path("test_labels_text_submission2.json"))