## Inference on Dataset for topic "International Peace and Security"

Meant to be run on a Google Colab Instance, not locally. T4, A100 or V100 should all be sufficient. 

#### Install relevant packages

In [1]:
!pip install accelerate -U
!pip install transformers[sentencepiece]
!pip install datasets
!pip install --force-reinstall -v "openpyxl==3.0.10"
!pip install xformers

Collecting accelerate
  Downloading accelerate-0.29.2-py3-none-any.whl (297 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/297.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.6/297.4 kB[0m [31m3.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.4/297.4 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from 

In [2]:
## Load general packages
import pandas as pd
import numpy as np
from google.colab.data_table import DataTable
from sklearn.model_selection import train_test_split
from google.colab import drive
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
import torch
import os
import tqdm

### Connect to drive

In [3]:
drive.mount('/content/drive')

Mounted at /content/drive


### Download Model from Drive

In [4]:
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer
import torch

model_name_custom = f"deberta-base-int_sec_final_20240321_8911"
mode_custom_path = "/content/drive/MyDrive/unga_int_sec/" + model_name_custom
device = "cuda:0" if torch.cuda.is_available() else "cpu"  # use GPU (cuda) if available, otherwise use CPU

model = AutoModelForSequenceClassification.from_pretrained(mode_custom_path)
tokenizer = AutoTokenizer.from_pretrained(mode_custom_path, use_fast=True, model_max_length=512)

pipe_classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    framework="pt",
    device=device,
    batch_size=32
)

In [5]:
with torch.no_grad():
  torch.cuda.empty_cache()

### Download Data

In [6]:
oos_test = pd.read_csv('https://nextcloud.swp-berlin.org/s/REDACTED/download')
oos_test =  oos_test[['text', 'detail_vote_number', "group_id_alt"]]
oos_test


Unnamed: 0,text,detail_vote_number,group_id_alt
0,"The General Assembly,",4033262,1.0
1,Recalling its resolution 78/160 of 19 December...,4033262,2.0
2,Reaffirming its resolutions 53/199 of 15 Decem...,4033262,3.0
3,Recognizing the importance of creating synergi...,4033262,4.0
4,"Stressing the important role of science, techn...",4033262,5.0
...,...,...,...
309468,Requests the Secretary-General to continue to ...,284003,11.0
309469,Requests all States and international organiza...,284003,12.0
309470,Also requests the Secretary-General to report ...,284003,13.0
309471,,284001,


In [8]:
## drop missing texts, this is because we have all resolutions in the dataset,
## even the ones where the PDF is faulty or we dont have a link
oos_test = oos_test.dropna(subset=['text'])

## delete the text from paragraphs that are simply too long for the classifier
## use only the first 6000 characters of the
oos_test["text"] = oos_test.apply(lambda row: row['text'][:6000] if len(row['text']) > 25000 else row['text'], axis=1)

oos_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  oos_test["text"] = oos_test.apply(lambda row: row['text'][:6000] if len(row['text']) > 25000 else row['text'], axis=1)


Unnamed: 0,text,detail_vote_number,group_id_alt
0,"The General Assembly,",4033262,1.0
1,Recalling its resolution 78/160 of 19 December...,4033262,2.0
2,Reaffirming its resolutions 53/199 of 15 Decem...,4033262,3.0
3,Recognizing the importance of creating synergi...,4033262,4.0
4,"Stressing the important role of science, techn...",4033262,5.0
...,...,...,...
309466,Notes with satisfaction the national efforts o...,284003,9.0
309467,"Commends the international community, includin...",284003,10.0
309468,Requests the Secretary-General to continue to ...,284003,11.0
309469,Requests all States and international organiza...,284003,12.0


In [9]:
dir_path = '/content/drive/MyDrive/unga_int_sec/preds_20240416'
if not os.path.exists(dir_path):
    os.makedirs(dir_path)

In [10]:
text_lst = oos_test["text"].tolist()

# in principle it would be better to use a Dataset object from the transformers library
# however, that does not allow for saving intermediate results and since it is a lot of text
# sometimes the google colab crashes or gets disconnected. This way it probably takes
# about half an hour longer but that doesn´t matter
def save_output(output, chunk_idx):
    """Save intermediate results to a CSV file."""
    file_path = f"{dir_path}/predictions_eval_set_chunk_{chunk_idx}.csv"
    df_temp = pd.DataFrame([data[0] for data in output])
    eval_temp = pd.concat([oos_test.iloc[chunk_idx*1000:(chunk_idx+1)*1000].reset_index(drop=True), df_temp], axis=1, ignore_index=True)
    eval_temp.to_csv(file_path, index=False)

def process_texts(texts):
    """Process a list of texts in chunks and save outputs."""
    num_chunks = len(texts) // 1000 + (1 if len(texts) % 1000 != 0 else 0)
    with torch.no_grad():
      torch.cuda.empty_cache()

    with tqdm.tqdm(total=len(texts)) as pbar:
        for chunk_idx in range(num_chunks):
            file_path = f"{dir_path}/predictions_eval_set_chunk_{chunk_idx}.csv"

            # Check if the file is already saved
            if os.path.exists(file_path):
                pbar.update(1000)  # Update the progress bar by 1000 steps
                continue

            start_idx = chunk_idx * 1000
            end_idx = start_idx + 1000
            current_chunk_output = [pipe_classifier(text) for text in texts[start_idx:end_idx]]

            save_output(current_chunk_output, chunk_idx)
            pbar.update(1000)  # Update the progress bar by 1000 steps

process_texts(text_lst)

310000it [00:00, 7457695.18it/s]          
