In [1]:
import pandas as pd
import os
import numpy as np

## Load model
we will use apre-trained bert for ukrainian, english and russian `MaxVortman/bert-base-ukr-eng-rus-uncased`

In [51]:
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification
#modelname = "bert-base-multilingual-uncased"
#modelname = "MaxVortman/bert-base-ukr-eng-rus-uncased"
modelname = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(modelname)
model = AutoModelForTokenClassification.from_pretrained(modelname,num_labels=3)

OSError: distilbert-base-multilingual-cased does not appear to have a file named config.json. Checkout 'https://huggingface.co/distilbert-base-multilingual-cased/None' for available files.

In [71]:
for param in list(model.distilbert.parameters()):
    param.requires_grad = False

## Where our data is

In [72]:
dataset_dir = "./nlp-telegram-locations-extractions"
ru_geo_dataset_path = os.path.join(dataset_dir, "ru_geo_dataset.csv")
uk_geo_dataset_path = os.path.join(dataset_dir, "uk_geo_dataset.csv")

In [73]:
df = pd.read_csv(uk_geo_dataset_path, skiprows=lambda x: x > 10000)
df = pd.DataFrame([i for i in df.values if len(i[1]) > 7], columns=df.columns)

In [74]:
df.head()

Unnamed: 0,text,loc_markers,org_markers,per_markers,is_valid
0,Подібні розіграші проводили в Великій Британії.,"[(30, 46)]",[],[],0
1,У Львові 34-річний мешканець Яворівського раи...,"[(2, 8), (30, 50)]",[],[],0
2,"Нагадаємо, президент України Володимир Зеленсь...","[(21, 28)]","[(80, 94)]","[(29, 49)]",0
3,"Слід зауважити, що протягом останнього часу в ...","[(85, 92), (139, 147)]",[],[],0
4,"Тим часом, О.Паращій вважає, що зміна глави Мі...","[(87, 94)]","[(44, 51), (97, 100)]","[(11, 20)]",0


In [75]:
df2 = pd.read_csv(ru_geo_dataset_path, skiprows=lambda x: x > 10000)
df2 = pd.DataFrame([i for i in df2.values if len(i[1]) > 7], columns=df2.columns)

In [76]:
df2.head()

Unnamed: 0,text,loc_markers,org_markers,per_markers,doc_id,sent_id
0,Вице-премьер по социальным вопросам Татьяна Го...,"[(82, 88)]","[(149, 160)]","[(36, 52)]",0,0
1,"По словам Голиковой, чаще всего онкологические...","[(89, 98), (100, 108), (110, 118), (121, 139),...",[],"[(10, 19)]",0,1
2,"Вице-премьер напомнила, что главные факторы см...","[(57, 63)]",[],[],0,2
3,Австрийские правоохранительные органы не предс...,"[(0, 11)]",[],[],1,0
4,Об этом сообщил посол России в Вене Дмитрий Лю...,"[(22, 28), (31, 35)]","[(154, 158)]","[(36, 53)]",1,1


## Turn the string location markers to indices

In [77]:
def string_to_indices(s):
    if len(s) < 3:
        return []
    ints = [int(l[:-1]) if i%2 == 1 else int(l[1:]) for i,l in enumerate(s[1:-1].split(", "))]
    return [ints[i:i+2] for i in range(0,len(ints),2)]

## A function that creates NER labels with the given pre-trained tokenizer

## Train

In [78]:
ru_dataset, uk_dataset

(Dataset({
     features: ['text', 'loc_markers', 'org_markers', 'per_markers'],
     num_rows: 298
 }),
 Dataset({
     features: ['text', 'loc_markers', 'org_markers', 'per_markers'],
     num_rows: 228
 }))

In [79]:
label_names = {
    1: "B-LOC",
    0: "O",
    2: "I-LOC"
}

In [80]:
def to_data(df):
    Samples = tuple(df['text'])
    Markers = df["loc_markers"]

    tokenized_samples = tokenizer(Samples, max_length=512, padding='max_length')
    Labels = []
    Spans = []
    Tokens = []
    Ner = []

    for ID,(sample,markers) in enumerate(zip(Samples,Markers)):
        marker_indices = string_to_indices(markers)

        if len(marker_indices) > 0:

            marker_indices.sort(key = lambda x: x[0]) # don't overlap

            lbl = []
            s = 0
            for ms,me in marker_indices:
                lbl.append(((s,ms),0))
                lbl.append(((ms,me),1))
                s = me
            if marker_indices[0][0] == 0:
                lbl.pop(0)
            if lbl[-1][1] != len(sample):
                lbl.append(((s,len(sample)),0))

            span = []
            label = []
            tokenized = []
            for i,((s,e),k) in enumerate(lbl):
                t = tokenizer.tokenize(sample[s:e])
                if i == 0:
                    label += [0]*len(t)
                else:
                    label += [1]+[2]*(len(t)-1)
                    span.append([f"LOC: {tt}" for tt in t])
                tokenized += t
            label = label
        else:
            label = [0]*(len(tokenized_samples['input_ids'][ID])-2)
            tokenized = tokenizer.tokenize(sample)
            span = []

        """"
        try
            tokenized = tokenizer.tokenize(sample)
            assert(len(label)==(len(tokenized)+2))
        except:
            label = [-100]+[0]*(len(tokenized))+[-100]
            print(len(label), len(tokenized), tokenized, "\n\n")
        """

        Ner.append([label_names[i] for i in label])
        Labels.append([-100]+label+[-100])
        Tokens.append(tokenized)
        Spans.append(span)

    tokenized_samples["labels"] = Labels
    #tokenized_samples["ner_tags"] = Ner
    #tokenized_samples["labels"] = Ner
    tokenized_samples["spans"] = Spans
    tokenized_samples["tokens"] = Tokens
    return tokenized_samples



In [81]:
tokenizer("Hello World")

{'input_ids': [101, 31178, 10315, 102], 'attention_mask': [1, 1, 1, 1]}

In [82]:
from datasets import load_dataset, Dataset
from datasets import concatenate_datasets

# Load your CSV file
df = pd.read_csv(uk_geo_dataset_path, skiprows=lambda x: x > 1000)
df2 = pd.read_csv(ru_geo_dataset_path, skiprows=lambda x: x > 1000)
#ru_dataset = load_dataset('csv', data_files=[ru_geo_dataset_path])
#uk_dataset = load_dataset('csv', data_files=[uk_geo_dataset_path])
uk_dataset = Dataset.from_pandas(pd.DataFrame([row for row in df.values if row[1] != "[]"], columns=df.columns))
ru_dataset = Dataset.from_pandas(pd.DataFrame([row for row in df2.values if row[1] != "[]"], columns=df2.columns))

ru_dataset = ru_dataset.remove_columns(['doc_id', 'sent_id'])
uk_dataset = uk_dataset.remove_columns(['is_valid'])


dataset = concatenate_datasets([ru_dataset, uk_dataset])
dataset = dataset.train_test_split(test_size=0.3, shuffle=True)

tokenized_dataset = dataset.map(to_data, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(["org_markers", "per_markers"])

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


Map:   0%|          | 0/368 [00:00<?, ? examples/s]

Map:   0%|          | 0/158 [00:00<?, ? examples/s]

In [83]:
from transformers import TrainingArguments

BATCH_SIZE = 4
NUM_EPOCHS = 3

args = TrainingArguments(
    modelname,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    logging_strategy="steps",
    evaluation_strategy="epoch",
    save_strategy="steps",
    num_train_epochs=NUM_EPOCHS,
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    include_tokens_per_second=True,
    use_cpu=True
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [84]:
args.set_dataloader(train_batch_size=BATCH_SIZE, eval_batch_size=BATCH_SIZE)

TrainingArguments(
_n_gpu=0,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=True,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=epoch,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
gradient_checkpointing_kwargs=None,
greater_is_better=None,
group_by_length=False,
half_precision_backend=a

In [85]:
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW

optimizer = AdamW([
    {'params': list(model.classifier.parameters()), 'lr': 1e-3}
])

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=100,
    num_training_steps=500)

In [86]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer, padding="max_length")

In [87]:
from datasets import load_metric
import evaluate

seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_names[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    flattened_results = {
        "overall_precision": results["overall_precision"],
        "overall_recall": results["overall_recall"],
    }

    return flattened_results


In [88]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler),
)


In [89]:
os.environ["WANDB_DISABLED"] = "true"

In [90]:
trainer.train()

Epoch,Training Loss,Validation Loss,Overall Precision,Overall Recall
1,No log,0.650479,0.128713,0.188406
2,No log,0.634982,0.098684,0.124224
3,No log,0.61225,0.161145,0.221532


TrainOutput(global_step=276, training_loss=0.6175286666206692, metrics={'train_runtime': 5255.3485, 'train_samples_per_second': 0.21, 'train_steps_per_second': 0.053, 'train_tokens_per_second': 107.557, 'total_flos': 144243630489600.0, 'train_loss': 0.6175286666206692, 'epoch': 3.0})

In [91]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'loc_markers', 'input_ids', 'attention_mask', 'labels', 'spans', 'tokens'],
        num_rows: 368
    })
    test: Dataset({
        features: ['text', 'loc_markers', 'input_ids', 'attention_mask', 'labels', 'spans', 'tokens'],
        num_rows: 158
    })
})

In [92]:
trainer.save_model("model1")

In [93]:
m = AutoModelForTokenClassification.from_pretrained("model1")

In [94]:
for param in list(m.distilbert.parameters()):
    param.requires_grad = False

In [99]:
# Load your CSV file
df1 = pd.read_csv(uk_geo_dataset_path, skiprows=lambda x: x > 25000)
df21 = pd.read_csv(ru_geo_dataset_path, skiprows=lambda x: x > 25000)
#ru_dataset = load_dataset('csv', data_files=[ru_geo_dataset_path])
#uk_dataset = load_dataset('csv', data_files=[uk_geo_dataset_path])
uk_dataset1 = Dataset.from_pandas(pd.DataFrame([row for row in df1.values if row[1] != "[]"], columns=df.columns))
ru_dataset1 = Dataset.from_pandas(pd.DataFrame([row for row in df21.values if row[1] != "[]"], columns=df2.columns))

ru_dataset1 = ru_dataset1.remove_columns(['doc_id', 'sent_id'])
uk_dataset1 = uk_dataset1.remove_columns(['is_valid'])


dataset1 = concatenate_datasets([ru_dataset1, uk_dataset1])
dataset1 = dataset1.train_test_split(test_size=0.3, shuffle=True)

tokenized_dataset1 = dataset1.map(to_data, batched=True)
tokenized_dataset1 = tokenized_dataset1.remove_columns(["org_markers", "per_markers"])

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


Map:   0%|          | 0/8797 [00:00<?, ? examples/s]

Map:   0%|          | 0/3771 [00:00<?, ? examples/s]

In [100]:
from transformers import Trainer

trainer = Trainer(
    model=m,
    args=args,
    train_dataset=tokenized_dataset1["train"],
    eval_dataset=tokenized_dataset1["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler),
)


In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss


In [None]:
trainer.save_model("model2")

In [None]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler),
)


In [None]:
p = trainer.predict(tokenized_dataset["test"])

In [None]:
predictions, labels = p[0], p[1]
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_names[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = seqeval.compute(predictions=true_predictions, references=true_labels)
flattened_results = {
    "overall_precision": results["overall_precision"],
    "overall_recall": results["overall_recall"],
}

In [None]:
flattened_results