In [2]:
# Colab-only step: mount the user's Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import sys
import subprocess
import numpy as np
import pandas as pd

import transformers as tr
import torch
import torch.nn as nn
from torch.utils.data import Dataset

In [None]:
# CSV file holding the corpus; the path is relative to the working directory.
DATA_PATH = 'Data/7allV03.csv'

# Read the whole file into a DataFrame and report its (rows, columns) shape.
df = pd.read_csv(DATA_PATH)
print('Shape of the data :{}'.format(df.shape))

Shape of the data :(4900, 2)


In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'device set to {device}')

device set to cpu


In [None]:
# Collect the distinct category names in order of first appearance.
# The names are stripped BEFORE de-duplication: stripping afterwards would keep
# two entries for categories that differ only by surrounding whitespace
# (e.g. 'spor' and 'spor '), inflating the label count and making the
# id <-> label maps built from this list inconsistent.
raw_labels = df['category'].unique().tolist()
labels = list(dict.fromkeys(s.strip() for s in raw_labels))
print(f'No of labels = {len(labels)}')
print(f'labels = {labels}')

No of labels = 7
labels = ['siyaset', 'dunya', 'ekonomi', 'kultur', 'saglik', 'spor', 'teknoloji']


In [None]:
# Build the two lookup tables between class names and integer class ids.
# Ids are the positions of the names in `labels`.
NUM_LABELS = len(labels)
id2label = dict(enumerate(labels))
label2id = {name: idx for idx, name in id2label.items()}

print(f'id2label = {id2label}')
print(f'label2id = {label2id}')

id2label = {0: 'siyaset', 1: 'dunya', 2: 'ekonomi', 3: 'kultur', 4: 'saglik', 5: 'spor', 6: 'teknoloji'}
label2id = {'siyaset': 0, 'dunya': 1, 'ekonomi': 2, 'kultur': 3, 'saglik': 4, 'spor': 5, 'teknoloji': 6}


In [None]:
# Add an integer `labels` column: each category name is stripped so that it
# matches the whitespace-free keys of label2id, then looked up.
df['labels'] = df['category'].map(lambda raw_name: label2id[raw_name.strip()])

In [None]:
# Load the pretrained Turkish BERT tokenizer.
# `model_max_length` is the tokenizer setting that `truncation=True` truncates
# to when a call does not pass its own `max_length`; the previously used
# `max_length` keyword is not that setting, so it did not reliably cap inputs.
tokenizer = tr.BertTokenizerFast.from_pretrained(
    "dbmdz/bert-base-turkish-uncased",
    model_max_length=512)

# Load the same checkpoint with a fresh classification head sized for our
# classes; the label maps are stored in the model config.
model = tr.BertForSequenceClassification.from_pretrained(
    "dbmdz/bert-base-turkish-uncased",
    num_labels=NUM_LABELS, id2label=id2label, label2id=label2id)
model.to(device)

# splitting dataset into train/validation/test (50% / 25% / 25%)
#
# The rows are shuffled (with a fixed seed, so the split is reproducible)
# before slicing. A purely positional split is only valid when the file is
# already in random order; if the CSV is grouped by category, the first half
# would contain only some of the classes and the model would never be trained
# on the classes it is evaluated on.
SPLIT_SEED = 42
SIZE = df.shape[0]
shuffled_df = df.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)

# Slice boundaries (same proportions as before).
train_end = SIZE // 2
val_end = (3 * SIZE) // 4

train_texts = list(shuffled_df['text'][:train_end])
val_texts = list(shuffled_df['text'][train_end:val_end])
test_texts = list(shuffled_df['text'][val_end:])

train_labels = list(shuffled_df['labels'][:train_end])
val_labels = list(shuffled_df['labels'][train_end:val_end])
test_labels = list(shuffled_df['labels'][val_end:])

train_size,val_size,test_size = (len(train_texts), len(val_texts), len(test_texts))
# Every row must land in exactly one split.
assert train_size + val_size + test_size == SIZE
print(f'train size = {train_size} ; test size = {test_size} ; val size ={val_size}')

# Tokenise the three splits with identical options: over-long texts are
# truncated and shorter ones are padded within each call.
ENCODE_OPTIONS = dict(truncation=True, padding=True)

train_encodings = tokenizer(train_texts, **ENCODE_OPTIONS)
val_encodings = tokenizer(val_texts, **ENCODE_OPTIONS)
test_encodings = tokenizer(test_texts, **ENCODE_OPTIONS)

Some weights of the model checkpoint at dbmdz/bert-base-turkish-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification wer

train size = 2450 ; test size = 1225 ; val size =1225


In [None]:
# Inspect the type of the object returned by the tokenizer call.
type(train_encodings) # BatchEncoding object

transformers.tokenization_utils_base.BatchEncoding

In [None]:
# List the fields the tokenizer produced for the training texts.
train_encodings.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

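`token_type_ids` are the segment IDs. BERT can take two sequences as input, such as a question and an answer, and it tells them apart by assigning a different segment ID to each. For single-sequence inputs, as in this notebook, every token belongs to the first segment, so all values are 0.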

In [None]:
# np.array(train_encodings['input_ids'][0])

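The `attention_mask` tells the model which positions to attend to: 1 marks a real token and 0 marks padding. A mask of all ones is used when the entire input sequence is important for the model's prediction. For example, in text classification or sequence labeling tasks, the model needs to make predictions based on the whole input sequence without ignoring any tokens. Because the texts here are padded to a common length (`padding=True`), the padded positions receive a mask value of 0 so that the model ignores them.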

In [None]:
# np.array(train_encodings['attention_mask'][0])

In [None]:
# np.array(train_encodings['token_type_ids'][0])

In [None]:
# idx = 0
# {key:torch.tensor(value[idx],dtype=torch.float32) for key,value in train_encodings.items()}

In [None]:
class DataLoader(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Despite its name this is a dataset, not a batch loader: it returns one
    example per index and is what gets passed to the Trainer as
    ``train_dataset`` / ``eval_dataset``.

    Args:
        encodings: mapping from field name (e.g. ``input_ids``) to a sequence
            holding one entry per example, as returned by the tokenizer.
        labels: sequence of class ids, one per example.

    Raises:
        ValueError: if any encoding field does not have exactly one entry per
            label.
    """
    def __init__(self,encodings,labels):
        # Fail early on misaligned inputs instead of pairing texts with the
        # wrong labels (or hitting an IndexError) in the middle of training.
        for field, values in encodings.items():
            if len(values) != len(labels):
                raise ValueError(
                    f"encodings[{field!r}] has {len(values)} entries "
                    f"but there are {len(labels)} labels"
                )
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self,index):
        """Return the tensors of example ``index`` plus its ``label``."""
        # Read from this instance's own encodings. The previous version read
        # the global `train_encodings`, so validation and test datasets
        # silently returned training features with their own labels.
        item = {key:torch.tensor(value[index]) for key,value in self.encodings.items()}
        item['label'] = torch.tensor(self.labels[index])
        return item

    def __len__(self):
        """Number of examples, i.e. the number of labels."""
        return len(self.labels)

In [None]:
# Wrap each split's encodings and labels in the dataset class defined above.
# (The variable names say "dataloader", but these objects are datasets.)
train_dataloader, val_dataloader, test_dataset = (
    DataLoader(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

In [None]:
# Bare expression: the notebook displays the class this attribute resolves to.
tr.TrainingArguments

transformers.training_args.TrainingArguments

In [None]:
# Bare expression: the notebook displays the class this attribute resolves to.
tr.Trainer

transformers.trainer.Trainer

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [None]:
def compute_metrics(pred):
    """Turn one evaluation pass into a dictionary of summary scores.

    Args:
        pred: object exposing ``label_ids`` (the true class ids) and
            ``predictions`` (per-class scores; the predicted class is the
            argmax over the last axis).

    Returns:
        dict with keys ``Accuracy``, ``F1``, ``Precision`` and ``Recall``;
        the last three are macro-averaged over the classes.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # The fourth element returned by sklearn (support) is not needed here.
    macro_scores = precision_recall_fscore_support(y_true, y_pred, average='macro')
    macro_precision, macro_recall, macro_f1 = macro_scores[:3]

    return dict(
        Accuracy=accuracy_score(y_true, y_pred),
        F1=macro_f1,
        Precision=macro_precision,
        Recall=macro_recall,
    )

In [None]:
# Hyper-parameters and bookkeeping options handed to the Trainer.
training_args = tr.TrainingArguments(
    output_dir='./TTC4900Model',
    do_train=True,
    do_eval=True,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    weight_decay=0.01,
    logging_strategy='steps',
    # TensorBoard log directory
    logging_dir='./multi-class-logs',
    logging_steps=50,
    evaluation_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    # Checkpoint at every evaluation. With the library default of 500 steps no
    # checkpoint would be written at all in this run (3 epochs of 2450
    # examples at batch size 16 is only 462 optimisation steps), which leaves
    # `load_best_model_at_end` with nothing to reload.
    save_steps=50,
    # Keep disk usage bounded now that checkpoints are written frequently.
    save_total_limit=2,
    # fp16=True,
    load_best_model_at_end=True
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Assemble the Trainer: configuration first, then the model to fine-tune,
# the metric callback, and the training / evaluation datasets.
trainer = tr.Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataloader,
    eval_dataset=val_dataloader,
)

In [None]:
# Start fine-tuning with the trainer configured above.
trainer.train()

***** Running training *****
  Num examples = 2450
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 462
  Number of trainable parameters = 110622727


Step,Training Loss,Validation Loss



KeyboardInterrupt

