In [1]:
import os 
import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup,  DistilBertForSequenceClassification,AutoModelForMaskedLM
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, random_split 
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler

In [2]:
# Ask PyTorch's CUDA caching allocator to release memory it holds but is not
# currently using; run here before the model is loaded in a later cell.
torch.cuda.empty_cache()

In [4]:
# Load the train / test / validation splits from CSV.
# Later cells read the `text` and `labels` columns of each frame.
# NOTE(review): the paths below are empty strings (presumably redacted) —
# fill in the real file locations before running this cell.
traindata = pd.read_csv('')
testdata = pd.read_csv('')
valdata = pd.read_csv('')

In [5]:
# Mapping from the string labels found in the CSV files to integer class ids.
label2ids = {'Positive': 1, 'Negative': 0, 'Neutral': 2}

# Replace the string labels with their class ids on every split.
# The guard makes the cell safe to re-run: once a split already holds only
# class ids, mapping it again would raise KeyError (e.g. label2ids[1]).
# Unknown labels still raise KeyError, exactly as before.
for split in (traindata, testdata, valdata):
    if not split.labels.isin(list(label2ids.values())).all():
        split.labels = split.labels.apply(lambda x: label2ids[x])

# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
MODEL_NAME = 'dbmdz/bert-base-turkish-cased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# One output logit per entry in label2ids; the classification head is newly initialised.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(label2ids))
model.to(device)


tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/251k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [6]:
def encode(dataset, max_length=None):
    """Tokenize every row of ``dataset`` and pack the result into a TensorDataset.

    Uses the module-level ``tokenizer``.

    Args:
        dataset: Object exposing ``text`` and ``labels`` columns (read through
            ``dataset.text.values`` and ``dataset.labels.values``) and a
            ``shape`` attribute, e.g. a pandas DataFrame.
        max_length: Length every sequence is padded / truncated to. ``None``
            (the default) leaves the choice to the tokenizer, which is what
            this function did before the parameter existed. Passing a smaller
            value shortens the padded tensors.

    Returns:
        TensorDataset whose items are ``(input_ids, attention_mask, label)``.
    """
    input_ids = []
    attention_masks = []

    for text in tqdm(dataset.text.values, total=dataset.shape[0]):
        # Each call returns tensors with a leading batch dimension of 1;
        # padding='max_length' gives every row the same width so they can be
        # concatenated below.
        encoded_text = tokenizer(
            text,
            add_special_tokens=True,
            return_tensors='pt',
            truncation=True,
            padding='max_length',
            max_length=max_length,
        )
        input_ids.append(encoded_text['input_ids'])
        attention_masks.append(encoded_text['attention_mask'])

    # Stack the per-row tensors along the batch dimension.
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(dataset.labels.values)

    return TensorDataset(input_ids, attention_masks, labels)

BATCH_SIZE = 16
# Only the training split is reshuffled on every pass.
train_loader = DataLoader(encode(traindata), batch_size=BATCH_SIZE, shuffle=True)
# Evaluation splits are iterated in a fixed order: shuffling brings no benefit
# there and makes the per-batch-averaged validation loss differ between runs
# (the composition of the smaller final batch would change each time).
test_loader = DataLoader(encode(testdata), batch_size=BATCH_SIZE, shuffle=False)
val_loader = DataLoader(encode(valdata), batch_size=BATCH_SIZE, shuffle=False)

100%|██████████| 6059/6059 [00:03<00:00, 1835.64it/s]
100%|██████████| 1189/1189 [00:00<00:00, 2035.91it/s]
100%|██████████| 674/674 [00:00<00:00, 2068.94it/s]


In [9]:
# Fine-tune the classifier and evaluate on the validation split after each epoch.
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=5e-5)

# One dict of summary metrics per epoch.
training_stats = []
NUM_EPOCHS = 1

for epoch in tqdm(range(NUM_EPOCHS)):
    # ---- Training pass ----
    model.train()
    total_correct_predictions_train = 0
    total_epoch_loss_train = 0
    steps = 0

    for batch in train_loader:
        # Move the batch to the device chosen when the model was loaded, so the
        # loop also works on CPU-only machines (previously hard-coded "cuda").
        input_ids, attention_masks, labels = [r.to(device) for r in batch]

        outputs = model(input_ids, attention_mask=attention_masks)
        logits = outputs.logits
        loss = criterion(logits, labels)

        # Clear gradients left over from the previous step before back-propagating.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        steps += 1
        predictions = torch.argmax(logits, dim=1)
        total_correct_predictions_train += (predictions == labels).sum().item()
        total_epoch_loss_train += loss.item()

    # Loss is averaged over batches; accuracy over individual samples.
    avg_train_loss = total_epoch_loss_train / steps
    train_accuracy = total_correct_predictions_train / len(train_loader.dataset)
    print(f'Epoch {epoch + 1} Training Accuracy: {train_accuracy:.4f}, Average Loss: {avg_train_loss:.4f}')

    # ---- Validation pass ----
    model.eval()
    total_correct_predictions_val = 0
    total_eval_loss_val = 0
    eval_steps = 0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_masks, labels = [r.to(device) for r in batch]

            outputs = model(input_ids, attention_mask=attention_masks)
            logits = outputs.logits
            loss = criterion(logits, labels)
            predictions = torch.argmax(logits, dim=1)

            total_correct_predictions_val += (predictions == labels).sum().item()
            total_eval_loss_val += loss.item()
            eval_steps += 1

    avg_val_loss = total_eval_loss_val / eval_steps
    val_accuracy = total_correct_predictions_val / len(val_loader.dataset)
    print(f'Epoch {epoch + 1} Validation Accuracy: {val_accuracy:.4f}, Validation Loss: {avg_val_loss:.4f}')

    training_stats.append({
        'epoch': epoch + 1,
        'Training Loss': avg_train_loss,
        'Valid. Loss': avg_val_loss,
        'Valid. Accur.': val_accuracy
    })

    print(f'Epoch: {epoch + 1} / {NUM_EPOCHS} - Average Training Loss: {avg_train_loss:.4f}')


  0%|          | 0/1 [00:00<?, ?it/s]

Epoch 1 Training Accuracy: 0.7924, Average Loss: 0.5239


100%|██████████| 1/1 [09:58<00:00, 598.00s/it]

Epoch 1 Validation Accuracy: 0.8175, Validation Loss: 0.5124
Epoch: 1 / 1 - Average Training Loss: 0.5239





In [11]:
# Evaluate the fine-tuned model on the held-out test split.
model.eval()
correct_predictions = 0
total_predictions = 0

# Per-sample predictions and labels, kept for metrics computed in later cells.
all_predictions = []
all_labels = []

for batch in test_loader:
    # Use the device chosen when the model was loaded instead of a hard-coded
    # "cuda", so evaluation also works on CPU-only machines.
    batch = [r.to(device) for r in batch]
    input_ids, attention_masks, labels = batch

    all_labels.extend(labels.cpu().numpy())

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_masks)

    predictions = torch.argmax(outputs.logits, dim=1)
    all_predictions.extend(predictions.cpu().numpy())

    correct_predictions += (predictions == labels).sum().item()
    total_predictions += labels.size(0)

accuracy = (correct_predictions / total_predictions) * 100
print(f"\nTEST ACCURACY: {accuracy:.2f}%")



TEST ACCURACY: 82.25%


In [13]:
from sklearn.metrics import f1_score
import numpy as np

# Turn the per-sample labels / predictions gathered during the test pass into arrays.
flat_true_labels, flat_pred_labels = (
    np.array(values) for values in (all_labels, all_predictions)
)

# Weighted-average F1 over the classes present in the test labels.
f1 = f1_score(
    y_true=flat_true_labels,
    y_pred=flat_pred_labels,
    average='weighted',
)
print(f"F1 Score: {f1:.2f}")

F1 Score: 0.82


In [15]:
# Directory the model artefacts are written to by the following cell.
# NOTE(review): the path is an empty string (presumably redacted) — set a real
# location before running; os.makedirs('') raises FileNotFoundError.
output_directory = ""
# exist_ok=True keeps the cell re-runnable once the directory already exists.
os.makedirs(output_directory, exist_ok=True)

In [17]:
from transformers import GenerationConfig

# Build a generation config with max_length=128 and write it to the output directory.
generation_config = GenerationConfig(max_length=128)
generation_config.save_pretrained(output_directory)

# Give every parameter tensor a contiguous memory layout before serialising.
for name, param in model.named_parameters():
    if param.is_contiguous():
        continue
    param.data = param.data.contiguous()

# If the model is wrapped in an object exposing the real model as `.module`,
# save the inner model; otherwise save the model itself.
model_to_save = getattr(model, 'module', model)
model_to_save.save_pretrained(output_directory)
