In [None]:
# Install required libraries
!pip install transformers datasets torch

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.5 MB/s[0m eta [36m0:00:

In [None]:
import torch
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers.modeling_utils import unwrap_model
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
from collections import Counter
from typing import Optional

In [None]:
# Mount Google Drive
# Colab-only step: exposes Drive under /content/drive. The trained model is saved to a
# path below this mount point at the end of the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the TR-News dataset
# Fetches "batubayk/TR-News" with `load_dataset`; printing the result lists the available
# splits together with their columns and row counts.
dataset = load_dataset("batubayk/TR-News")
print(dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/614 [00:00<?, ?B/s]

train.csv:   0%|          | 0.00/686M [00:00<?, ?B/s]

validation.csv:   0%|          | 0.00/36.4M [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/37.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/277573 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/14610 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/15379 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['abstract', 'author', 'content', 'date', 'source', 'tags', 'title', 'topic', 'url'],
        num_rows: 277573
    })
    validation: Dataset({
        features: ['abstract', 'author', 'content', 'date', 'source', 'tags', 'title', 'topic', 'url'],
        num_rows: 14610
    })
    test: Dataset({
        features: ['abstract', 'author', 'content', 'date', 'source', 'tags', 'title', 'topic', 'url'],
        num_rows: 15379
    })
})


In [None]:
def clean_and_group_categories(dataset, min_samples=10, groupings=None):
    """Merge raw topic labels into canonical groups and bucket rare labels as 'Other'.

    Args:
        dataset: Object exposing equally long 'abstract' and 'topic' columns through
            ``dataset[column_name]`` (for example a ``datasets.Dataset`` or a plain
            dict of lists).
        min_samples: A label that occurs fewer than this many times in the raw
            'topic' column, and is not itself the name of a group, is replaced
            by 'Other'.
        groupings: Mapping of canonical label -> list of raw labels merged into it.
            When ``None`` the built-in TR-News mapping below is used. If a raw label
            is listed under several groups, the first group wins.

    Returns:
        dict with two parallel lists: 'abstract' (unchanged texts) and 'topic'
        (remapped labels).

    Side effects:
        Prints the label distribution after remapping.
    """
    # Default groupings
    if groupings is None:
        groupings = {
            'Spor': ['spor', 'Basketbol', 'Futbol', 'Voleybol', '2016 Avrupa Şampiyonası', 'EURO 2016', 'Tenis', 'Motor Sporları', 'Olimpiyat', 'Spor', 'basketbol', 'futbol', 'diger_sporlar'],
            'Ekonomi': ['Ekonomi', 'Döviz', 'Para', 'Makro Ekonomi', 'ekonomi'],
            'Dünya': ['Dünya', 'dunya'],
            'Kültür-Sanat': ['Kültür-Sanat', 'Sanat', 'Müzik', 'kultur-sanat'],
            'Yaşam': ['Yaşam', 'yasam', 'İş-Yaşam'],
            'Teknoloji': ['Teknoloji', 'bilim_ve_teknoloji'],
            'Siyaset': ['siyaset', '2019 Yerel Seçim'],
            'Eğitim': ['Eğitim', 'egitim'],
            'Sağlık': ['Sağlık', 'saglik'],
            'Türkiye': ['Türkiye', 'turkiye'],
            'Röportajlar': ['Röportajlar', 'Özel Röportajlar'],
        }

    # Invert the groupings once so each label is resolved with a dict lookup instead of
    # scanning every group for every row. `setdefault` keeps the first group that lists
    # a raw label, i.e. first match wins.
    canonical_for = {}
    for new_category, old_categories in groupings.items():
        for old_category in old_categories:
            canonical_for.setdefault(old_category, new_category)

    raw_topics = list(dataset['topic'])
    category_counts = Counter(raw_topics)

    def _resolve(raw_category):
        """Return the final label for one raw label."""
        category = canonical_for.get(raw_category, raw_category)
        # Group names are never demoted to 'Other', however rare they are.
        if category not in groupings and category_counts[category] < min_samples:
            return 'Other'
        return category

    # The final label depends only on the raw label, so resolve each distinct label once.
    final_label = {raw_category: _resolve(raw_category) for raw_category in category_counts}

    new_dataset = {
        'abstract': list(dataset['abstract']),
        'topic': [final_label[raw_category] for raw_category in raw_topics]
    }

    new_category_counts = Counter(new_dataset['topic'])
    print("New category distribution:")
    for category, count in new_category_counts.items():
        print(f"{category}: {count}")

    return new_dataset

In [None]:
def get_top_categories(dataset, top_n=10):
    """Keep only the rows whose topic is one of the `top_n` most frequent topics.

    Args:
        dataset: Object exposing parallel 'abstract' and 'topic' columns through
            ``dataset[column_name]``.
        top_n: Number of most common topics to keep.

    Returns:
        A ``Dataset`` built from the surviving 'abstract'/'topic' pairs, in their
        original order.

    Side effects:
        Prints the kept topics with their counts, most frequent first.
    """
    top_categories = dict(Counter(dataset['topic']).most_common(top_n))

    # Filter row-wise first, then split the surviving pairs back into columns.
    kept_rows = [
        (abstract, topic)
        for abstract, topic in zip(dataset['abstract'], dataset['topic'])
        if topic in top_categories
    ]
    filtered_dataset = Dataset.from_dict({
        'abstract': [abstract for abstract, _ in kept_rows],
        'topic': [topic for _, topic in kept_rows],
    })

    print(f"\nTop {top_n} categories and their counts:")
    for category in top_categories:
        print(f"{category}: {top_categories[category]}")

    return filtered_dataset

In [None]:
# Load and preprocess the dataset
# Only the 'abstract' and 'topic' columns of the train split are carried forward.
train_data = dataset['train'].select_columns(['abstract', 'topic'])
# Merge label variants; labels seen fewer than 50 times (and not a group name) become 'Other'.
cleaned_data = clean_and_group_categories(train_data, min_samples=50)
# Keep only the most frequent categories (the function's default is the top 10).
top_categories_dataset = get_top_categories(cleaned_data)

New category distribution:
Fiskos: 1417
Türkiye: 106224
Dünya: 39944
Spor: 30812
Kültür-Sanat: 8712
Gündem: 10423
Yaşam: 16077
Sağlık: 16175
Seyahat: 567
H. Bunu Konuşuyor: 736
Ekonomi: 19603
cevre: 179
Eğitim: 5082
Siyaset: 4704
None: 2719
Teknoloji: 7003
BBC: 775
Merak Edilenler: 194
Emlak: 289
Polemik: 252
Otomobil: 1447
diger: 410
english: 66
Televizyon: 429
Kadına Şiddet: 207
Turizm: 226
Enerji: 241
yerel_yonetimler: 136
Magazin: 482
Sosyal Güvenlik: 172
Other: 790
Medya: 308
Röportajlar: 119
cumhuriyet_ege: 139
Tarifler: 99
Dünyadan: 167
Şipşak: 155
Diğer: 93

Top 10 categories and their counts:
Türkiye: 106224
Dünya: 39944
Spor: 30812
Ekonomi: 19603
Sağlık: 16175
Yaşam: 16077
Gündem: 10423
Kültür-Sanat: 8712
Teknoloji: 7003
Eğitim: 5082


In [None]:
# Categories to remove
categories_to_remove = ["Gündem", "Yaşam"]

# Collect the (abstract, topic) pairs whose topic is not excluded, preserving order.
kept_pairs = [
    (abstract, topic)
    for abstract, topic in zip(top_categories_dataset['abstract'], top_categories_dataset['topic'])
    if topic not in categories_to_remove
]
filtered_data = {
    'abstract': [abstract for abstract, _ in kept_pairs],
    'topic': [topic for _, topic in kept_pairs],
}

# Convert to Dataset object
filtered_dataset = Dataset.from_dict(filtered_data)

# Report the remaining class distribution.
print("Categories and their counts after removal ('Gundem' and 'Yasam'):")
category_counts = Counter(filtered_dataset['topic'])
for category, count in category_counts.items():
    print(f"{category}: {count}")

Categories and their counts after removal ('Gundem' and 'Yasam'):
Türkiye: 106224
Dünya: 39944
Spor: 30812
Kültür-Sanat: 8712
Sağlık: 16175
Ekonomi: 19603
Eğitim: 5082
Teknoloji: 7003


In [None]:
# Limit the dataset
max_samples = 50000
exceeds_limit = len(filtered_dataset) > max_samples
if exceeds_limit:
    # A fixed seed keeps the random subsample reproducible across runs.
    reshuffled = filtered_dataset.shuffle(seed=42)
    filtered_dataset = reshuffled.select(range(max_samples))

# Use filtered_dataset for further processing
new_dataset = filtered_dataset

print(f"Final dataset size: {len(new_dataset)}")

Final dataset size: 50000


In [None]:
# Create label mappings
# Sort the unique topics so label ids are deterministic. The iteration order of a set of
# strings can differ between Python sessions (hash randomization), which would silently
# change the meaning of each class id from one run to the next.
label_list = sorted(set(new_dataset['topic']))
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for label, i in label_to_id.items()}

print("\nLabel to ID mapping:")
# `label_id` rather than `id`, so the builtin `id` is not shadowed for the rest of the notebook.
for label, label_id in label_to_id.items():
    print(f"{label}: {label_id}")


Label to ID mapping:
Eğitim: 0
Dünya: 1
Türkiye: 2
Teknoloji: 3
Kültür-Sanat: 4
Sağlık: 5
Ekonomi: 6
Spor: 7


In [None]:
# Tokenization function
def tokenize_and_prepare(examples, tokenizer, label_map=None, max_length=128):
    """Tokenize a batch of abstracts and attach integer class labels.

    Args:
        examples: Batch with an "abstract" entry (list of texts) and a "topic" entry
            (list of label names of the same length).
        tokenizer: Callable invoked as
            ``tokenizer(texts, truncation=True, padding="max_length", max_length=...)``
            that returns a mapping which accepts item assignment.
        label_map: Mapping of label name -> integer id. Defaults to the notebook-level
            ``label_to_id`` so existing two-argument calls behave as before.
        max_length: Length every sequence is truncated/padded to (default 128).

    Returns:
        The tokenizer output with an extra 'labels' entry holding the label ids.

    Raises:
        KeyError: If a topic in the batch is missing from the label mapping.
    """
    if label_map is None:
        # Fall back to the module-level mapping created in the label-mapping cell.
        label_map = label_to_id
    tokenized = tokenizer(examples["abstract"], truncation=True, padding="max_length", max_length=max_length)
    tokenized['labels'] = [label_map[label] for label in examples['topic']]
    return tokenized

In [None]:
# Metrics computation function
def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``.

    Returns:
        dict with a single "accuracy" entry, comparing the argmax over the last
        axis of the logits against the labels.
    """
    raw_scores, gold_ids = eval_pred
    predicted_ids = np.argmax(raw_scores, axis=-1)
    accuracy = accuracy_score(gold_ids, predicted_ids)
    return {"accuracy": accuracy}

In [None]:
# Custom Trainer class
class ContiguousTrainer(Trainer):
    """Trainer that makes every model parameter contiguous in memory before saving."""

    def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = False):
        """Re-lay out non-contiguous parameters, then delegate to ``Trainer.save_model``.

        NOTE(review): presumably needed because the checkpoint serializer rejects
        non-contiguous tensors — confirm against the error that motivated this class.
        """
        non_contiguous = (
            weight
            for _, weight in self.model.named_parameters()
            if not weight.is_contiguous()
        )
        for weight in non_contiguous:
            weight.data = weight.data.contiguous()
        super().save_model(output_dir, _internal_call)

In [None]:
# Training arguments
training_args = TrainingArguments(
    output_dir="./results_gpt2",
    num_train_epochs=3,
    per_device_train_batch_size=16,  # Reduced batch size due to larger model
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs_gpt2",
    logging_steps=100,
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy` argument;
    # the install cell is unpinned, so the notebook must work with recent transformers releases.
    eval_strategy="epoch",
    # Must match the evaluation schedule because `load_best_model_at_end=True`.
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",  # key returned by `compute_metrics`
    gradient_accumulation_steps=4,  # 16 * 4 = 64 samples per optimizer step per device
    learning_rate=2e-5,
    fp16=True,
    label_names=["labels"],
    report_to="tensorboard",
    logging_first_step=True,
    dataloader_num_workers=2,
)



In [None]:
# Model name
# Checkpoint identifier passed to `from_pretrained` for both the tokenizer and the
# sequence-classification model in the cells below.
model_name = "ytu-ce-cosmos/turkish-gpt2-large"

In [None]:
# Tokenize datasets
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Set the pad token to the eos token
# (tokenization below pads every example with padding="max_length", so a pad token must be defined)
tokenizer.pad_token = tokenizer.eos_token

# Pass the tokenizer through `fn_kwargs` instead of wrapping the function in a lambda;
# the raw text columns are dropped so only the model inputs and labels remain.
tokenized_datasets = new_dataset.map(
    tokenize_and_prepare,
    batched=True,
    fn_kwargs={"tokenizer": tokenizer},
    remove_columns=new_dataset.column_names,
)

tokenizer_config.json:   0%|          | 0.00/537 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/927k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/585k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.37M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/438 [00:00<?, ?B/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
# Split dataset
# Shuffle once and carve both splits out of the same permutation. Shuffling twice with
# the same seed gives the same result, but does the work twice and only stays
# leak-free as long as both seeds are kept identical.
train_size = int(0.8 * len(tokenized_datasets))
shuffled_datasets = tokenized_datasets.shuffle(seed=42)
train_dataset = shuffled_datasets.select(range(train_size))
eval_dataset = shuffled_datasets.select(range(train_size, len(shuffled_datasets)))

print(f"Number of training examples: {len(train_dataset)}")
print(f"Number of validation examples: {len(eval_dataset)}")

Number of training examples: 40000
Number of validation examples: 10000


In [None]:
# Load model
# Register the class names in the model config so every checkpoint written during
# training maps ids to topic names instead of the generic LABEL_0 ... LABEL_n.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id,
)
# The tokenizer's pad token was set to the eos token; the model config must carry the same
# pad id because the inputs are padded to a fixed length.
model.config.pad_token_id = tokenizer.pad_token_id

# Initialize trainer
trainer = ContiguousTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

config.json:   0%|          | 0.00/894 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.10G [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at ytu-ce-cosmos/turkish-gpt2-large and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [None]:
# Train the model
print("\nTraining the model...")
# Left as the cell's last expression so the returned TrainOutput is displayed.
trainer.train()


Training the model...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.3727,0.326492,0.8805
2,0.2199,0.32656,0.8862
3,0.0747,0.416085,0.8883


TrainOutput(global_step=1875, training_loss=0.282709605662028, metrics={'train_runtime': 1411.4013, 'train_samples_per_second': 85.022, 'train_steps_per_second': 1.328, 'total_flos': 6.52862029824e+16, 'train_loss': 0.282709605662028, 'epoch': 3.0})

In [None]:
# Evaluate the model
print("\nEvaluating the model...")
# No dataset argument is given, so the trainer evaluates on the `eval_dataset` it was built with.
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")


Evaluating the model...


Evaluation results: {'eval_loss': 0.4160846769809723, 'eval_accuracy': 0.8883, 'eval_runtime': 28.5633, 'eval_samples_per_second': 350.099, 'eval_steps_per_second': 10.958, 'epoch': 3.0}


In [None]:
# Make predictions
predictions = trainer.predict(eval_dataset)
# Turn the model outputs into predicted class ids (index of the highest score on the last axis).
preds = np.argmax(predictions.predictions, axis=-1)

In [None]:
# Print classification report
# Map integer ids back to topic names so the report rows are labelled by category.
true_labels = [id_to_label[label] for label in eval_dataset['labels']]
predicted_labels = [id_to_label[pred] for pred in preds]
print("\nClassification Report:")
print(classification_report(true_labels, predicted_labels))


Classification Report:
              precision    recall  f1-score   support

       Dünya       0.89      0.92      0.90      1749
     Ekonomi       0.84      0.81      0.82       855
      Eğitim       0.86      0.74      0.79       235
Kültür-Sanat       0.77      0.74      0.76       402
      Sağlık       0.87      0.88      0.87       637
        Spor       0.97      0.98      0.97      1332
   Teknoloji       0.75      0.72      0.73       323
     Türkiye       0.90      0.90      0.90      4467

    accuracy                           0.89     10000
   macro avg       0.85      0.84      0.84     10000
weighted avg       0.89      0.89      0.89     10000



In [None]:
# Save the model
# Plain string: the original f-string had no placeholders.
model_save_path = "/content/drive/MyDrive/news-classifier-models/turkish-gpt2-large_news_classifier"
model_to_save = unwrap_model(trainer.model)
# Store the class-name mapping in the config so the saved checkpoint is self-describing;
# otherwise the id -> topic mapping exists only in this notebook session.
model_to_save.config.id2label = dict(id_to_label)
model_to_save.config.label2id = dict(label_to_id)
# Make every parameter contiguous before serializing, as ContiguousTrainer.save_model does.
for param in model_to_save.parameters():
    param.data = param.data.contiguous()
model_to_save.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

print("Training and evaluation completed for Turkish GPT-2 Large model.")