In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the project spreadsheet and keep only the two text columns of interest.
# Rows with a missing report or news text are dropped so the exported CSVs
# contain only complete pairs, matching the cleaning applied by the training
# cell that reads the same file.
df = pd.read_excel('/content/drive/MyDrive/Cs549/CS549_project.xlsx')
df = df[['Stock_Report', 'News']].dropna()

# Hold out 10% of the pairs for validation; the fixed seed keeps the split stable.
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# Persist both splits without the pandas index column.
train_df.to_csv('train.csv', index=False)
val_df.to_csv('val.csv', index=False)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the spreadsheet, narrow it to the (report, news) columns and discard
# any row where either text is missing.
df = (
    pd.read_excel('/content/drive/MyDrive/Cs549/CS549_project.xlsx')
    [['Stock_Report', 'News']]
    .dropna()
)

# 90/10 train/validation split with a fixed seed for reproducibility.
train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

# Write each split to its own CSV file, omitting the index column.
for split_frame, csv_name in ((train_df, 'train.csv'), (val_df, 'val.csv')):
    split_frame.to_csv(csv_name, index=False)

from datasets import Dataset, DatasetDict

# Wrap both pandas splits as Hugging Face datasets.
# After train_test_split the frames keep their shuffled original row labels;
# preserve_index=False stops from_pandas from storing that index as an extra
# `__index_level_0__` column next to 'Stock_Report' and 'News'.
dataset = DatasetDict({
    'train': Dataset.from_pandas(train_df, preserve_index=False),
    'validation': Dataset.from_pandas(val_df, preserve_index=False)
})

from transformers import MT5ForConditionalGeneration, MT5Tokenizer

# Tokenizer and model weights are loaded from the same pretrained checkpoint.
checkpoint_name = 'ozcangundes/mt5-small-turkish-summarization'
tokenizer = MT5Tokenizer.from_pretrained(checkpoint_name)
model = MT5ForConditionalGeneration.from_pretrained(checkpoint_name)

def tokenize_function(examples):
    """Tokenize a batch of (Stock_Report, News) pairs for seq2seq training.

    Args:
        examples: Batch mapping with the keys "Stock_Report" and "News", each
            holding a list of strings (the shape `Dataset.map(batched=True)`
            passes in).

    Returns:
        The tokenizer output for the prefixed reports (truncated to 512
        tokens) with an added "labels" entry holding the token ids of the
        news texts (truncated to 256 tokens).

    Neither inputs nor labels are padded here. Padding labels to a fixed
    length with the pad token id makes the loss count every padding position
    as a real target. Leaving sequences unpadded lets the seq2seq data
    collator pad each batch dynamically and fill label padding with the
    ignore index instead.
    """
    inputs = ["haberle≈ütir: " + text for text in examples["Stock_Report"]]
    targets = examples["News"]

    model_inputs = tokenizer(inputs, max_length=512, truncation=True)
    # `text_target` replaces the deprecated `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=targets, max_length=256, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize every split in batches and drop the raw text columns afterwards.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=['Stock_Report', 'News'],
)

from transformers import Seq2SeqTrainingArguments

training_args = Seq2SeqTrainingArguments(
    # Output locations, logging cadence and checkpoint retention.
    output_dir='./kap_news_model',
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=1,
    push_to_hub=False,
    # Optimisation schedule.
    num_train_epochs=70,
    learning_rate=1e-4,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Evaluate once per epoch, generating sequences of up to 256 tokens.
    evaluation_strategy='epoch',
    predict_with_generate=True,
    generation_max_length=256,
)

from transformers import Seq2SeqTrainer, DataCollatorForSeq2Seq

# Collator that assembles tokenized examples into batches for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Pick the tokenized splits out once so the trainer call stays compact.
train_split = tokenized_dataset["train"]
validation_split = tokenized_dataset["validation"]

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_split,
    eval_dataset=validation_split,
)

# Start fine-tuning; as the last expression of the cell, the returned
# TrainOutput is displayed below it.
trainer.train()



Map:   0%|          | 0/360 [00:00<?, ? examples/s]



Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss
1,3.2863,2.196536
2,2.1038,1.759583
3,1.9598,1.675701
4,1.7554,1.63128
5,1.6689,1.596638
6,1.6799,1.576433
7,1.6923,1.557438
8,1.6837,1.540171
9,1.4672,1.522428
10,1.5622,1.514502


Checkpoint destination directory ./kap_news_model/checkpoint-4500 already exists and is non-empty. Saving will proceed but saved results may be invalid.


TrainOutput(global_step=6300, training_loss=1.243396153222947, metrics={'train_runtime': 1136.0506, 'train_samples_per_second': 22.182, 'train_steps_per_second': 5.546, 'total_flos': 1.3324492210176e+16, 'train_loss': 1.243396153222947, 'epoch': 70.0})

In [None]:
def generate_news(stock_report_text):
    """Generate a news text for one stock report using the global model.

    Args:
        stock_report_text: Report text (str); it is prefixed with the task
            prompt and truncated to 512 tokens before generation.

    Returns:
        The first generated sequence, decoded with special tokens removed.
    """
    prompt = "haberle≈ütir: " + stock_report_text

    # Encode the prompt as PyTorch tensors and move them to the model's device.
    encoded = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512)
    encoded = encoded.to(model.device)

    # Beam-search decoding settings.
    generation_options = dict(
        max_new_tokens=256,
        num_beams=4,
        repetition_penalty=2.5,
        length_penalty=2.0,
        early_stopping=True,
    )
    sequences = model.generate(**encoded, **generation_options)

    best_sequence = sequences[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)

# Sample report text passed to generate_news; the generated news text is printed under a header line.
example_text = "≈ûirketimizin 21.03.2025 tarihli √∂zel durum a√ßƒ±klamasƒ±nda, paylarƒ±nƒ±n tamamƒ±na sahip olduƒüumuz baƒülƒ± ortaklƒ±ƒüƒ±mƒ±z Marba≈ü Menkul Deƒüerler A≈û'nin (Marba≈ü Menkul), 150.000.000 TL olan √ßƒ±karƒ±lmƒ±≈ü sermayesinin 360.000.000 TL'ye √ßƒ±karƒ±lmasƒ±na, artƒ±rƒ±lan 210.000.000 TL sermayenin i√ß kaynaklardan kar≈üƒ±lanmasƒ±na ili≈ükin olarak Marba≈ü Menkul tarafƒ±ndan Sermaye Piyasasƒ± Kurulu'na yapƒ±lan ba≈üvurunun olumlu kar≈üƒ±landƒ±ƒüƒ± duyurulmu≈ütur."
print("üì∞ Olu≈üturulan Haber:\n", generate_news(example_text))


üì∞ Olu≈üturulan Haber:
 Marba≈ü Menkul Deƒüerler A.≈û., baƒülƒ± ortaklƒ±ƒüƒ± Marba≈ü Menkul Deƒüerler A.≈û.‚Äônin sermayesini 220 milyon TL‚Äôye √ßƒ±karma kararƒ± aldƒ±. 150 milyon TL olan √ßƒ±karƒ±lmƒ±≈ü sermaye, i√ß kaynaklardan kar≈üƒ±lanacak ve i√ß kaynaklardan kar≈üƒ±lanacak. Sermaye Piyasasƒ± Kurulu‚Äôna yapƒ±lan ba≈üvuruyu olumlu kar≈üƒ±layarak Marba≈ü tarafƒ±ndan yapƒ±lan ba≈üvurunun olumlu kar≈üƒ±landƒ±ƒüƒ± bildirildi. Bu geli≈üme, Marba≈ü tarafƒ±ndan yapƒ±lan ba≈üvuruda olumlu kar≈üƒ±landƒ±. Artƒ±≈üƒ±n tamamƒ±na sahip olduƒüu baƒülƒ± ortaklƒ±k Marba≈ü Menkul Deƒüerler A.≈û.‚Äônin sermaye artƒ±rƒ±mƒ± s√ºrecine ili≈ükin √∂nemli bir adƒ±m olarak deƒüerlendiriliyor. A√ßƒ±k√ßasƒ± bu hamle, hem finansal yapƒ±yƒ± desteklemek hem de yatƒ±rƒ±mcƒ±ya g√ºven vermek a√ßƒ±sƒ±ndan olduk√ßa pozitif bir geli≈üme olmu≈ü.


In [None]:
# Second sample report: rebinds example_text and prints the text generate_news produces for it.
example_text = "Sermayesine %49 oranƒ±nda ortaklƒ±ƒüƒ±mƒ±zƒ±n bulunduƒüu i≈ütiraklerimiz Sour Turizm A.≈û. ve √áaƒürankaya Turizm A.≈û.'nin sermayelerinin 250.000.000 TL'den 400.000.000 TL'ye √ßƒ±karƒ±lmasƒ±na karar verilmi≈ütir."
print("üì∞ Olu≈üturulan Haber:\n", generate_news(example_text))



üì∞ Olu≈üturulan Haber:
 ≈ûirket, %49 oranƒ±nda ortaklƒ±ƒüƒ± olan Sour Turizm ve √áaƒürankaya Turizm A.≈û.‚Äônin sermayesini 250 milyon TL‚Äôden 40 milyon TL‚Äôye √ßƒ±karma kararƒ± aldƒ±. 25 milyon TL‚Äôlik pay sahiplerinin bulunduƒüu i≈ütiraklerden biri olan Sour Turizm ve √áaƒürankaya Turizm‚Äôin sermayesi ise 400 milyon TL‚Äôye √ßƒ±karƒ±lacak. Bu hamle, ≈üirketin b√ºy√ºme stratejisinin bir par√ßasƒ± olarak deƒüerlendiriliyor. Sermayenin %49 oranƒ±nda ortaklƒ±ƒüƒ± bulunduƒüu i≈ütiraklerinden biri olan Sour Turizm ve √áaƒürankaya Turizm‚Äôin bor√ßlanma ≈üirketleri tarafƒ±ndan saƒülanacak. A√ßƒ±k√ßasƒ± bu geli≈üme, uzun vadeli yatƒ±rƒ±mcƒ±lar a√ßƒ±sƒ±ndan olduk√ßa deƒüerli bir adƒ±m olmu≈ü.


In [None]:
!pip install accelerate==0.28.0




In [None]:
import os
# Switch off the Weights & Biases integration for this process.
# NOTE(review): the training log in this notebook reports WANDB_DISABLED as
# deprecated (removal announced for v5) and points to `report_to` instead.
# The log also shows the variable already in effect during training, although
# this cell sits after the training cell. It presumably has to run first on a
# fresh kernel; confirm the intended cell order.
os.environ.update({"WANDB_DISABLED": "true"})


In [None]:
!pip install transformers==4.38.2


Collecting transformers==4.38.2
  Downloading transformers-4.38.2-py3-none-any.whl.metadata (130 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m130.7/130.7 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Downloading transformers-4.38.2-py3-none-any.whl (8.5 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m8.5/8.5 MB[0m [31m80.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.39.3
    Uninstalling transformers-4.39.3:
      Successfully uninstalled transformers-4.39.3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sentence-transformers 4.1.0 requires

In [None]:
!pip uninstall -y transformers tokenizers huggingface_hub


Found existing installation: transformers 4.39.3
Uninstalling transformers-4.39.3:
  Successfully uninstalled transformers-4.39.3
Found existing installation: tokenizers 0.15.2
Uninstalling tokenizers-0.15.2:
  Successfully uninstalled tokenizers-0.15.2
Found existing installation: huggingface-hub 0.31.4
Uninstalling huggingface-hub-0.31.4:
  Successfully uninstalled huggingface-hub-0.31.4


In [None]:
import transformers
# Show which transformers version the running kernel actually imported.
installed_version = transformers.__version__
print(installed_version)


4.38.2
