In [8]:
import inspect
import os

import numpy as np
import pandas as pd
import torch
from datasets import Dataset, DatasetDict
from peft import LoraConfig, get_peft_model
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

In [32]:
def create_lora_model(
    model_name,
    num_labels=2,
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=('lin1', 'lin2'),
):
    """Load a sequence-classification checkpoint and wrap it with LoRA adapters.

    Args:
        model_name: Checkpoint identifier/path passed to ``from_pretrained`` for
            both the model and the tokenizer.
        num_labels: Number of output classes for the classification head.
        r: LoRA rank forwarded to ``LoraConfig``.
        lora_alpha: LoRA scaling factor forwarded to ``LoraConfig``.
        lora_dropout: Dropout applied inside the LoRA layers.
        target_modules: Names of the modules to adapt. Either a single string
            or an iterable of names. The defaults are the names this notebook
            was run with; NOTE(review): other architectures will presumably
            need different module names -- confirm before changing the
            checkpoint.

    Returns:
        A ``(model, tokenizer)`` tuple where ``model`` is the PEFT-wrapped model.
    """
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Keep a plain string untouched (list("abc") would split it into characters);
    # any other iterable is materialised as a list, as in the original call.
    if not isinstance(target_modules, str):
        target_modules = list(target_modules)

    lora_config = LoraConfig(
        r=r,
        lora_alpha=lora_alpha,
        lora_dropout=lora_dropout,
        target_modules=target_modules,
        task_type="SEQ_CLS",
    )

    model = get_peft_model(model, lora_config)
    # Prints the trainable/total parameter counts as a quick sanity check.
    model.print_trainable_parameters()

    return model, tokenizer


def train_for_company(company, hf_dataset, model_name):
    """Fine-tune a LoRA classifier on one company's messages and save it.

    Args:
        company: Key used to select the splits from ``hf_dataset``; also used
            to name the ``./results``, ``./logs`` and ``./models`` sub-folders.
        hf_dataset: Mapping such that ``hf_dataset[company]["train"]`` and
            ``hf_dataset[company]["test"]`` are datasets with a ``"message"``
            column. NOTE(review): the Trainer presumably picks the target up
            from a ``"label"`` column -- confirm against the dataset schema.
        model_name: Checkpoint identifier forwarded to ``create_lora_model``.

    Returns:
        The trained model (the PEFT-wrapped object returned by
        ``create_lora_model``).
    """
    model, tokenizer = create_lora_model(model_name)

    def tokenize(batch):
        # Fixed-length padding: every example comes out exactly 128 tokens long.
        return tokenizer(
            batch["message"],
            padding="max_length",
            truncation=True,
            max_length=128,
        )

    company_splits = hf_dataset[company]
    train_dataset = company_splits["train"].map(tokenize, batched=True)
    test_dataset = company_splits["test"].map(tokenize, batched=True)

    # The evaluation-schedule keyword was renamed from `evaluation_strategy` to
    # `eval_strategy` in newer transformers releases; use whichever name the
    # installed TrainingArguments accepts so the cell runs on both.
    accepted_args = inspect.signature(TrainingArguments).parameters
    eval_strategy_key = (
        "eval_strategy" if "eval_strategy" in accepted_args else "evaluation_strategy"
    )

    training_args = TrainingArguments(
        output_dir=f"./results/{company}",
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        learning_rate=3e-4,
        num_train_epochs=5,
        eval_steps=400,
        logging_steps=10,
        logging_dir=f"./logs/{company}",
        # Checkpoints are written once per epoch. The former `save_steps=400`
        # was dropped: it only applies when save_strategy="steps", so it had
        # no effect here.
        save_strategy="epoch",
        **{eval_strategy_key: "steps"},
    )

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)
        # Return a built-in float so the metric is a plain Python number.
        accuracy = float(np.mean(predictions == labels))
        return {"accuracy": accuracy}

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    # Persist the adapter weights and the tokenizer side by side.
    model.save_pretrained(f"./models/{company}")
    tokenizer.save_pretrained(f"./models/{company}")

    return model

In [30]:
# Load the raw message table that every later cell works from.
df = pd.read_csv(filepath_or_buffer='final_data/df.csv')

KeyboardInterrupt: 

In [4]:
# Parse the `date` column into datetimes and order the rows by it
# (the later per-ticker split slices rows by position).
df = df.assign(date=pd.to_datetime(df['date'])).sort_values('date')

In [5]:
# Tickers to build a dataset for; each one is a column of `df` used as a row
# filter and has a matching '1 мин. <ticker> close_bin' column used as label.
stocks = ['VTBR', 'GAZP', 'SBER', 'NVTK', 'ROSN']

# ticker -> DatasetDict with 'train' / 'test' splits
hf_datasets = {}

for stock in stocks:
    label_col = f'1 мин. {stock} close_bin'

    # Keep only the rows flagged for this ticker and the columns we need, expose
    # the target under the generic name 'label', and order rows by date.
    # `== True` is kept on purpose: the dtype of the flag column is not visible
    # here, and a plain boolean-mask lookup would behave differently on
    # non-bool data.
    stock_df = (
        df.loc[
            df[stock] == True,  # noqa: E712
            ['date', 'message', 'views', 'forwards', 'fwd_from', label_col],
        ]
        .rename(columns={label_col: 'label'})
        .sort_values('date')
    )

    if len(stock_df) == 0:
        continue

    # Positional split on date-ordered rows: first 90% train, last 10% test.
    split_idx = int(len(stock_df) * 0.9)
    train_data = stock_df.iloc[:split_idx]
    test_data = stock_df.iloc[split_idx:]

    # preserve_index=False: the filtered frame carries a non-trivial pandas
    # index that would otherwise be stored as an extra `__index_level_0__`
    # column in the dataset.
    hf_datasets[stock] = DatasetDict({
        'train': Dataset.from_pandas(train_data, preserve_index=False),
        'test': Dataset.from_pandas(test_data, preserve_index=False),
    })

In [6]:
# Bundle the per-ticker DatasetDicts into one top-level DatasetDict keyed by ticker.
dataset = DatasetDict({ticker: splits for ticker, splits in hf_datasets.items()})

In [71]:
# Write the combined per-ticker dataset to final_data/hf_dataset.
dataset.save_to_disk('final_data/hf_dataset')

Saving the dataset (1/1 shards): 100%|██████████| 23324/23324 [00:00<00:00, 148402.24 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 2592/2592 [00:00<00:00, 113474.34 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 53810/53810 [00:00<00:00, 274589.11 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 5979/5979 [00:00<00:00, 150131.97 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 25711/25711 [00:00<00:00, 234742.47 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 2857/2857 [00:00<00:00, 101539.01 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 9154/9154 [00:00<00:00, 235629.56 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1018/1018 [00:00<00:00, 93271.91 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 12282/12282 [00:00<00:00, 241906.35 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1365/1365 [00:00<00:00, 133302.87 examples/s]


In [7]:
# Checkpoint id of the base model; passed to train_for_company below.
model_name = "DeepPavlov/distilrubert-base-cased-conversational"

In [14]:
# Make sure the output folders exist; harmless when they are already there.
for folder in ('models', 'logs'):
    os.makedirs(folder, exist_ok=True)

In [None]:
# Train the VTBR classifier. The returned model is bound to a name so it stays
# available to later cells and its repr is not echoed as the cell output.
vtbr_model = train_for_company(
    company='VTBR',
    hf_dataset=dataset,
    model_name=model_name,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/distilrubert-base-cased-conversational and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1,329,410 || all params: 136,655,620 || trainable%: 0.9728


Map: 100%|██████████| 23324/23324 [00:02<00:00, 8503.37 examples/s]
Map: 100%|██████████| 2592/2592 [00:00<00:00, 9810.43 examples/s] 


Step,Training Loss,Validation Loss
