In [1]:
import pandas as pd
from datasets import Dataset, DatasetDict, load_from_disk
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from peft import LoraConfig, get_peft_model
import torch
import numpy as np

import os


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def create_lora_model(model_name, num_labels=2, device=None):
    """Load a sequence-classification model and wrap it with LoRA adapters.

    Args:
        model_name: Checkpoint identifier forwarded to ``from_pretrained`` for
            both the model and its tokenizer.
        num_labels: Number of output classes for the classification head.
        device: Device the wrapped model is moved to. When ``None`` (the
            default) CUDA is used if ``torch.cuda.is_available()`` and the CPU
            otherwise, so the function no longer fails on GPU-less machines.

    Returns:
        A ``(model, tokenizer)`` tuple: the PEFT-wrapped model and the
        tokenizer loaded from the same checkpoint.
    """
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # NOTE(review): 'lin1'/'lin2' are module names specific to the base
    # architecture; confirm they still match if `model_name` changes family.
    lora_config = LoraConfig(
        r=16,
        lora_alpha=16,
        lora_dropout=0.1,
        target_modules=['lin1', 'lin2'],
        task_type="SEQ_CLS",
    )

    model = get_peft_model(model, lora_config)

    # Fall back to CPU instead of raising when no CUDA device is present.
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)
    model.print_trainable_parameters()

    return model, tokenizer


def train_for_company(company, hf_dataset, model_name):
    """Fine-tune a LoRA classifier on one company's messages and save it.

    Args:
        company: Key into ``hf_dataset`` selecting the company to train on.
        hf_dataset: Mapping from company to an object with ``"train"`` and
            ``"test"`` splits; each split must expose a ``"message"`` column.
        model_name: Base checkpoint name forwarded to ``create_lora_model``.

    Returns:
        The trained model. The model and tokenizer are also written to
        ``./models/{company}``; Trainer output goes to ``./results/{company}``
        and logs to ``./logs/{company}``.
    """
    model, tokenizer = create_lora_model(model_name)

    def tokenize(batch):
        # Every message is padded/truncated to a fixed 128-token window.
        return tokenizer(
            batch["message"],
            padding="max_length",
            truncation=True,
            max_length=128
        )

    train_dataset = hf_dataset[company]["train"].map(tokenize, batched=True)
    test_dataset = hf_dataset[company]["test"].map(tokenize, batched=True)

    training_args = TrainingArguments(
        output_dir=f"./results/{company}",
        per_device_train_batch_size=64,
        per_device_eval_batch_size=64,
        learning_rate=3e-4,
        num_train_epochs=20,
        evaluation_strategy="steps",
        eval_steps=400,
        # `save_steps` is only honoured by the "steps" strategy; the previous
        # "epoch" setting made the value below dead configuration.
        save_strategy="steps",
        save_steps=400,
        logging_steps=400,
        logging_dir=f"./logs/{company}",
    )

    def compute_metrics(eval_pred):
        """Return plain accuracy for the evaluation predictions."""
        predictions, labels = eval_pred
        # The predictions may arrive as a tuple whose first item is the
        # logits, or as a bare logits array. Indexing `[0]` on a bare array
        # would select only the first example, so unwrap tuples explicitly.
        if isinstance(predictions, (tuple, list)):
            predictions = predictions[0]
        predicted_classes = np.argmax(predictions, axis=-1)
        accuracy = (predicted_classes == labels).astype(np.float32).mean()
        # Cast to a built-in float so the metric serialises cleanly in logs.
        return {"accuracy": float(accuracy)}

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    model.save_pretrained(f"./models/{company}")
    tokenizer.save_pretrained(f"./models/{company}")

    return model

In [3]:
# Load the prepared table; later cells read its 'date', 'message' and per-ticker columns.
# NOTE(review): the saved output of this cell echoes this line, which is how Python
# displays a warning's source line — presumably a pandas warning; confirm which one.
df = pd.read_csv(filepath_or_buffer='final_data/df.csv')

  df = pd.read_csv('final_data/df.csv')


In [4]:
# Parse the 'date' column into datetimes and order the rows by it, so that the
# tail slice taken per ticker further down holds the most recent messages.
df = df.assign(date=pd.to_datetime(df['date'])).sort_values('date')

In [5]:
stocks = ['VTBR', 'GAZP', 'SBER', 'NVTK', 'ROSN']

# Number of most recent rows per ticker held out as the test split.
TEST_SIZE = 1000
split_idx = -TEST_SIZE

hf_datasets = {}

for stock in stocks:
    label_column = f'1 мин. {stock} close_bin'

    # Keep the rows flagged for this ticker and only the columns used later,
    # exposing the ticker-specific target under the common name 'label'.
    # `== True` is kept on purpose: the flag column's dtype is not known here.
    stock_df = (
        df.loc[
            df[stock] == True,
            ['date', 'message', 'views', 'forwards', 'fwd_from', label_column],
        ]
        .rename(columns={label_column: 'label'})
        .sort_values('date')
    )

    # With TEST_SIZE rows or fewer the tail slice would swallow everything and
    # leave an empty train split, so skip the ticker (this also covers len == 0).
    if len(stock_df) <= TEST_SIZE:
        print(f'Skipping {stock}: {len(stock_df)} rows is not enough for a {TEST_SIZE}-row test split')
        continue

    # Chronological split: everything before the last TEST_SIZE rows is train.
    train_data = stock_df.iloc[:split_idx]
    test_data = stock_df.iloc[split_idx:]

    # preserve_index=False stops the filtered pandas index from being stored
    # as an extra '__index_level_0__' column in the datasets.
    hf_datasets[stock] = DatasetDict({
        'train': Dataset.from_pandas(train_data, preserve_index=False),
        'test': Dataset.from_pandas(test_data, preserve_index=False),
    })

In [6]:
# Collect the per-ticker train/test splits into one DatasetDict keyed by ticker.
dataset = DatasetDict({ticker: splits for ticker, splits in hf_datasets.items()})

In [7]:
# Persist the nested ticker -> {'train', 'test'} datasets under final_data/hf_dataset.
dataset.save_to_disk('final_data/hf_dataset')

Saving the dataset (1/1 shards): 100%|██████████| 24916/24916 [00:00<00:00, 93324.62 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1000/1000 [00:00<00:00, 53117.33 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 58789/58789 [00:00<00:00, 114196.51 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1000/1000 [00:00<00:00, 17422.98 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 27568/27568 [00:00<00:00, 97454.32 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1000/1000 [00:00<00:00, 46964.48 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 9172/9172 [00:00<00:00, 106174.65 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1000/1000 [00:00<00:00, 27797.65 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 12647/12647 [00:00<00:00, 91679.92 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 1000/1000 [00:00<00:00, 28391.30 examples/s]


In [9]:
# Reload the datasets written by `dataset.save_to_disk`. What was saved is a
# DatasetDict whose values are themselves DatasetDicts, so every ticker lives in
# its own sub-directory; load those one by one and reassemble the mapping.
# NOTE(review): a single load_from_disk on the root appears to expect each
# split to be a flat Dataset and may reject this nested layout — confirm
# against the installed `datasets` version.
_hf_dataset_root = 'final_data/hf_dataset'
dataset = DatasetDict({
    entry: load_from_disk(os.path.join(_hf_dataset_root, entry))
    for entry in sorted(os.listdir(_hf_dataset_root))
    if os.path.isdir(os.path.join(_hf_dataset_root, entry))
})

In [8]:
# Base checkpoint passed to every train_for_company call below.
model_name = "DeepPavlov/distilrubert-base-cased-conversational"

In [9]:
# Make sure the directories used for saved models and training logs exist.
for output_dir in ('models', 'logs'):
    os.makedirs(output_dir, exist_ok=True)

In [17]:
# Fine-tune and save the VTBR model.
# NOTE(review): in the logged run below, validation loss climbs from 0.73 to 1.73
# while training loss keeps falling and accuracy stays around 0.5 — this looks
# like overfitting rather than learning a usable signal.
model = train_for_company(company='VTBR', hf_dataset=dataset, model_name=model_name)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/distilrubert-base-cased-conversational and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1,329,410 || all params: 136,655,620 || trainable%: 0.9728


Map: 100%|██████████| 24916/24916 [00:03<00:00, 7075.79 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 7289.36 examples/s]


Step,Training Loss,Validation Loss,Accuracy
400,0.6614,0.727608,0.526
800,0.6158,0.780309,0.473
1200,0.5754,0.793988,0.534
1600,0.5083,0.94544,0.504
2000,0.4292,1.031902,0.478
2400,0.3545,1.261414,0.503
2800,0.295,1.424311,0.479
3200,0.2442,1.503634,0.497
3600,0.1999,1.718668,0.51
4000,0.1737,1.727441,0.487


In [10]:
# Fine-tune and save the GAZP model.
# NOTE(review): in the logged run below, validation loss drifts up from 0.72 to
# 0.85 and accuracy ranges between 0.42 and 0.534, i.e. close to chance.
model = train_for_company(company='GAZP', hf_dataset=dataset, model_name=model_name)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/distilrubert-base-cased-conversational and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1,329,410 || all params: 136,655,620 || trainable%: 0.9728


Map: 100%|██████████| 58789/58789 [00:08<00:00, 6774.57 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 6472.18 examples/s]


Step,Training Loss,Validation Loss,Accuracy
400,0.6754,0.723992,0.447
800,0.64,0.71817,0.42
1200,0.624,0.720203,0.44
1600,0.614,0.704411,0.527
2000,0.5964,0.753997,0.444
2400,0.5879,0.731755,0.493
2800,0.5733,0.748886,0.477
3200,0.5396,0.764351,0.528
3600,0.5282,0.785062,0.513
4000,0.4808,0.846829,0.534


In [None]:
# Fine-tune and save the SBER model.
# NOTE(review): the captured log below stops at step 2800 and the cell has no
# execution count — presumably the run was interrupted; confirm before relying
# on anything under ./models/SBER.
model = train_for_company(company='SBER', hf_dataset=dataset, model_name=model_name)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/distilrubert-base-cased-conversational and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1,329,410 || all params: 136,655,620 || trainable%: 0.9728


Map: 100%|██████████| 27568/27568 [00:04<00:00, 5819.21 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 5173.59 examples/s]


Step,Training Loss,Validation Loss,Accuracy
400,0.653,0.720183,0.498
800,0.6076,0.72389,0.508
1200,0.5747,0.767928,0.499
1600,0.5241,0.783316,0.493
2000,0.463,0.862782,0.501
2400,0.4016,1.01855,0.51
2800,0.3501,1.097298,0.487


In [None]:
# Fine-tune and save the NVTK model (no output is recorded for this cell).
model = train_for_company(company='NVTK', hf_dataset=dataset, model_name=model_name)

In [None]:
# Fine-tune and save the ROSN model (no output is recorded for this cell).
model = train_for_company(company='ROSN', hf_dataset=dataset, model_name=model_name)