In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the war-news CSV; the bare trailing expression displays (rows, columns).
df1 = pd.read_csv(filepath_or_buffer='./war-news.csv')
df1.shape

(5653, 6)

In [3]:
# Mirror `Headlines` into the shared `news_headline` column and tag every row
# of this frame with label 1 in `category_num`.
df1 = df1.assign(news_headline=df1['Headlines'], category_num=1)

In [4]:
# Keep only the two columns used downstream. Selecting them by name (instead of
# dropping the six raw columns) makes this cell safe to re-run: a second
# execution no longer raises KeyError for columns that are already gone.
df1 = df1[['news_headline', 'category_num']].copy()
df1.head()

Unnamed: 0,news_headline,category_num
0,I served in Iraq and Afghanistan but the horro...,1
1,The forever war in Afghanistan is nowhere near...,1
2,"Hell at Abbey Gate: Chaos, Confusion and Death...",1
3,�A second Afghanistan�: Doubts over Russia�s w...,1
4,Afghanistan: Former army general vows new war ...,1


In [5]:
# Load the second headline CSV; the bare trailing expression displays (rows, columns).
df2 = pd.read_csv(filepath_or_buffer='./inshort_news_data-1.csv')
df2.shape

(4817, 4)

In [6]:
# Tag every row of this frame with label 0 in `category_num`.
df2 = df2.assign(category_num=0)

In [7]:
# Keep only the two columns used downstream. Selecting them by name (instead of
# dropping the raw columns) makes this cell safe to re-run: a second execution
# no longer raises KeyError for columns that are already gone.
df2 = df2[['news_headline', 'category_num']].copy()
df2.head()

Unnamed: 0,news_headline,category_num
0,50-year-old problem of biology solved by Artif...,0
1,Microsoft Teams to stop working on Internet Ex...,0
2,Hope US won't erect barriers to cooperation: C...,0
3,Global smartphone sales in Q3 falls 5.7% to 36...,0
4,EU hoping Biden will clarify US position on di...,0


In [8]:
# Stack the two labelled frames row-wise into a single dataset and renumber
# the index from 0 so the original per-file indices do not collide.
df = pd.concat(objs=[df1, df2], axis=0, ignore_index=True)
df

Unnamed: 0,news_headline,category_num
0,I served in Iraq and Afghanistan but the horro...,1
1,The forever war in Afghanistan is nowhere near...,1
2,"Hell at Abbey Gate: Chaos, Confusion and Death...",1
3,�A second Afghanistan�: Doubts over Russia�s w...,1
4,Afghanistan: Former army general vows new war ...,1
...,...,...
10465,AstraZeneca to test combining Oxford vaccine w...,0
10466,"Humanity will face 60°C temperatures in 2050s,...",0
10467,Human-made objects to outweigh living things b...,0
10468,"IUCN lists 31 species as 'Extinct', European B...",0


In [9]:
# Write the combined dataset to disk without the synthetic row index.
df.to_csv(path_or_buf='your_dataset.csv', index=False)


In [11]:
# 80/10/10 train/validation/test split: 20% is held out first, then halved.
# Stratifying on `category_num` keeps the class ratio approximately the same in
# every split, so validation/test accuracy is not skewed by an unlucky draw.
train_df, temp_df = train_test_split(
    df, test_size=0.2, random_state=42, stratify=df['category_num']
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, random_state=42, stratify=temp_df['category_num']
)


In [12]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

max_length = 200
# Text preprocessing example: encode one sample sentence, padded/truncated to
# `max_length` tokens and returned as PyTorch tensors.
# NOTE(review): the sample sentence is Russian while the checkpoint is
# 'bert-base-uncased' — confirm this is only meant as a smoke test.
text = "Сегодня был взрыв в Москве"
# Call the tokenizer object directly; `encode_plus` is the deprecated spelling
# of the same operation.
tokens = tokenizer(text, add_special_tokens=True, max_length=max_length, padding='max_length', truncation=True, return_tensors='pt')


  from .autonotebook import tqdm as notebook_tqdm
Downloading tokenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<?, ?B/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 769kB/s]
Downloading tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 1.11MB/s]
Downloading config.json: 100%|██████████| 570/570 [00:00<?, ?B/s] 


In [13]:
import torch
from torch.utils.data import DataLoader, TensorDataset

batch_size = 16

def create_data_loader(df, tokenizer, max_length, batch_size, shuffle=True):
    """Tokenize the headlines in ``df`` and wrap them in a ``DataLoader``.

    Args:
        df: DataFrame with a ``news_headline`` text column and a
            ``category_num`` integer label column.
        tokenizer: Callable tokenizer that accepts a list of strings and
            returns a mapping with ``input_ids`` and ``attention_mask`` tensors.
        max_length: Length every sequence is padded or truncated to.
        batch_size: Number of samples per batch.
        shuffle: Whether the loader reshuffles samples on each pass. Defaults
            to ``True`` (the previous hard-coded behaviour); pass ``False``
            when a stable order is needed, e.g. to align predictions with rows.

    Returns:
        A ``DataLoader`` yielding ``(input_ids, attention_mask, labels)`` tuples.
    """
    # Convert text to tokens. The tokenizer is called directly because
    # `batch_encode_plus` is the deprecated spelling of this call.
    tokens = tokenizer(
        df['news_headline'].tolist(),
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )

    # Integer class indices; made explicit so the dtype does not depend on
    # how the label column happens to be stored.
    labels = torch.tensor(df['category_num'].tolist(), dtype=torch.long)

    # Create the DataLoader.
    dataset = TensorDataset(tokens['input_ids'], tokens['attention_mask'], labels)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)
    return data_loader

# Build one DataLoader per split (train, validation, test) with shared
# tokenizer, sequence length and batch size.
train_loader, val_loader, test_loader = (
    create_data_loader(split_df, tokenizer, max_length, batch_size)
    for split_df in (train_df, val_df, test_df)
)


In [15]:
from transformers import BertForSequenceClassification

# Cross-entropy loss object. The training cell visible in this notebook reads
# `outputs.loss` from the model, so this object is not consumed there.
criterion = torch.nn.CrossEntropyLoss()

learning_rate = 2e-5

# Pretrained 'bert-base-uncased' checkpoint with a 2-label classification head.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate)


Downloading model.safetensors: 100%|██████████| 440M/440M [05:51<00:00, 1.25MB/s] 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
def calculate_accuracy(model, data_loader):
    """Return the fraction of samples in ``data_loader`` classified correctly.

    Args:
        model: Module that is called as ``model(input_ids, attention_mask=...)``
            and returns an object whose ``.logits`` has the classes along dim 1.
        data_loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.

    Returns:
        ``correct / total`` as a float, or ``0.0`` when the loader yields no
        samples (previously this raised ``ZeroDivisionError``).

    Side effects:
        Switches ``model`` to evaluation mode and leaves it there.
    """
    model.eval()

    # Evaluate on whatever device the model lives on, so the function also
    # works after the caller has moved the model off the CPU.
    first_param = next(iter(model.parameters()), None)
    device = first_param.device if first_param is not None else None

    correct = 0
    total = 0

    with torch.no_grad():
        for input_ids, attention_mask, labels in data_loader:
            if device is not None:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)
                labels = labels.to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=1)
            correct += (predictions == labels).sum().item()
            total += len(labels)

    if total == 0:
        # Nothing to score: report 0.0 rather than dividing by zero.
        return 0.0
    return correct / total


In [20]:
num_epochs = 5
# Print the batch loss only every `log_every` steps (plus the first and last
# batch of each epoch) instead of on every step, which floods the cell output.
log_every = 50

# Batches are moved to the device the model currently lives on; on a CPU-only
# setup this is a no-op.
device = next(model.parameters()).device

for epoch in range(num_epochs):
    model.train()
    num_batches = len(train_loader)
    for batch_idx, batch in enumerate(train_loader):
        optimizer.zero_grad()
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
        # Passing `labels` makes the model compute the loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        # Log output (example), throttled.
        step = batch_idx + 1
        if step == 1 or step == num_batches or step % log_every == 0:
            print(f'Epoch: {epoch + 1}/{num_epochs}, Batch: {batch_idx + 1}/{len(train_loader)}, Batch Loss: {loss.item()}')

    # Score the validation split once per epoch.
    val_accuracy = calculate_accuracy(model, val_loader)
    print(f'Epoch: {epoch + 1}/{num_epochs}, Validation Accuracy: {val_accuracy}')


Epoch: 1/5, Batch: 1/524, Batch Loss: 0.07729209214448929


KeyboardInterrupt: 

In [None]:
# Collect predictions for the WHOLE validation set. Previously `val_preds` was
# overwritten on every iteration, so only the last batch's predictions survived.
model.eval()
device = next(model.parameters()).device

pred_chunks = []
label_chunks = []
with torch.no_grad():
    for batch in val_loader:
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
        outputs = model(input_ids, attention_mask=attention_mask)
        pred_chunks.append(torch.argmax(outputs.logits, dim=1).cpu())
        # The loader may shuffle, so keep the labels in the same order as the
        # predictions to allow a like-for-like comparison afterwards.
        label_chunks.append(labels.cpu())

# `torch.cat` rejects an empty list, so fall back to empty tensors.
val_preds = torch.cat(pred_chunks) if pred_chunks else torch.empty(0, dtype=torch.long)
val_labels = torch.cat(label_chunks) if label_chunks else torch.empty(0, dtype=torch.long)


In [None]:
# Persist only the model's state dict (its parameters and buffers) to disk.
torch.save(obj=model.state_dict(), f='bert_model.pth')