In [1]:
import pandas as pd

import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from torch import cuda

from transformers import BertTokenizer, BertModel

import re

from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestCentroid
from sklearn.metrics import accuracy_score

from nltk.corpus import stopwords

import gensim
from gensim.utils import simple_preprocess

import warnings
warnings.filterwarnings('ignore')

2022-07-22 16:24:44.578886: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-07-22 16:24:44.743995: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-07-22 16:24:44.751253: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-07-22 16:24:44.751273: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if yo

In [2]:
device = 'cuda' if cuda.is_available() else 'cpu'

## Общий препроцессинг корпуса

In [3]:
df = pd.read_csv("train.csv")
df = df.rename(columns={'Текст Сообщения':'text', 'Категория': 'label'})
test_dataset = pd.read_csv("test.csv")
test_dataset = test_dataset.rename(columns={'Текст Сообщения':'text'})
df = pd.concat([df, test_dataset])

In [4]:
CLEANR = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')

def cleanhtml(raw_html):
    '''Очистка текста от html'''
    cleantext = re.sub(CLEANR, '', raw_html)
    return cleantext

df['text'] = df['text'].apply(cleanhtml)

In [5]:
# LabelEncoder для столбца с тематикой
le_theme = preprocessing.LabelEncoder()
le_theme.fit(df['Тематика'])
# df['Тематика'] = le_theme.transform(df['Тематика'])

LabelEncoder()

In [6]:
stop_words = stopwords.words('russian')

def strip_newline(series):
    '''Разделитель строк'''
    return [review.replace('\n','') for review in series]

def sent_to_words(sentences):
    '''Токенайзер'''
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

def remove_stopwords(texts):
    '''Очистка от стопслов'''
    out = [[word for word in simple_preprocess(str(doc))
            if word not in stop_words]
            for doc in texts]
    return out

def bigrams(words, bi_min=5):
    '''Выбор слов на основании биграм'''
    bigram = gensim.models.Phrases(words, min_count = bi_min)
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    return bigram_mod

def get_corpus(df):
    '''Подготовка корпуса'''
    df['text'] = strip_newline(df.text)
    words = list(sent_to_words(df.text))
    words = remove_stopwords(words)
    bigram_mod = bigrams(words)
    bigram = [bigram_mod[review] for review in words]    
    return bigram

In [7]:
bigram_train = get_corpus(df)

In [8]:
#Преобразование текстов в разреженную матрицу весов TF/IDF
def identity_tokenizer(text):
    return text

tfidf = TfidfVectorizer(tokenizer=identity_tokenizer, lowercase=False)    
tfidf = tfidf.fit(bigram_train)

## Препроцессинг train

In [9]:
df = pd.read_csv("train.csv")
df = df.rename(columns={'Текст Сообщения':'text', 'Категория': 'label'})
df['text'] = df['text'].apply(cleanhtml)

In [10]:
#Upsampling обучающей выборки
MAX_TEXT_LENGTH = 256 #Максимальная длина текста 
head_part = df["text"].str.slice(0, MAX_TEXT_LENGTH)
tail_part = df[df["text"].str.len() > MAX_TEXT_LENGTH]["text"].str.slice(MAX_TEXT_LENGTH)

while len(tail_part):
    head_part = pd.concat([head_part, tail_part.str.slice(0, MAX_TEXT_LENGTH)])
    tail_part = tail_part[tail_part.str.len() > MAX_TEXT_LENGTH].str.slice(MAX_TEXT_LENGTH)

df = df[["label", "Ответственное лицо", "Тематика"]].merge(pd.concat([head_part, tail_part]), left_index=True, right_index=True)

In [11]:
df['Тематика'] = le_theme.transform(df['Тематика']) 

In [12]:
bigram_train = get_corpus(df)

In [13]:
X_train_tfidf = tfidf.transform(bigram_train)

In [14]:
X_train_tfidf.shape

(3848, 22175)

In [15]:
def get_centroid(sparse, serie):
    '''Определение ближайшего центроида кластера тематики к тексту'''
    res=[]
    clf = NearestCentroid()
    clf.fit(sparse, serie)
    for row in sparse:
        res.append(clf.predict(row)[0])
    return res

In [16]:
df['theme'] = le_theme.inverse_transform(get_centroid(X_train_tfidf, df['Тематика']))

In [17]:
accuracy_score(df['Тематика'], le_theme.transform(df['theme']))

0.9397089397089398

In [18]:
#Объединяем текст с колонкой theme, а не с колонкой тематики, т.к. она теперь носит условный характер
df['text'] = df['theme']+' '+df["text"]
df = df[['text', 'label']]

In [19]:
df

Unnamed: 0,text,label
0,"Нарушения, связанные с содержанием электросети...",3
1,Аварийные деревья По фасаду дома по адресу ул....,3
1,Аварийные деревья жет просто сломаться. А это ...,3
1,"Аварийные деревья ахнуть газом, будет уже позд...",3
2,Безнадзорные животные Агресивные собаки. На ра...,1
...,...,...
1996,Парковки на дорогах в границах городских округ...,0
1996,Парковки на дорогах в границах городских округ...,0
1997,Аварийные деревья Состоят 3 засохшие дерева (2...,3
1998,Нарушение дорожного покрытия (ямы) на дорогах...,0


## Препроцессинг test

In [20]:
test_data = pd.read_csv("test.csv")
test_data = test_data.rename(columns={'Текст Сообщения':'text'})
test_data['text'] = test_data['text'].apply(cleanhtml)
bigram_train = get_corpus(test_dataset)
X_test_tfidf = tfidf.transform(bigram_train)

In [21]:
test_data['Тематика'] = le_theme.transform(test_data['Тематика'])

In [22]:
test_data['theme'] = le_theme.inverse_transform(get_centroid(X_test_tfidf, test_data['Тематика']))

In [23]:
#Точность с которой мы определяем тематику
accuracy_score(test_dataset['Тематика'], test_data['theme'])

0.991

In [24]:
#Выбираем только колонку с текстом, чтобы в тест не попали лики
test_data['text'] = test_data['theme']+' '+test_data["text"] 
test_data = test_data[['text']]
test_data['ENCODE_CAT'] = 0

## BERT Finetuning

In [25]:
encode_dict = {}

def encode_cat(x):
    """Словарь кодировки лейблов"""
    if x not in encode_dict.keys():
        encode_dict[x]=len(encode_dict)
    return encode_dict[x]

df['ENCODE_CAT'] = df['label'].apply(lambda x: encode_cat(x))
df = df[['text', 'ENCODE_CAT']]

In [26]:
#Основные параметры для обучения Bert
MAX_LEN = 256 
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 2
EPOCHS = 10
LEARNING_RATE = 1e-05
tokenizer = BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased-conversational')

In [27]:
class Triage(Dataset):
    '''Даталоадер'''
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        title = str(self.data.text[index])
        title = " ".join(title.split())
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.data.ENCODE_CAT[index], dtype=torch.long)
        } 
    
    def __len__(self):
        return self.len

In [28]:
train_size = 0.87
train_dataset=df.sample(frac=train_size,random_state=200)
test_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = Triage(train_dataset, tokenizer, MAX_LEN)
testing_set = Triage(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (3848, 2)
TRAIN Dataset: (3348, 2)
TEST Dataset: (150, 2)


In [29]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [30]:

class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = BertModel.from_pretrained('DeepPavlov/rubert-base-cased-conversational')
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 17)

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [31]:
# device = 'cpu'
model = BERTClass()
model.to(device)

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased-conversational were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
   

In [32]:
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [33]:
def calcuate_accu(big_idx, targets):
    n_correct = (big_idx==targets).sum().item()
    return n_correct

In [34]:
def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accu(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)
        
        if _%5000==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples 
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return 

In [35]:
for epoch in range(EPOCHS):
    train(epoch)

Training Loss per 5000 steps: 2.783912420272827
Training Accuracy per 5000 steps: 0.0
The Total Accuracy for Epoch 0: 79.62962962962963
Training Loss Epoch: 0.9143766787498834
Training Accuracy Epoch: 79.62962962962963
Training Loss per 5000 steps: 0.16351543366909027
Training Accuracy per 5000 steps: 100.0
The Total Accuracy for Epoch 1: 94.35483870967742
Training Loss Epoch: 0.300846173981458
Training Accuracy Epoch: 94.35483870967742
Training Loss per 5000 steps: 0.05974167585372925
Training Accuracy per 5000 steps: 100.0
The Total Accuracy for Epoch 2: 95.48984468339307
Training Loss Epoch: 0.2117280488331449
Training Accuracy Epoch: 95.48984468339307
Training Loss per 5000 steps: 0.14980089664459229
Training Accuracy per 5000 steps: 100.0
The Total Accuracy for Epoch 3: 96.50537634408602
Training Loss Epoch: 0.15211264612619946
Training Accuracy Epoch: 96.50537634408602
Training Loss per 5000 steps: 0.06378297507762909
Training Accuracy per 5000 steps: 100.0
The Total Accuracy for

In [36]:
# Валидация для test

'''def valid(model, testing_loader):
    model.eval()
    n_correct = 0; n_wrong = 0; total = 0
    tr_loss = 0
    nb_tr_steps=0
    nb_tr_examples=0
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            outputs = model(ids, mask).squeeze()
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calcuate_accu(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)
            
            if _%5000==0:
                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct*100)/nb_tr_examples
                print(f"Validation Loss per 100 steps: {loss_step}")
                print(f"Validation Accuracy per 100 steps: {accu_step}")
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")
    
    return epoch_accu'''

'def valid(model, testing_loader):\n    model.eval()\n    n_correct = 0; n_wrong = 0; total = 0\n    tr_loss = 0\n    nb_tr_steps=0\n    nb_tr_examples=0\n    with torch.no_grad():\n        for _, data in enumerate(testing_loader, 0):\n            ids = data[\'ids\'].to(device, dtype = torch.long)\n            mask = data[\'mask\'].to(device, dtype = torch.long)\n            targets = data[\'targets\'].to(device, dtype = torch.long)\n            outputs = model(ids, mask).squeeze()\n            loss = loss_function(outputs, targets)\n            tr_loss += loss.item()\n            big_val, big_idx = torch.max(outputs.data, dim=1)\n            n_correct += calcuate_accu(big_idx, targets)\n\n            nb_tr_steps += 1\n            nb_tr_examples+=targets.size(0)\n            \n            if _%5000==0:\n                loss_step = tr_loss/nb_tr_steps\n                accu_step = (n_correct*100)/nb_tr_examples\n                print(f"Validation Loss per 100 steps: {loss_step}")\n      

In [37]:
output_model_file = 'pytorch_bert.bin'
output_vocab_file = 'vocab_bert.bin'

model_to_save = model
torch.save(model_to_save, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('Данные сохранены')

Данные сохранены


In [38]:
testing_set = Triage(test_data, tokenizer, MAX_LEN)
test_params = {'batch_size': 8,
                'shuffle': False,
                'num_workers': 0
                }
testing_loader = DataLoader(testing_set, **test_params)

## Валидация

In [39]:
def valid(model, testing_loader):
    model.eval()
    all_preds = []
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            outputs = model(ids, mask).squeeze()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            all_preds.append(big_idx)           
    
    return all_preds

In [40]:
all_preds = valid(model, testing_loader)
flat_list = [x.tolist() for xs in all_preds for x in xs]

In [41]:
new=[]
for search in flat_list:
    for lbl, enc in encode_dict.items():
        if enc == search:
            new.append(lbl)

In [42]:
test_dataset = pd.read_csv("test.csv")
test_dataset = test_dataset[['id']]
test_dataset['Категория']=new
test_dataset.to_csv('sample_solution.csv', index=False)