## **Libraries**

In [None]:
!pip install transformers
!pip install catboost

In [4]:
import numpy as np
import pandas as pd

import transformers
from transformers import BertTokenizer, BertModel

import torch
from catboost import CatBoostClassifier

In [5]:
# Run BERT on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

## **Base model**

In [6]:
# Tokenizer and encoder are loaded from the same pretrained checkpoint so the
# vocabulary matches the model's embedding table.
tokenizer = BertTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")

# Load the bare encoder first, then move its weights onto the selected device.
# The result is assigned (not left as a bare expression) so the cell prints no repr.
model = BertModel.from_pretrained("DeepPavlov/rubert-base-cased")
model = model.to(device)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.65M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## **Data and embeddings**

In [7]:
# Training set; later cells read its "answer" (text) and "label" columns.
data = pd.read_csv(filepath_or_buffer="/content/train_r.csv")

def get_bert_embeddings(text, max_length=512):
    """Embed ``text`` with the BERT encoder and return the first-token vector.

    Args:
        text: Input passed straight to the tokenizer (a string in this
            notebook's usage).
        max_length: Maximum number of tokens kept after truncation. It is
            clamped to the model's ``max_position_embeddings`` so that the
            encoder is never fed more positions than it has embeddings for.

    Returns:
        A 2-D NumPy array (one row per tokenized sequence) holding the
        last-layer hidden state at token position 0 (the position where BERT
        tokenizers place ``[CLS]``).
    """
    # A length above the model's position table would fail inside the encoder
    # for long texts, so cap the requested length at what the model supports.
    token_limit = min(max_length, model.config.max_position_embeddings)
    inputs = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=token_limit,
    ).to(device)

    # Pure inference: disable autograd so no graph is recorded per call.
    with torch.no_grad():
        outputs = model(**inputs)

    embeddings = outputs.last_hidden_state[:, 0, :].detach().cpu().numpy()
    return embeddings

# Embed every answer and flatten each result to a 1-D vector in a single pass,
# so each row ends up holding one flat feature array.
data["embeddings"] = data["answer"].apply(
    lambda answer_text: get_bert_embeddings(answer_text).reshape(-1)
)

# Binary target: "ai" -> 0, "people" -> 1. Any label outside this mapping
# becomes NaN, which is how Series.map treats unmapped values.
label_to_id = {"ai": 0, "people": 1}
data["label_encoded"] = data["label"].map(label_to_id)

## **Classifier**

In [8]:
# Gradient-boosted classifier trained on the flattened BERT embeddings.
# Train on the GPU only when CUDA is actually available, mirroring the
# CPU fallback used for `device`; a hard-coded 'GPU' fails on CPU-only runtimes.
# NOTE(review): torch's CUDA check is used as a proxy for CatBoost GPU support.
catboost_task_type = 'GPU' if torch.cuda.is_available() else 'CPU'

catboost_model = CatBoostClassifier(verbose=350,  # log every 350 iterations
                                    loss_function='Logloss',
                                    task_type=catboost_task_type,
                                    iterations=2600,
                                    depth=6)

# One feature row per sample (the per-row embedding arrays), fitted against the 0/1 labels.
# Kept as the cell's last expression so the fitted model's repr is still displayed.
catboost_model.fit(data["embeddings"].tolist(), data["label_encoded"])

Learning rate set to 0.012959
0:	learn: 0.6876976	total: 135ms	remaining: 5m 50s
350:	learn: 0.3359366	total: 23.5s	remaining: 2m 30s
700:	learn: 0.2751348	total: 46s	remaining: 2m 4s
1050:	learn: 0.2350586	total: 1m 7s	remaining: 1m 38s
1400:	learn: 0.2041647	total: 1m 29s	remaining: 1m 16s
1750:	learn: 0.1799684	total: 1m 51s	remaining: 54.3s
2100:	learn: 0.1604144	total: 2m 12s	remaining: 31.5s
2450:	learn: 0.1435564	total: 2m 35s	remaining: 9.45s
2599:	learn: 0.1372852	total: 2m 44s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f3803d805b0>

## **Submit**

In [10]:
# Embed the public test answers exactly like the training answers:
# one flattened vector per row.
test_data = pd.read_csv('/content/public_test.csv')
test_data["embeddings"] = test_data["answer"].apply(
    lambda answer_text: get_bert_embeddings(answer_text).reshape(-1)
)

# Predict class ids, then translate them back to the original string labels
# (the inverse of the mapping used to encode the training labels).
predictions = catboost_model.predict(test_data["embeddings"].tolist())
predictions_labels = pd.Series(predictions).map({0: "ai", 1: "people"})

# Write a single-column CSV with a "label" header and no index column.
predictions_labels.to_csv("predictions.csv", header=["label"], index=False)