In [17]:
import pandas as pd

import re

from sklearn.metrics import roc_auc_score
from transformers import pipeline as pipe
from typing import Union, Tuple, List, Match
import torch
from lib.config import model_path, device, CONTACT
from tqdm.auto import tqdm

import warnings

warnings.filterwarnings('ignore')

# Задача 1

## BERT tiny

In [5]:
# Show the model path and inference device imported from lib.config.
model_path, device

('./bert_final_model', device(type='cpu'))

In [2]:
# The train split is left commented out; none of the cells shown here read it.
# train = pd.read_csv('data/train.csv')
# Validation split; later cells read its `description`, `is_bad` and `category` columns.
val = pd.read_csv('data/val.csv')

In [3]:
def predict_proba(text: str, pipeline) -> float:
    """Return the probability that ``text`` contains a contact.

    Args:
        text: Text to classify.
        pipeline: Callable that takes the text and returns a sequence whose
            first element is a mapping with ``"label"`` and ``"score"`` keys.

    Returns:
        The reported score, or ``1 - score`` when the predicted label is
        ``"LABEL_0"`` (so the value always refers to the other label).
    """
    top = pipeline(text)[0]
    label, score = top["label"], top["score"]
    # The score belongs to the predicted label; flip it when that label is LABEL_0.
    return 1 - score if label == "LABEL_0" else score

def task1(description: pd.Series) -> List[float]:
    """Score every text in ``description`` with the contact classifier.

    A "text-classification" pipeline is built from ``model_path`` on ``device``
    and applied to the texts one at a time, with a tqdm progress bar.

    Args:
        description: Texts to score.

    Returns:
        One ``predict_proba`` value per text, in input order.
    """
    classifier = pipe("text-classification", model=model_path, device=device)
    scores = []
    for text in tqdm(description):
        scores.append(predict_proba(text, classifier))
    return scores

In [6]:
# Score every validation description; reuse val's index so predictions can be
# selected with the same labels as the ground-truth column.
y_pred = pd.Series(task1(val.description), index=val.index)

  0%|          | 0/16237 [00:00<?, ?it/s]

In [8]:
# Per-category ROC AUC of the contact classifier on the validation split,
# followed by the unweighted mean over categories.
y_test = val.is_bad
# Sorted unique category names. NaN categories are dropped: the equality filter
# below can never match them, which would hand roc_auc_score an empty slice.
categories = sorted(val.category.dropna().unique())
calegories = categories  # alias kept for the original (misspelled) variable name
roc_auc_category = {}

for cat in categories:
    idx = val[val.category == cat].index
    roc_auc = roc_auc_score(y_test[idx], y_pred[idx])
    roc_auc_category[cat] = roc_auc
    print(f'{cat} - {roc_auc:0.2f}')

# pandas computes the mean so this cell does not depend on numpy, which the
# import cell never brings into scope (NameError on Restart & Run All).
print(f'\nROC_AUC = {pd.Series(roc_auc_category, dtype=float).mean():.2f}')

Бытовая электроника - 0.97
Для бизнеса - 0.97
Для дома и дачи - 0.97
Животные - 0.96
Личные вещи - 0.89
Недвижимость - 0.99
Работа - 0.96
Транспорт - 1.00
Услуги - 0.95
Хобби и отдых - 0.95

ROC_AUC = 0.96


# Задача 2
## Поиск позиции контактов

In [12]:
# Show the contact regex bundle imported from lib.config; its sub-patterns are
# separated by ';' and split apart in get_regular.
CONTACT

'(?:(\\+?[7|8|9])([\\-\\(\\)а-яА-Яa-zA-Z_ ]{0,10}))?(\\d{3})([\\-\\(\\)а-яА-Яa-zA-Z_ ]{0,10})(\\d{3})([\\-\\(\\)а-яА-Яa-zA-Z_ ]{0,10})(\\d{2})([\\-\\(\\)а-яА-Яa-zA-Z_ ]{0,10})(\\d{2});(@\\w{4,32})|(?:(https?:\\/?\\/)?t(elegram)?\\.me\\/(\\w{5,}));(http:\\/?\\/?|https:\\/?\\/?)?(www.)?(vk\\.com|vkontakte\\.ru)\\/(id\\d|[a-zA-Z0-9_.]){2,}'

In [18]:
def get_regular(text: str, regular: str) -> List[Match[str]]:
    """Collect every match of every sub-pattern of ``regular`` in ``text``.

    Args:
        text: String to search.
        regular: One or more regular expressions joined with ``";"``.

    Returns:
        Match objects grouped by sub-pattern in the order the sub-patterns
        appear in ``regular``; within a sub-pattern, in order of position.
    """
    return [
        found
        for pattern in regular.split(";")
        for found in re.finditer(pattern, text)
    ]


def task2(
    description: str, regular=CONTACT
) -> Union[Tuple[int, int], Tuple[None, None]]:
    """Locate the single contact in ``description``.

    Args:
        description: Text to search.
        regular: ``";"``-separated regular expressions passed to
            ``get_regular``; defaults to ``CONTACT``.

    Returns:
        ``(start, finish)`` of the match, where ``finish`` is the index of the
        last matched character (inclusive), when exactly one match is found
        across all sub-patterns; ``(None, None)`` for zero or several matches.
    """
    matches = get_regular(description, regular)
    if len(matches) != 1:
        return None, None
    (only,) = matches
    # Match.end() is exclusive; subtract one to report an inclusive end position.
    return only.start(), only.end() - 1

In [19]:
# Contact span for every validation description, one row per description.
# The (start, finish) tuples go into the frame in a single constructor call:
# this avoids building one pd.Series per row inside `apply`, and it pairs rows
# by position, so the result does not depend on val's index labels matching a
# fresh RangeIndex.
spans = [task2(description=text) for text in val["description"]]
task2_prediction = pd.DataFrame(spans, columns=["start", "finish"])
# Keep the original row labels of `val` as an explicit first column.
task2_prediction.insert(0, "index", val.index)

In [20]:
# Preview the first five predicted spans.
task2_prediction.head()

Unnamed: 0,index,start,finish
0,0,8.0,18.0
1,1,,
2,2,263.0,277.0
3,3,,
4,4,,
