# Задача 1


В первой задаче необходимо оценить вероятность наличия в объявлении контактной информации. 
Результатом работы модели является `pd.DataFrame` с колонками:
* `index`: `int`, положение записи в файле;
* `prediction`: `float` от 0 до 1.

# Решение
## Импорт данных

In [20]:
import re
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier

import pickle

from scipy.sparse import csr_matrix
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import make_column_transformer


from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from scipy import sparse

# Fixed seed; passed as `random_state` to the models built further down.
RANDOM_STATE = 42

# Silence every warning for the rest of the session to keep cell output short.
# NOTE(review): this blanket filter also hides convergence and deprecation
# warnings raised by the libraries used below — confirm that is intended.
import warnings
warnings.filterwarnings('ignore')

In [21]:
# Column dtypes enforced while parsing the CSV files: the four categorical
# attributes are read as pandas categoricals, the price as a 32-bit float and
# the target `is_bad` as a boolean.
dtype_dict = dict(
    subcategory='category',
    category='category',
    price='float32',
    region='category',
    city='category',
    is_bad='bool',
)

# Both splits are parsed with exactly the same dtype mapping.
train, val = (
    pd.read_csv(csv_path, dtype=dtype_dict)
    for csv_path in ('data/train.csv', 'data/val.csv')
)

In [22]:
# Summarise the training frame: column dtypes, non-null counts, memory usage.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 984487 entries, 0 to 984486
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype   
---  ------              --------------   -----   
 0   title               984487 non-null  object  
 1   description         984487 non-null  object  
 2   subcategory         984487 non-null  category
 3   category            984487 non-null  category
 4   price               926439 non-null  float32 
 5   region              984487 non-null  category
 6   city                984487 non-null  category
 7   datetime_submitted  984487 non-null  object  
 8   is_bad              984487 non-null  bool    
dtypes: bool(1), category(4), float32(1), object(3)
memory usage: 32.1+ MB


In [23]:
# Same summary for the validation frame.
val.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16237 entries, 0 to 16236
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   title               16237 non-null  object  
 1   description         16237 non-null  object  
 2   subcategory         16237 non-null  category
 3   category            16237 non-null  category
 4   price               15374 non-null  float32 
 5   region              16237 non-null  category
 6   city                16237 non-null  category
 7   datetime_submitted  16237 non-null  object  
 8   is_bad              16237 non-null  bool    
dtypes: bool(1), category(4), float32(1), object(3)
memory usage: 586.3+ KB


## Преобразование признаков

- Добавим признак `flg_col` - информация о найденном контакте при помощи регулярного выражения
- Преобразуем `datetime_submitted` (в текущей версии не реализовано: в пайплайнах ниже этот столбец не используется)
- Закодируем категориальные признаки (для линейной модели)
- Для простоты на этапе бейзлайна заполним пропуски в `price` медианой
- Преобразуем в векторы столбцы `description` и `title` при помощи Tfidf

In [24]:
# Regular expressions for contact details that may be embedded in ad text.
# They are plain pattern strings (not compiled) because they are joined into
# a single alternation below.

# Filler allowed between the digit groups of a phone number: up to ten
# dashes, parentheses, letters, underscores or spaces.
_SEP = r"([\-\(\)а-яА-Яa-zA-Z_ ]{0,10})"

# Phone number: an optional leading "+7" / "8" / "9"-style prefix followed by
# digit groups of 3-3-2-2, each pair of groups optionally separated by filler.
# The prefix class is `[789]`; the previous `[7|8|9]` also accepted a
# literal "|" because alternation bars have no meaning inside a class.
phone = (
    r"(?:(\+?[789])" + _SEP + r")?"
    + _SEP.join([r"(\d{3})", r"(\d{3})", r"(\d{2})", r"(\d{2})"])
)

# "@handle" mentions and t.me / telegram.me links.
username = r"(@\w{4,32})|(?:(https?:\/?\/)?t(elegram)?\.me\/(\w{5,}))"

# vk.com / vkontakte.ru profile links. The dot in "www." is escaped so that
# it matches only a literal dot rather than any character.
vk = r"(http:\/?\/?|https:\/?\/?)?(www\.)?(vk\.com|vkontakte\.ru)\/(id\d|[a-zA-Z0-9_.]){2,}"


# Any of the above counts as contact information.
CONTACT = '|'.join([phone, username, vk])

In [25]:
class FindRegex(BaseEstimator, TransformerMixin):
    """Stateless transformer that flags texts matching a regular expression.

    Each input element is lower-cased and searched with ``re.search``; the
    output holds 1 where the pattern was found and 0 elsewhere.

    Parameters
    ----------
    pattern : str
        Regular expression to look for. Matching is done against the
        lower-cased text.
    **kwargs
        Forwarded unchanged to the parent ``__init__``.
    """

    def __init__(self, pattern, **kwargs):
        self.pattern = pattern
        super().__init__(**kwargs)

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer has no state to learn."""
        return self

    def find_regex(self, data: str):
        """Return 1 if ``self.pattern`` occurs in the lower-cased ``data``, else 0.

        Values that are not strings (for example NaN standing in for a
        missing text) are reported as 0 instead of raising
        ``AttributeError`` on ``.lower()``.
        """
        if not isinstance(data, str):
            return 0
        return int(re.search(self.pattern, data.lower()) is not None)

    def transform(self, X, y=None):
        """Compute the match flag for every element of ``X``.

        Parameters
        ----------
        X : iterable of str
            One-dimensional collection of texts, e.g. a ``pandas.Series``.
        y : ignored

        Returns
        -------
        scipy.sparse matrix of shape (n_samples, 1)
            Column of 0/1 flags in the same order as ``X``.
        """
        # Plain iteration instead of ``X.apply`` so that any 1-D iterable
        # works and an empty input still produces an integer column.
        flags = np.fromiter((self.find_regex(text) for text in X),
                            dtype=np.int64)
        # csr_matrix treats a 1-D array as a single row; transpose it into
        # one column so it can be stacked next to the other feature blocks.
        return csr_matrix(flags).T

    def get_feature_names(self):
        """Return the name of the single output column (legacy scikit-learn API)."""
        return ['flg_col']

    def get_feature_names_out(self, input_features=None):
        """Return the output column name as an array (current scikit-learn API).

        ``input_features`` is accepted for API compatibility and ignored.
        """
        return np.asarray(self.get_feature_names(), dtype=object)

Для простоты построим pipeline, чтобы далее его можно было удобно применять на тестовых данных

In [26]:
%%time

pipeline = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore',
                   drop='first'), ['subcategory', 'category', 'region', 'city']),
    (SimpleImputer(strategy='median'), ['price']),
    (FindRegex(pattern=CONTACT), 'description'),
    (TfidfVectorizer(min_df=1000, ngram_range=(1, 2),
                     max_features=1000), 'description'),
    (TfidfVectorizer(min_df=1000, ngram_range=(1, 2),
                     max_features=500), 'title'))

X_train = pipeline.fit_transform(train.drop(columns=['is_bad']))
X_test = pipeline.transform(val.drop(columns=['is_bad']))

y_train = train.is_bad
y_test = val.is_bad

CPU times: user 9min 23s, sys: 3min 18s, total: 12min 42s
Wall time: 13min 59s


Сохраним результат

In [27]:
# sparse.save_npz("X_train.npz", X_train)
# sparse.save_npz("X_test.npz", X_test)

In [28]:
# X_train = sparse.load_npz("X_train.npz")
# X_test = sparse.load_npz("X_test.npz")

## Тренировка модели

### LogisticRegression

In [29]:
# Linear baseline: scale the features without centering (with_mean=False),
# then fit a class-weighted logistic regression.
# NOTE(review): the solver runs with its default iteration limit and warnings
# are suppressed earlier in the notebook — confirm that it converges.
baseline_lr = Pipeline([
    ('sc', StandardScaler(with_mean=False)),
    ('classifier', LogisticRegression(class_weight='balanced',
                                      random_state=RANDOM_STATE)),
])

In [30]:
%%time
# Fit the scaler and the logistic regression on the training feature matrix.
baseline_lr.fit(X_train, y_train)

CPU times: user 48.3 s, sys: 3.94 s, total: 52.2 s
Wall time: 21.9 s


Pipeline(steps=[('sc', StandardScaler(with_mean=False)),
                ('classifier',
                 LogisticRegression(class_weight='balanced', random_state=42))])

### Результаты

Посмотрим на значения метрики в каждой из категорий, а также на среднее по категориям значение `ROC AUC`

In [31]:
# Per-category ROC AUC of the logistic-regression baseline on the validation
# split, followed by the unweighted mean over the scored categories.

# Probability of the positive class, aligned with the validation index.
y_pred = pd.Series(baseline_lr.predict_proba(X_test)[:, 1], index=y_test.index)
calegories = np.unique(train.category.tolist())
roc_auc_category = {}

for cat in calegories:
    # Index labels of the validation rows that belong to this category.
    idx = val.index[(val.category == cat).to_numpy()]
    # roc_auc_score raises ValueError when y_true is empty or holds a single
    # class, so such categories are reported and left out of the mean.
    if y_test.loc[idx].nunique() < 2:
        print(f'{cat} - n/a')
        continue
    roc_auc = roc_auc_score(y_test.loc[idx], y_pred.loc[idx])
    roc_auc_category[cat] = roc_auc
    print(f'{cat} - {roc_auc:0.2f}')

print(f'\nROC_AUC = {np.mean(list(roc_auc_category.values())):.2f}')

Бытовая электроника - 0.89
Для бизнеса - 0.75
Для дома и дачи - 0.88
Животные - 0.83
Личные вещи - 0.84
Недвижимость - 0.93
Работа - 0.82
Транспорт - 0.96
Услуги - 0.82
Хобби и отдых - 0.88

ROC_AUC = 0.86


### Преобразование признаков для LGBMClassifier

Признак `price` передадим в модель без заполнения пропусков (`passthrough`), так как LightGBM умеет работать с пропущенными значениями

In [32]:
%%time

# Feature extraction for the gradient-boosting model. Compared with the
# linear baseline: no one-hot level is dropped, `price` is passed through
# with its missing values intact, and the TF-IDF vocabularies are not capped.
y_train = train['is_bad']
y_test = val['is_bad']

# Settings shared by both text vectorizers: unigrams and bigrams that occur
# in at least 1000 documents.
tfidf_kwargs = dict(min_df=1000, ngram_range=(1, 2))

pipeline = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'),
     ['subcategory', 'category', 'region', 'city']),
    ('passthrough', ['price']),
    # Binary flag: does the description match the contact regex?
    (FindRegex(pattern=CONTACT), 'description'),
    (TfidfVectorizer(**tfidf_kwargs), 'description'),
    (TfidfVectorizer(**tfidf_kwargs), 'title'),
)

# Fit on the training split only, then apply the fitted transformers to the
# validation split.
X_train = pipeline.fit_transform(train.drop(columns='is_bad'))
X_test = pipeline.transform(val.drop(columns='is_bad'))

print(f'X_train: {X_train.shape}, X_test: {X_test.shape}')

X_train: (984487, 14170), X_test: (16237, 14170)
CPU times: user 7min 39s, sys: 1min 32s, total: 9min 12s
Wall time: 10min 3s


In [33]:
# sparse.save_npz("X_train_cat.npz", X_train)
# sparse.save_npz("X_test_cat.npz", X_test)

### LGBMClassifier

In [34]:
# Gradient-boosting baseline: up to 500 trees, AUC as the tracked metric and
# LightGBM's built-in handling of the class imbalance.
lgb_params = dict(
    n_estimators=500,
    metric='auc',
    is_unbalance=True,
    random_state=RANDOM_STATE,
)
baseline_lgb = LGBMClassifier(**lgb_params)

In [35]:
%%time
baseline_lgb.fit(X_train,
                 y_train,
                 eval_set=[(X_train, y_train), (X_test, y_test)],
                 eval_metric='auc',
                 verbose=10,
                 early_stopping_rounds=40)

Training until validation scores don't improve for 40 rounds
[10]	training's auc: 0.928304	valid_1's auc: 0.904846
[20]	training's auc: 0.94	valid_1's auc: 0.920048
[30]	training's auc: 0.946664	valid_1's auc: 0.931423
[40]	training's auc: 0.951048	valid_1's auc: 0.937492
[50]	training's auc: 0.95413	valid_1's auc: 0.940693
[60]	training's auc: 0.956623	valid_1's auc: 0.943143
[70]	training's auc: 0.958573	valid_1's auc: 0.945209
[80]	training's auc: 0.960306	valid_1's auc: 0.94656
[90]	training's auc: 0.961611	valid_1's auc: 0.948454
[100]	training's auc: 0.962814	valid_1's auc: 0.949731
[110]	training's auc: 0.963822	valid_1's auc: 0.950572
[120]	training's auc: 0.964682	valid_1's auc: 0.951105
[130]	training's auc: 0.965457	valid_1's auc: 0.952042
[140]	training's auc: 0.966139	valid_1's auc: 0.952937
[150]	training's auc: 0.966822	valid_1's auc: 0.953326
[160]	training's auc: 0.967403	valid_1's auc: 0.953797
[170]	training's auc: 0.967986	valid_1's auc: 0.95446
[180]	training's auc

LGBMClassifier(is_unbalance=True, metric='auc', n_estimators=500,
               random_state=42)

In [36]:
# with open('lgbm_train_pickle.pkl','wb') as f:                                                                                                                                                                                                                                                        
#     pickle.dump(baseline_lgb, f) 

### Результаты

Посмотрим на значения метрики в каждой из категорий, а также на среднее по категориям значение `ROC AUC`

In [37]:
# Per-category ROC AUC of the LightGBM model on the validation split,
# followed by the unweighted mean over the scored categories.

# Probability of the positive class, aligned with the validation index.
y_pred = pd.Series(baseline_lgb.predict_proba(X_test)[:, 1], index=y_test.index)
calegories = np.unique(train.category.tolist())
roc_auc_category = {}

for cat in calegories:
    # Index labels of the validation rows that belong to this category.
    idx = val.index[(val.category == cat).to_numpy()]
    # roc_auc_score raises ValueError when y_true is empty or holds a single
    # class, so such categories are reported and left out of the mean.
    if y_test.loc[idx].nunique() < 2:
        print(f'{cat} - n/a')
        continue
    roc_auc = roc_auc_score(y_test.loc[idx], y_pred.loc[idx])
    roc_auc_category[cat] = roc_auc
    print(f'{cat} - {roc_auc:0.2f}')

print(f'\nROC_AUC = {np.mean(list(roc_auc_category.values())):.2f}')

Бытовая электроника - 0.93
Для бизнеса - 0.85
Для дома и дачи - 0.93
Животные - 0.87
Личные вещи - 0.84
Недвижимость - 0.97
Работа - 0.91
Транспорт - 0.98
Услуги - 0.88
Хобби и отдых - 0.93

ROC_AUC = 0.91
