#### Продолжим работу с данными, которые были использованы в ДЗ2 и 3, продолжим решать задачу обнаружения мошеннических транзакций, что позволит получить полное решение задачи / полный пайплайн.

#### Задание 0: выбрать любую модель машнного обучения и зафиксировать любой тип валидации. Обучить базовую модель и зафиксировать базовое качество модели. В каждом следующем задании нужно будет обучить выбранную модель и оценивать ее качество на зафиксированной схеме валидации. После каждого задания, требуется сделать вывод о достигаемом качестве модели, по сравнению с качестом из предыдущего шага.



In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import rankdata
from scipy.stats.mstats import winsorize

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split, KFold, GroupKFold
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score
import catboost as cb
from datetime import timedelta
import datetime as dt




In [2]:
# путь расположения выборки
path_train = '/kaggle/input/assignment/assignment_2_train.csv'
path_leaderboard = '/kaggle/input/assignment/assignment_2_test.csv'

In [3]:
# загрузка данных из файла
train = pd.read_csv(path_train)
test = pd.read_csv(path_leaderboard)

In [4]:
def fit_catboost(x_train, y_train, model_params, categorical, *args):
    """
    Обучение модели CatBoostClassifier.

    Parameters
    ----------
    x_train: pandas.core.frame.DataFrame
        Матрица признаков для обучения модели.

    y_valid: pandas.core.frame.Series
        Вектор целевой переменной для валидации модели.

    model_params: dict
        Словарь со значением гиперпараметров модели.

    categorical: List[str]
        Список с названием категориальных признаков.

    Returns
    -------
    model: catboost.core.CatBoostClassifier
        Экземпляр обученного классификатора.

    """
    eval_set = [(x_train, y_train)]

    if args == 2:
        eval_set.append((args[0], args[1]))

    model = cb.CatBoostClassifier(**model_params)
    model.fit(
        X=x_train,
        y=y_train,
        eval_set=eval_set,
        cat_features=categorical
    )

    return model

def evaluate_model(model, *args):
    """
    Оценка качества модели.

    Parameters
    ----------
    model: catboost.core.CatBoostClassifier
        Экземпляр обученного классификатора.

    args: pandas.core.frame.DataFrame
        Пары из обучающей выборки и вектора истинных ответов.
        Опциональные параметры.

    """
    eval_data = [(args[i], args[i+1]) for i in range(0, len(args), 2)]
    if eval_data:
        for sample, target in eval_data:
            y_pred = model.predict_proba(sample)[:, 1]
            score = roc_auc_score(target, y_pred)
            print(f"score = {round(score, 6)}")

def prepare_data(X, categorical, to_drop):
    """
    Преобразование данных для передачи в модель.

    Parameters
    ----------
    X: pandas.core.frame.DataFrame
        Матрица признаков для передачи в модель.

    categorical: List[str]
        Список с названием категориальных признаков.

    drop_features: List[str]
        Список с названием признаков, которые не должны
        участвовать в обучении.

    Returns
    -------
    X_transformed: pandas.core.frame.DataFrame
        Матрица признаков, подготовленная для передачи в модель.

    """
    X_transformed = X.copy()
    to_drop = set(X.columns) & set(to_drop)

    if to_drop:
        X_transformed = X_transformed.drop(to_drop, axis=1)

    X_transformed[categorical] = X_transformed[categorical].astype(str)
    return X_transformed

In [5]:
to_drop = [
    "TransactionID",
    "TransactionDT",
    "isFraud",
]

categorical = train.select_dtypes(include=["object"]).columns.tolist()

In [6]:
x_train, x_valid = train_test_split(
    train, train_size=0.7, random_state=27, shuffle=True
)
y_train, y_valid = train_test_split(
    train["isFraud"], train_size=0.7, random_state=27, shuffle=True
)

In [7]:
x_train = prepare_data(x_train, categorical=categorical, to_drop=to_drop)
x_valid = prepare_data(x_valid, categorical=categorical, to_drop=to_drop)
x_public_lb = prepare_data(test, categorical=categorical, to_drop=to_drop)

print("x_train.shape = {} rows, {} cols".format(*x_train.shape))
print("x_valid.shape = {} rows, {} cols".format(*x_valid.shape))
print("x_test.shape = {} rows, {} cols".format(*x_public_lb.shape))

x_train.shape = 125999 rows, 391 cols
x_valid.shape = 54001 rows, 391 cols
x_test.shape = 100001 rows, 391 cols


In [8]:
cb_params_10000 = {
    "n_estimators": 1000,
    "learning_rate": 0.01,
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "CPU",
    "max_bin": 20,
    "verbose": 300,
    "max_depth": 6,
    "l2_leaf_reg": 100,
    "early_stopping_rounds": 50,
    "thread_count": 6,
    "random_seed": 27
}

model = fit_catboost(
    x_train, y_train, cb_params_10000, categorical, x_valid, y_valid)

0:	test: 0.6186335	best: 0.6186335 (0)	total: 574ms	remaining: 9m 33s
300:	test: 0.8668141	best: 0.8668141 (300)	total: 1m 24s	remaining: 3m 16s
600:	test: 0.8824756	best: 0.8824756 (600)	total: 2m 49s	remaining: 1m 52s
900:	test: 0.8899360	best: 0.8899360 (900)	total: 4m 24s	remaining: 29s
999:	test: 0.8914724	best: 0.8914724 (999)	total: 4m 51s	remaining: 0us

bestTest = 0.8914723581
bestIteration = 999



#### Задание 1: признак TransactionDT - это смещение в секундах относительно базовой даты. Базовая дата - 2017-12-01, преобразовать признак TransactionDT в datetime, прибавив к базовой дате исходное значение признака. Из полученного признака выделить год, месяц, день недели, час, день.

In [9]:
train.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
train['DT_diff'] = train['TransactionDT'] - 86400

In [11]:
train.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V331,V332,V333,V334,V335,V336,V337,V338,V339,DT_diff
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,0
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,1
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,69
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,99
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,106


In [12]:
>>> from datetime import datetime

train['datetime'] = datetime.fromisoformat('2017-12-01T00:00:00')

In [13]:
train.tail()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V332,V333,V334,V335,V336,V337,V338,V339,DT_diff,datetime
179995,3166995,0,3958217,39.0,W,1877,310.0,150.0,mastercard,224.0,...,,,,,,,,,3871817,2017-12-01
179996,3166996,0,3958237,59.95,W,10075,514.0,150.0,mastercard,224.0,...,,,,,,,,,3871837,2017-12-01
179997,3166997,0,3958241,34.0,W,6053,122.0,150.0,mastercard,195.0,...,,,,,,,,,3871841,2017-12-01
179998,3166998,0,3958260,59.0,W,7726,555.0,150.0,visa,226.0,...,,,,,,,,,3871860,2017-12-01
179999,3166999,0,3958317,226.0,W,17480,528.0,150.0,visa,226.0,...,,,,,,,,,3871917,2017-12-01


In [14]:
start = train['datetime'][0]
start

Timestamp('2017-12-01 00:00:00')

In [15]:
train['datetime'] = [start + timedelta(seconds=s) for s in train.DT_diff]

In [16]:
train.tail()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V332,V333,V334,V335,V336,V337,V338,V339,DT_diff,datetime
179995,3166995,0,3958217,39.0,W,1877,310.0,150.0,mastercard,224.0,...,,,,,,,,,3871817,2018-01-14 19:30:17
179996,3166996,0,3958237,59.95,W,10075,514.0,150.0,mastercard,224.0,...,,,,,,,,,3871837,2018-01-14 19:30:37
179997,3166997,0,3958241,34.0,W,6053,122.0,150.0,mastercard,195.0,...,,,,,,,,,3871841,2018-01-14 19:30:41
179998,3166998,0,3958260,59.0,W,7726,555.0,150.0,visa,226.0,...,,,,,,,,,3871860,2018-01-14 19:31:00
179999,3166999,0,3958317,226.0,W,17480,528.0,150.0,visa,226.0,...,,,,,,,,,3871917,2018-01-14 19:31:57


In [17]:
train = train.drop(['TransactionDT', 'DT_diff'], axis=1)


In [18]:
test.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,3287000,1,7415038,226.0,W,12473,555.0,150.0,visa,226.0,...,,,,,,,,,,
1,3287001,0,7415054,3072.0,W,15651,417.0,150.0,visa,226.0,...,,,,,,,,,,
2,3287002,0,7415081,319.95,W,13844,583.0,150.0,visa,226.0,...,,,,,,,,,,
3,3287003,0,7415111,171.0,W,11556,309.0,150.0,visa,226.0,...,,,,,,,,,,
4,3287004,0,7415112,107.95,W,10985,555.0,150.0,visa,226.0,...,,,,,,,,,,


In [19]:
test['DT_diff'] = test['TransactionDT'] - 86400
test['datetime'] = datetime.fromisoformat('2017-12-01T00:00:00')
test['datetime'] = [start + timedelta(seconds=s) for s in test.DT_diff]
test = test.drop(['TransactionDT', 'DT_diff'], axis=1)

In [20]:
train['year'] = train['datetime'].dt.year
train['month'] = train['datetime'].dt.month
train['dayofyear'] = train['datetime'].dt.dayofyear
train['week'] = train['datetime'].dt.isocalendar().week
train['dayofweek'] = train['datetime'].dt.dayofweek
train['hour'] = train['datetime'].dt.hour
train.drop( columns = 'datetime', inplace = True )

In [21]:
train

Unnamed: 0,TransactionID,isFraud,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,V336,V337,V338,V339,year,month,dayofyear,week,dayofweek,hour
0,2987000,0,68.50,W,13926,,150.0,discover,142.0,credit,...,,,,,2017,12,335,48,4,0
1,2987001,0,29.00,W,2755,404.0,150.0,mastercard,102.0,credit,...,,,,,2017,12,335,48,4,0
2,2987002,0,59.00,W,4663,490.0,150.0,visa,166.0,debit,...,,,,,2017,12,335,48,4,0
3,2987003,0,50.00,W,18132,567.0,150.0,mastercard,117.0,debit,...,,,,,2017,12,335,48,4,0
4,2987004,0,50.00,H,4497,514.0,150.0,mastercard,102.0,credit,...,0.0,0.0,0.0,0.0,2017,12,335,48,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179995,3166995,0,39.00,W,1877,310.0,150.0,mastercard,224.0,debit,...,,,,,2018,1,14,2,6,19
179996,3166996,0,59.95,W,10075,514.0,150.0,mastercard,224.0,debit,...,,,,,2018,1,14,2,6,19
179997,3166997,0,34.00,W,6053,122.0,150.0,mastercard,195.0,debit,...,,,,,2018,1,14,2,6,19
179998,3166998,0,59.00,W,7726,555.0,150.0,visa,226.0,debit,...,,,,,2018,1,14,2,6,19


In [22]:
test['year'] = test['datetime'].dt.year
test['month'] = test['datetime'].dt.month
test['dayofyear'] = test['datetime'].dt.dayofyear
test['week'] = test['datetime'].dt.isocalendar().week
test['dayofweek'] = test['datetime'].dt.dayofweek
test['hour'] = test['datetime'].dt.hour
test.drop( columns = 'datetime', inplace = True )

In [23]:
test

Unnamed: 0,TransactionID,isFraud,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,V336,V337,V338,V339,year,month,dayofyear,week,dayofweek,hour
0,3287000,1,226.000,W,12473,555.0,150.0,visa,226.0,credit,...,,,,,2018,2,54,8,4,19
1,3287001,0,3072.000,W,15651,417.0,150.0,visa,226.0,debit,...,,,,,2018,2,54,8,4,19
2,3287002,0,319.950,W,13844,583.0,150.0,visa,226.0,credit,...,,,,,2018,2,54,8,4,19
3,3287003,0,171.000,W,11556,309.0,150.0,visa,226.0,debit,...,,,,,2018,2,54,8,4,19
4,3287004,0,107.950,W,10985,555.0,150.0,visa,226.0,debit,...,,,,,2018,2,54,8,4,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99996,3386996,0,368.990,W,13964,496.0,150.0,mastercard,224.0,debit,...,,,,,2018,3,85,13,0,19
99997,3386997,0,445.330,W,10616,583.0,150.0,visa,226.0,credit,...,,,,,2018,3,85,13,0,19
99998,3386998,0,15.226,C,9803,583.0,150.0,visa,226.0,credit,...,,,,,2018,3,85,13,0,19
99999,3386999,0,34.742,C,16062,500.0,185.0,mastercard,137.0,credit,...,,,,,2018,3,85,13,0,19


In [24]:
train['week'] = train['week'].astype('int64')
test['week'] = test['week'].astype('int64')

In [25]:
to_drop = [
    "TransactionID",
    "isFraud",
]

categorical = train.select_dtypes(include=["object"]).columns.tolist()

x_train, x_valid = train_test_split(
    train, train_size=0.7, random_state=27, shuffle=True
)
y_train, y_valid = train_test_split(
    train["isFraud"], train_size=0.7, random_state=27, shuffle=True
)


x_train = prepare_data(x_train, categorical=categorical, to_drop=to_drop)
x_valid = prepare_data(x_valid, categorical=categorical, to_drop=to_drop)
x_public_lb = prepare_data(test, categorical=categorical, to_drop=to_drop)

print("x_train.shape = {} rows, {} cols".format(*x_train.shape))
print("x_valid.shape = {} rows, {} cols".format(*x_valid.shape))
print("x_test.shape = {} rows, {} cols".format(*x_public_lb.shape))

cb_params_1000 = {
    "n_estimators": 1000,
    "learning_rate": 0.01,
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "CPU",
    "max_bin": 20,
    "verbose": 300,
    "max_depth": 6,
    "l2_leaf_reg": 100,
    "early_stopping_rounds": 50,
    "thread_count": 6,
    "random_seed": 27
}

model = fit_catboost(
    x_train, y_train, cb_params_1000, categorical, x_valid, y_valid)

x_train.shape = 125999 rows, 397 cols
x_valid.shape = 54001 rows, 397 cols
x_test.shape = 100001 rows, 397 cols
0:	test: 0.6883632	best: 0.6883632 (0)	total: 461ms	remaining: 7m 40s
300:	test: 0.8717701	best: 0.8717701 (300)	total: 1m 21s	remaining: 3m 8s
600:	test: 0.8880872	best: 0.8880872 (600)	total: 2m 40s	remaining: 1m 46s
900:	test: 0.8965785	best: 0.8965785 (900)	total: 3m 56s	remaining: 26s
999:	test: 0.8983305	best: 0.8983305 (999)	total: 4m 21s	remaining: 0us

bestTest = 0.898330468
bestIteration = 999



1. Модель: bestTest = 0.8914723581
2. Модель: bestTest = 0.898330468

Мы видим заметное улучшение результата - 0.00685810990000002

#### Задание 2: сделать конкатенацию признаков
#### * card1 + card2;
#### * card1 + card2 + card_3 + card_5;
#### * card1 + card2 + card_3 + card_5 + addr1 + addr2

#### Рассматривать их как категориальных признаки.


In [26]:
train['card1'].fillna(0, inplace=True)
train['card2'].fillna(0, inplace=True)
train['card3'].fillna(0, inplace=True)
train['card5'].fillna(0, inplace=True)
train['addr1'].fillna(0, inplace=True)
train['addr2'].fillna(0, inplace=True)

In [27]:
features = ['card1', 'card2', 'card3', 'card5', 'addr1', 'addr2']

In [28]:
for i in features:
    train[i] = train[i].astype('object')

In [29]:
for i in features:
    test[i] = test[i].astype('object')

In [30]:
train['card1']

0         13926
1          2755
2          4663
3         18132
4          4497
          ...  
179995     1877
179996    10075
179997     6053
179998     7726
179999    17480
Name: card1, Length: 180000, dtype: object

In [31]:
train['card1+card2'] = train['card1'] + train['card2']
train['card1+card2']

0         13926
1          3159
2          5153
3         18699
4          5011
          ...  
179995     2187
179996    10589
179997     6175
179998     8281
179999    18008
Name: card1+card2, Length: 180000, dtype: object

In [32]:
train['card1+card2+card3+card5'] = train['card1'] + train['card2'] + train['card3'] + train['card5']
train['card1+card2+card3+card5']

0         14218
1          3411
2          5469
3         18966
4          5263
          ...  
179995     2561
179996    10963
179997     6520
179998     8657
179999    18384
Name: card1+card2+card3+card5, Length: 180000, dtype: object

In [33]:
train['card1-5+addr1-2'] = train['card1'] + train['card2'] + train['card3'] + train['card5'] + train['addr1'] + train['addr2']
train['card1-5+addr1-2']

0         14620
1          3823
2          5886
3         19529
4          5770
          ...  
179995     2920
179996    11365
179997     6937
179998     9016
179999    18594
Name: card1-5+addr1-2, Length: 180000, dtype: object

In [34]:
test['card1'].fillna(0, inplace=True)
test['card2'].fillna(0, inplace=True)
test['card3'].fillna(0, inplace=True)
test['card5'].fillna(0, inplace=True)
test['addr1'].fillna(0, inplace=True)
test['addr2'].fillna(0, inplace=True)

In [35]:
test['card1+card2'] = test['card1'] + test['card2']
test['card1+card2+card3+card5'] = test['card1'] + test['card2'] + test['card3'] + test['card5']
test['card1-5+addr1-2'] = test['card1'] + test['card2'] + test['card3'] + test['card5'] + test['addr1'] + test['addr2']


In [36]:
x_train, x_valid = train_test_split(
    train, train_size=0.7, random_state=27, shuffle=True
)
y_train, y_valid = train_test_split(
    train["isFraud"], train_size=0.7, random_state=27, shuffle=True
)


x_train = prepare_data(x_train, categorical=categorical, to_drop=to_drop)
x_valid = prepare_data(x_valid, categorical=categorical, to_drop=to_drop)
x_public_lb = prepare_data(test, categorical=categorical, to_drop=to_drop)

print("x_train.shape = {} rows, {} cols".format(*x_train.shape))
print("x_valid.shape = {} rows, {} cols".format(*x_valid.shape))
print("x_test.shape = {} rows, {} cols".format(*x_public_lb.shape))


model = fit_catboost(
    x_train, y_train, cb_params_10000, categorical, x_valid, y_valid)

x_train.shape = 125999 rows, 400 cols
x_valid.shape = 54001 rows, 400 cols
x_test.shape = 100001 rows, 400 cols
0:	test: 0.7043478	best: 0.7043478 (0)	total: 260ms	remaining: 4m 19s
300:	test: 0.8728462	best: 0.8728462 (300)	total: 1m 22s	remaining: 3m 10s
600:	test: 0.8885513	best: 0.8885513 (600)	total: 2m 51s	remaining: 1m 53s
900:	test: 0.8960178	best: 0.8960178 (900)	total: 4m 19s	remaining: 28.5s
999:	test: 0.8974693	best: 0.8974693 (999)	total: 4m 48s	remaining: 0us

bestTest = 0.8974692709
bestIteration = 999



1. Модель: bestTest = 0.8914723581
2. Модель: bestTest = 0.898330468
3. Модель: bestTest = 0.8974692709

По сравнению со второй моделью качество немного упало. При этом, на итерации 600 качество 3 модели было выше, но затем модель отстает от второй. 

#### Задание 3: Сделать FrequencyEncoder для признаков card1 - card6, addr1, addr2.


In [37]:
freq_encoder_card1 = train["card1"].value_counts(normalize=True)
freq_encoder_card2 = train["card2"].value_counts(normalize=True)
freq_encoder_card3 = train["card3"].value_counts(normalize=True)
freq_encoder_card4 = train["card4"].value_counts(normalize=True)
freq_encoder_card5 = train["card5"].value_counts(normalize=True)
freq_encoder_card6 = train["card6"].value_counts(normalize=True)
freq_encoder_addr1 = train["addr1"].value_counts(normalize=True)
freq_encoder_addr2 = train["addr2"].value_counts(normalize=True)

train["card1_freq_enc"] = train["card1"].map(freq_encoder_card1)
train["card2_freq_enc"] = train["card2"].map(freq_encoder_card2)
train["card3_freq_enc"] = train["card3"].map(freq_encoder_card3)
train["card4_freq_enc"] = train["card4"].map(freq_encoder_card4)
train["card5_freq_enc"] = train["card5"].map(freq_encoder_card5)
train["card6_freq_enc"] = train["card6"].map(freq_encoder_card6)
train["addr1_freq_enc"] = train["addr1"].map(freq_encoder_addr1)
train["addr2_freq_enc"] = train["addr2"].map(freq_encoder_addr2)


In [38]:
freq_encoder_card1 = test["card1"].value_counts(normalize=True)
freq_encoder_card2 = test["card2"].value_counts(normalize=True)
freq_encoder_card3 = test["card3"].value_counts(normalize=True)
freq_encoder_card4 = test["card4"].value_counts(normalize=True)
freq_encoder_card5 = test["card5"].value_counts(normalize=True)
freq_encoder_card6 = test["card6"].value_counts(normalize=True)
freq_encoder_addr1 = test["addr1"].value_counts(normalize=True)
freq_encoder_addr2 = test["addr2"].value_counts(normalize=True)

test["card1_freq_enc"] = test["card1"].map(freq_encoder_card1)
test["card2_freq_enc"] = test["card2"].map(freq_encoder_card2)
test["card3_freq_enc"] = test["card3"].map(freq_encoder_card3)
test["card4_freq_enc"] = test["card4"].map(freq_encoder_card4)
test["card5_freq_enc"] = test["card5"].map(freq_encoder_card5)
test["card6_freq_enc"] = test["card6"].map(freq_encoder_card6)
test["addr1_freq_enc"] = test["addr1"].map(freq_encoder_addr1)
test["addr2_freq_enc"] = test["addr2"].map(freq_encoder_addr2)

In [39]:
x_train, x_valid = train_test_split(
    train, train_size=0.7, random_state=27, shuffle=True
)
y_train, y_valid = train_test_split(
    train["isFraud"], train_size=0.7, random_state=27, shuffle=True
)


x_train = prepare_data(x_train, categorical=categorical, to_drop=to_drop)
x_valid = prepare_data(x_valid, categorical=categorical, to_drop=to_drop)
x_public_lb = prepare_data(test, categorical=categorical, to_drop=to_drop)

print("x_train.shape = {} rows, {} cols".format(*x_train.shape))
print("x_valid.shape = {} rows, {} cols".format(*x_valid.shape))
print("x_test.shape = {} rows, {} cols".format(*x_public_lb.shape))


model = fit_catboost(
    x_train, y_train, cb_params_10000, categorical, x_valid, y_valid)

x_train.shape = 125999 rows, 408 cols
x_valid.shape = 54001 rows, 408 cols
x_test.shape = 100001 rows, 408 cols
0:	test: 0.6529649	best: 0.6529649 (0)	total: 301ms	remaining: 5m
300:	test: 0.8732310	best: 0.8732310 (300)	total: 1m 31s	remaining: 3m 32s
600:	test: 0.8891403	best: 0.8891403 (600)	total: 2m 57s	remaining: 1m 57s
900:	test: 0.8981187	best: 0.8981187 (900)	total: 4m 38s	remaining: 30.6s
999:	test: 0.9003870	best: 0.9003870 (999)	total: 5m 12s	remaining: 0us

bestTest = 0.9003869641
bestIteration = 999



1. Модель: bestTest = 0.8914723581
2. Модель: bestTest = 0.898330468
3. Модель: bestTest = 0.8974692709
4. Модель: bestTest = 0.9003869641

Модель показала лучший результат. 

#### Задание 4: Создать признаки на основе отношения: TransactionAmt к вычисленной статистике. Статистика - среднее значение / стандартное отклонение TransactionAmt, сгруппированное по card1 - card6, addr1, addr2, и по признакам, созданным в задании 2.

In [40]:
features = ['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'card1+card2', 'card1+card2+card3+card5', 'card1-5+addr1-2']

for i in features:
    mean = train.groupby(i)['TransactionAmt'].transform('mean')
    std = train.groupby(i)['TransactionAmt'].transform('std')
    train[f'mean_{i}'] = mean
    train[f'std_{i}'] = std

In [41]:
for i in features:
    mean = test.groupby(i)['TransactionAmt'].transform('mean')
    std = test.groupby(i)['TransactionAmt'].transform('std')
    test[f'mean_{i}'] = mean
    test[f'std_{i}'] = std

In [42]:
x_train, x_valid = train_test_split(
    train, train_size=0.7, random_state=27, shuffle=True
)
y_train, y_valid = train_test_split(
    train["isFraud"], train_size=0.7, random_state=27, shuffle=True
)


x_train = prepare_data(x_train, categorical=categorical, to_drop=to_drop)
x_valid = prepare_data(x_valid, categorical=categorical, to_drop=to_drop)
x_public_lb = prepare_data(test, categorical=categorical, to_drop=to_drop)

print("x_train.shape = {} rows, {} cols".format(*x_train.shape))
print("x_valid.shape = {} rows, {} cols".format(*x_valid.shape))
print("x_test.shape = {} rows, {} cols".format(*x_public_lb.shape))


model = fit_catboost(
    x_train, y_train, cb_params_10000, categorical, x_valid, y_valid)

x_train.shape = 125999 rows, 430 cols
x_valid.shape = 54001 rows, 430 cols
x_test.shape = 100001 rows, 430 cols
0:	test: 0.6933407	best: 0.6933407 (0)	total: 308ms	remaining: 5m 7s
300:	test: 0.8745977	best: 0.8745977 (300)	total: 1m 31s	remaining: 3m 33s
600:	test: 0.8887456	best: 0.8887456 (600)	total: 3m 3s	remaining: 2m 1s
900:	test: 0.8984509	best: 0.8984509 (900)	total: 4m 40s	remaining: 30.8s
999:	test: 0.9000043	best: 0.9000043 (999)	total: 5m 8s	remaining: 0us

bestTest = 0.900004265
bestIteration = 999



1. Модель: bestTest = 0.8914723581
2. Модель: bestTest = 0.898330468
3. Модель: bestTest = 0.8974692709
4. Модель: bestTest = 0.9003869641
5. Модель: bestTest = 0.900004265

Разницы с предыдущей моделью практически нет.  

#### Задание 5: Создать признаки на основе отношения: D15 к вычисленной статистике. Статистика - среднее значение / стандартное отклонение D15, сгруппированное по card1 - card6, addr1, addr2, и по признакам, созданным в задании 2.

In [43]:
for i in features:
    mean = train.groupby(i)['D15'].transform('mean')
    std = train.groupby(i)['D15'].transform('std')
    train[f'D15_mean_{i}'] = mean
    train[f'D15_std_{i}'] = std

In [44]:
for i in features:
    mean = test.groupby(i)['D15'].transform('mean')
    std = test.groupby(i)['D15'].transform('std')
    test[f'D15_mean_{i}'] = mean
    test[f'D15_std_{i}'] = std

In [45]:
x_train, x_valid = train_test_split(
    train, train_size=0.7, random_state=27, shuffle=True
)
y_train, y_valid = train_test_split(
    train["isFraud"], train_size=0.7, random_state=27, shuffle=True
)


x_train = prepare_data(x_train, categorical=categorical, to_drop=to_drop)
x_valid = prepare_data(x_valid, categorical=categorical, to_drop=to_drop)
x_public_lb = prepare_data(test, categorical=categorical, to_drop=to_drop)

print("x_train.shape = {} rows, {} cols".format(*x_train.shape))
print("x_valid.shape = {} rows, {} cols".format(*x_valid.shape))
print("x_test.shape = {} rows, {} cols".format(*x_public_lb.shape))


model = fit_catboost(
    x_train, y_train, cb_params_10000, categorical, x_valid, y_valid)

x_train.shape = 125999 rows, 452 cols
x_valid.shape = 54001 rows, 452 cols
x_test.shape = 100001 rows, 452 cols
0:	test: 0.5907391	best: 0.5907391 (0)	total: 634ms	remaining: 10m 33s
300:	test: 0.8730885	best: 0.8730885 (300)	total: 1m 38s	remaining: 3m 49s
600:	test: 0.8888942	best: 0.8888942 (600)	total: 3m 7s	remaining: 2m 4s
900:	test: 0.8981652	best: 0.8981652 (900)	total: 4m 32s	remaining: 29.9s
999:	test: 0.9005466	best: 0.9005466 (999)	total: 4m 59s	remaining: 0us

bestTest = 0.9005465653
bestIteration = 999



1. Модель: bestTest = 0.8914723581
2. Модель: bestTest = 0.898330468
3. Модель: bestTest = 0.8974692709
4. Модель: bestTest = 0.9003869641
5. Модель: bestTest = 0.900004265
6. Модель: bestTest = 0.9005465653


Минимальное улучшение качества в сравнении с 4 моделью.

#### Задание 6: выделить дробную часть и целую часть признака TransactionAmt в два отдельных признака. После создать отдельных признак - логарифм от TransactionAmt

In [46]:
import math

train['TransactionAmt_int'] = [math.modf(s)[1] for s in train.TransactionAmt]
train['TransactionAmt_float'] = [math.modf(s)[0] for s in train.TransactionAmt]

In [47]:
test['TransactionAmt_int'] = [math.modf(s)[1] for s in test.TransactionAmt]
test['TransactionAmt_float'] = [math.modf(s)[0] for s in test.TransactionAmt]

In [48]:
train['TransactionAmt_log'] = [math.log(s) for s in train.TransactionAmt]
test['TransactionAmt_log'] = [math.log(s) for s in test.TransactionAmt]


In [49]:
x_train, x_valid = train_test_split(
    train, train_size=0.7, random_state=27, shuffle=True
)
y_train, y_valid = train_test_split(
    train["isFraud"], train_size=0.7, random_state=27, shuffle=True
)


x_train = prepare_data(x_train, categorical=categorical, to_drop=to_drop)
x_valid = prepare_data(x_valid, categorical=categorical, to_drop=to_drop)
x_public_lb = prepare_data(test, categorical=categorical, to_drop=to_drop)

print("x_train.shape = {} rows, {} cols".format(*x_train.shape))
print("x_valid.shape = {} rows, {} cols".format(*x_valid.shape))
print("x_test.shape = {} rows, {} cols".format(*x_public_lb.shape))


model = fit_catboost(
    x_train, y_train, cb_params_10000, categorical, x_valid, y_valid)

x_train.shape = 125999 rows, 455 cols
x_valid.shape = 54001 rows, 455 cols
x_test.shape = 100001 rows, 455 cols
0:	test: 0.7000944	best: 0.7000944 (0)	total: 334ms	remaining: 5m 33s
300:	test: 0.8722545	best: 0.8722545 (300)	total: 1m 30s	remaining: 3m 29s
600:	test: 0.8890435	best: 0.8890435 (600)	total: 3m 1s	remaining: 2m
900:	test: 0.8979675	best: 0.8979675 (900)	total: 4m 29s	remaining: 29.6s
999:	test: 0.8993881	best: 0.8993881 (999)	total: 4m 57s	remaining: 0us

bestTest = 0.8993880512
bestIteration = 999



1. Модель: bestTest = 0.8914723581
2. Модель: bestTest = 0.898330468
3. Модель: bestTest = 0.8974692709
4. Модель: bestTest = 0.9003869641
5. Модель: bestTest = 0.900004265
6. Модель: bestTest = 0.9005465653
7. Модель: bestTest = 0.8993880512

Лучшее качество в итоге показала модель 6. 

#### Задание 7 (опция): выполнить предварительную подготовку / очистку признаков P_emaildomain и R_emaildomain (что и как делать - остается на ваше усмотрение) и сделать Frequency Encoding для очищенных признаков.

In [50]:
trainemail = train.loc[train['P_emaildomain'] != train['R_emaildomain']]
trainemail[['P_emaildomain', 'R_emaildomain']].head(50)

Unnamed: 0,P_emaildomain,R_emaildomain
0,,
1,gmail.com,
2,outlook.com,
3,yahoo.com,
4,gmail.com,
5,gmail.com,
6,yahoo.com,
7,mail.com,
8,anonymous.com,
9,yahoo.com,


Во втором признаке отсутствует много значений. При этом, там, где они есть, они всегда совпадают. Заменим отсутствующие в P_emaildomain значения на R_emaildomain и избавимся от дублирующей колонки

In [51]:
train['P_emaildomain'].fillna(train['R_emaildomain'], inplace=True)
train['P_emaildomain'].fillna(train['P_emaildomain'].mode(), inplace=True)
train = train.drop(['R_emaildomain'], axis=1)


In [52]:
test['P_emaildomain'].fillna(test['R_emaildomain'], inplace=True)
test['P_emaildomain'].fillna(test['P_emaildomain'].mode(), inplace=True)
test = test.drop(['R_emaildomain'], axis=1)

In [54]:
categorical = train.select_dtypes(include=["object"]).columns.tolist()

x_train, x_valid = train_test_split(
    train, train_size=0.7, random_state=27, shuffle=True
)
y_train, y_valid = train_test_split(
    train["isFraud"], train_size=0.7, random_state=27, shuffle=True
)


x_train = prepare_data(x_train, categorical=categorical, to_drop=to_drop)
x_valid = prepare_data(x_valid, categorical=categorical, to_drop=to_drop)
x_public_lb = prepare_data(test, categorical=categorical, to_drop=to_drop)

print("x_train.shape = {} rows, {} cols".format(*x_train.shape))
print("x_valid.shape = {} rows, {} cols".format(*x_valid.shape))
print("x_test.shape = {} rows, {} cols".format(*x_public_lb.shape))


model = fit_catboost(
    x_train, y_train, cb_params_10000, categorical, x_valid, y_valid)

x_train.shape = 125999 rows, 454 cols
x_valid.shape = 54001 rows, 454 cols
x_test.shape = 100001 rows, 454 cols
0:	test: 0.5000207	best: 0.5000207 (0)	total: 911ms	remaining: 15m 10s
300:	test: 0.9558594	best: 0.9558594 (300)	total: 2m 1s	remaining: 4m 43s
600:	test: 0.9685553	best: 0.9685553 (600)	total: 3m 56s	remaining: 2m 37s
900:	test: 0.9713951	best: 0.9713951 (900)	total: 5m 45s	remaining: 38s
999:	test: 0.9722143	best: 0.9722143 (999)	total: 6m 21s	remaining: 0us

bestTest = 0.9722143209
bestIteration = 999



1. Модель: bestTest = 0.8914723581
2. Модель: bestTest = 0.898330468
3. Модель: bestTest = 0.8974692709
4. Модель: bestTest = 0.9003869641
5. Модель: bestTest = 0.900004265
6. Модель: bestTest = 0.9005465653
7. Модель: bestTest = 0.8993880512
8. Модель: bestTest = 0.9722143209

Качество последней модели намного превышает все старые. Наверное за счет удаления полупустого, дублирующего другой признака. 

In [55]:
freq_encoder_email = train["P_emaildomain"].value_counts(normalize=True)
train["email_freq_enc"] = train["P_emaildomain"].map(freq_encoder_email)
freq_encoder_email = test["P_emaildomain"].value_counts(normalize=True)
test["email_freq_enc"] = test["P_emaildomain"].map(freq_encoder_email)

In [56]:
categorical = train.select_dtypes(include=["object"]).columns.tolist()

x_train, x_valid = train_test_split(
    train, train_size=0.7, random_state=27, shuffle=True
)
y_train, y_valid = train_test_split(
    train["isFraud"], train_size=0.7, random_state=27, shuffle=True
)


x_train = prepare_data(x_train, categorical=categorical, to_drop=to_drop)
x_valid = prepare_data(x_valid, categorical=categorical, to_drop=to_drop)
x_public_lb = prepare_data(test, categorical=categorical, to_drop=to_drop)

print("x_train.shape = {} rows, {} cols".format(*x_train.shape))
print("x_valid.shape = {} rows, {} cols".format(*x_valid.shape))
print("x_test.shape = {} rows, {} cols".format(*x_public_lb.shape))


model = fit_catboost(
    x_train, y_train, cb_params_10000, categorical, x_valid, y_valid)

x_train.shape = 125999 rows, 455 cols
x_valid.shape = 54001 rows, 455 cols
x_test.shape = 100001 rows, 455 cols
0:	test: 0.6621365	best: 0.6621365 (0)	total: 808ms	remaining: 13m 27s
300:	test: 0.9568327	best: 0.9568327 (300)	total: 1m 52s	remaining: 4m 21s
600:	test: 0.9658157	best: 0.9658157 (600)	total: 3m 46s	remaining: 2m 30s
900:	test: 0.9710523	best: 0.9710523 (900)	total: 5m 36s	remaining: 36.9s
999:	test: 0.9721028	best: 0.9721028 (999)	total: 6m 14s	remaining: 0us

bestTest = 0.9721027737
bestIteration = 999



In [None]:
1. Модель: bestTest = 0.8914723581
2. Модель: bestTest = 0.898330468
3. Модель: bestTest = 0.8974692709
4. Модель: bestTest = 0.9003869641
5. Модель: bestTest = 0.900004265
6. Модель: bestTest = 0.9005465653
7. Модель: bestTest = 0.8993880512
8. Модель: bestTest = 0.9722143209
9. Модель: bestTest = 0.9721027737

    Модель с добавлением FreqEncodera немного хуже, чем без нее