In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import random

from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from category_encoders.one_hot import OneHotEncoder 

from tqdm import tqdm

In [None]:
!mkdir ~/.kaggle
!touch ~/.kaggle/kaggle.json

api_token = {"username":"",
             "key":""}

import json

with open('/root/.kaggle/kaggle.json', 'w') as file:
    json.dump(api_token, file)

!chmod 600 ~/.kaggle/kaggle.json

import kaggle

!kaggle competitions download -c ieee-fraud-detection

!unzip ieee-fraud-detection

In [None]:
# Training tables: the transaction table (it carries the isFraud target) and
# the identity table.
train_transaction = pd.read_csv('train_transaction.csv')
train_identity = pd.read_csv('train_identity.csv')
# The target is kept as a string label from here on.
train_transaction['isFraud'] = train_transaction['isFraud'].astype(str)

# Test tables.
test_transaction = pd.read_csv('test_transaction.csv')
test_identity = pd.read_csv('test_identity.csv')

In [None]:
# Train: features are the transaction table (without the target) enriched with
# the identity columns. A left join keeps the transaction rows in their
# original order, which is what the positional pairing with y_train relies on;
# an outer join may re-sort the keys and add identity-only rows.
X_train = train_transaction.drop('isFraud', axis=1)
X_train = X_train.merge(train_identity, how='left', on='TransactionID')

y_train = train_transaction['isFraud']

# Fails loudly if the join duplicated rows (e.g. repeated TransactionID in the
# identity table), because features and target would no longer line up.
assert len(X_train) == len(y_train), 'X_train and y_train are misaligned'

# Test: same join. The row order must match test_transaction, whose
# TransactionID column is used to build the submission.
X_test = test_transaction.merge(test_identity, how='left', on='TransactionID')

In [None]:
# del train_transaction, train_identity, test_transaction, test_identity

# Описание плана
* Сделать предобработку признаков, оставить только самые полезные
* Для линейных моделей выкидываем коррелирующие признаки
* Бейзлайн - предсказываем мажоритарный класс
* Обучить логистическую регрессию с L1-регуляризацией и посмотреть, у каких признаков нулевые веса
* Накладывать на линейную функцию после зануления весов более сложные конструкции: признаки могут выстрелить в бустинге, но не могут выстрелить в регрессии
* Взять подвыборку 70к и понизить размерность до 2 с помощью t-SNE/PCA, сделать scatter plot и выделить на нём фрод / не фрод — это покажет возможность классификации


## Признаки

На основе EDA мы выделили для себя следующие полезные признаки

In [None]:
# Subset of the V* columns kept as features (per the notebook text, selected
# as useful during EDA). Appended to the numeric column list in the next cell.
v_cols = ['V166', 'V77', 'V305', 'V47', 'V240', 
     'V241', 'V120', 'V171', 'V3', 'V56', 
     'V107', 'V260', 'V109', 'V282', 
     'V7', 'V124', 'V46', 'V115', 'V1', 
     'V6', 'V220', 'V283', 'V281', 'V209', 
     'V173', 'V223', 'V78', 'V118', 'V121', 
     'V210', 'V2', 'V174', 'V226', 'V169', 
     'V122', 'V286', 'V55', 'V138', 'V208', 
     'V329', 'V273', 'V42', 'V52', 'V265', 
     'V266', 'V229', 'V276', 'V235', 'V186',
     'V91', 'V234', 'V338', 'V158', 'V326', 
     'V259', 'V246', 'V160', 'V187', 'V303', 
     'V268', 'V89', 'V41', 'V247', 'V195',
     'V325', 'V315', 'V292', 'V272', 'V201', 
     'V163', 'V137', 'V130', 'V113', 'V9', 
     'V87', 'V83', 'V76', 'V67', 'V62', 
     'V54', 'V5', 'V45', 'V38', 'V36', 
     'V301', 'V289', 'V262', 'V26', 'V251', 
     'V24', 'V239', 'V20', 'V188', 'V185', 
     'V175', 'V170', 'V147', 'V142', 
     'V140', 'V13', 'V125', 'V119', 'V116', 'V110']

In [None]:
# Numeric feature names: the hand-picked base columns followed by the
# selected V-columns.
cols_float = [
    'TransactionDT', 'TransactionAmt',
    'addr1', 'addr2',
    'dist1', 'dist2',
    'card1', 'card2', 'card3', 'card5',
    'C1', 'C3', 'C5', 'C9', 'C13',
    'D1', 'D7', 'D8', 'D9', 'D10', 'D11', 'D12',
    *v_cols,
]

# Categorical feature names: product / card / e-mail / device columns plus
# the flags M1 .. M9.
cols_cat = [
    'ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain',
    'DeviceType', 'DeviceInfo',
    *(f'M{i}' for i in range(1, 10)),
]

In [None]:
# Keep only the selected features. The explicit .copy() makes the result an
# independent frame, so the in-place assignments in the following cells write
# to X_train / X_test themselves and do not trigger SettingWithCopyWarning.
X_train = X_train[cols_float + cols_cat].copy()
X_test = X_test[cols_float + cols_cat].copy()

## Заполним пропуски

In [None]:
# Numeric gaps: impute with the mean of the TRAIN column in both frames.
# Using the test mean for the test frame would make the same raw value mean
# different things in train and test, and the scaler below is fitted on train
# only anyway.
for col in cols_float:
    train_mean = X_train[col].mean()
    X_train[col] = X_train[col].fillna(train_mean)
    X_test[col] = X_test[col].fillna(train_mean)

# Categorical gaps: a dedicated 'no_<column>' level marks the missing value.
for col in cols_cat:
    placeholder = 'no_' + col
    X_train[col] = X_train[col].fillna(placeholder)
    X_test[col] = X_test[col].fillna(placeholder)

## Подкинем пару новых признаков

In [None]:
# Split the amount into its whole part and its fractional part (in cents).
# This is done numerically: splitting the string form on '.' maps 10.5 and
# 10.05 to the same "cents" value (5 after numeric conversion) and fails for
# values whose string form has no '.' (e.g. scientific notation).
for frame in (X_train, X_test):
    amount = frame['TransactionAmt']
    whole_part = np.floor(amount)
    # round() removes float noise such as 5.000000000000071
    frame['TransactionAmt_cents'] = ((amount - whole_part) * 100).round(3)
    frame['TransactionAmt_int_sum'] = whole_part

In [None]:
# Register the engineered amount features as numeric columns. The membership
# check keeps cols_float free of duplicates when this cell is re-run.
cols_float.extend([col for col in ['TransactionAmt_cents', 'TransactionAmt_int_sum']
                   if col not in cols_float])

In [None]:
def dttm(date):
    """Break a time offset into calendar-like parts.

    86400 is subtracted from ``date`` first; the shifted value is then split
    with 86400 / 3600 / 60 as the day / hour / minute lengths (so ``date`` is
    presumably a number of seconds).

    Returns a 5-tuple: (week index, day index within the week 0..6,
    hour of the day, minute of the hour, second of the minute).
    """
    shifted = date - 86400
    day_index = shifted // 86400
    within_day = shifted % 86400
    within_hour = within_day % 3600
    return (day_index // 7,
            day_index % 7,
            within_day // 3600,
            within_hour // 60,
            within_hour % 60)


# dttm() uses only arithmetic operators, so it can be called once on the whole
# TransactionDT column: it then returns one Series per component. This replaces
# five row-by-row .apply(dttm) passes per frame with a single vectorised call.
dttm_columns = ['dttm_week', 'dttm_day', 'dttm_hour', 'dttm_minute', 'dttm_sec']

for frame in (X_train, X_test):
    for column_name, values in zip(dttm_columns, dttm(frame['TransactionDT'])):
        frame[column_name] = values

In [None]:
# Register the time features as numeric columns. The membership check keeps
# cols_float free of duplicates when this cell is re-run.
cols_float.extend([col for col in ['dttm_week', 'dttm_day', 'dttm_hour', 'dttm_minute', 'dttm_sec']
                   if col not in cols_float])

In [None]:
# The five most frequent DeviceInfo values in train (the missing-value
# placeholder is excluded from the ranking and then added back explicitly).
known_devices = X_train.loc[X_train['DeviceInfo'] != 'no_DeviceInfo', 'DeviceInfo']
deviceinfo_most_pop = known_devices.value_counts().head(5).index.tolist()
deviceinfo_most_pop.append('no_DeviceInfo')

# Every value outside that short list is collapsed into a single 'other' level.
X_train.loc[:, 'DeviceInfo'] = X_train['DeviceInfo'].where(
    X_train['DeviceInfo'].isin(deviceinfo_most_pop), 'other')
X_test.loc[:, 'DeviceInfo'] = X_test['DeviceInfo'].where(
    X_test['DeviceInfo'].isin(deviceinfo_most_pop), 'other')

## Нормализуем вещественные признаки

In [None]:
standard_scaler = StandardScaler(copy=True)

# Engineered features are not log-transformed. They are listed by name
# instead of relying on their position at the end of cols_float.
engineered_cols = {'TransactionAmt_cents', 'TransactionAmt_int_sum',
                   'dttm_week', 'dttm_day', 'dttm_hour', 'dttm_minute', 'dttm_sec'}

# NOTE: this loop transforms the frames in place, so running the cell twice
# applies the logarithm twice.
for col in cols_float:
    if col in engineered_cols:
        continue
    # Skip columns with negative values in EITHER frame: checking train only
    # would let negative test values turn into NaN.
    if (X_train[col] < 0).any() or (X_test[col] < 0).any():
        continue

    # log1p maps 0 to 0. Plain log maps 0 to -inf, and the -1e9 substitute
    # below would then dominate the column's mean/std and flatten every other
    # value after standard scaling.
    X_train[col] = np.log1p(X_train[col])
    X_test[col] = np.log1p(X_test[col])

# Defensive: StandardScaler rejects infinities, so cap any that are present.
X_train[cols_float] = X_train[cols_float].replace(np.inf * -1, -1000000000)
X_test[cols_float] = X_test[cols_float].replace(np.inf * -1, -1000000000)

X_train[cols_float] = X_train[cols_float].replace(np.inf, 1000000000)
X_test[cols_float] = X_test[cols_float].replace(np.inf, 1000000000)

# The scaler is fitted on train only and re-used for test.
X_train_matrix = standard_scaler.fit_transform(X_train[cols_float])
X_test_matrix = standard_scaler.transform(X_test[cols_float])

In [None]:
# Sanity check: print every numeric column of X_train that still contains
# NaN, together with the share of missing rows in percent.
n_rows = X_train.shape[0]
for col in cols_float:
    na_percent = int(X_train[col].isna().sum()) / n_rows * 100
    if na_percent > 0:
        print(col, na_percent)

In [None]:
# Wrap the scaled arrays back into DataFrames, labelled with the numeric
# column names in the order they were passed to the scaler.
X_train_matrix = pd.DataFrame(X_train_matrix, columns=X_train[cols_float].columns)
X_test_matrix = pd.DataFrame(X_test_matrix, columns=X_test[cols_float].columns)

In [None]:
# X_train_matrix.to_csv('X_train_matrix.csv') 
# X_test_matrix.to_csv('X_test_matrix.csv') 

In [None]:
# !zip X_train_matrix.zip X_train_matrix.csv
# !zip X_test_matrix.zip X_test_matrix.csv

In [None]:
# from google.colab import files

# files.download('X_train_matrix.zip')
# files.download('X_test_matrix.zip')

## Кодируем категориальные признаки

In [None]:
# Number of distinct levels per categorical column (one line per column).
for col, n_unique in X_train[cols_cat].nunique().items():
    print(col, n_unique)

In [None]:
# One-hot encode the categorical columns with the category_encoders encoder:
# it is fitted on the training frame only, and the same fitted encoder is then
# applied to the test frame.
ohe_enc = OneHotEncoder()

X_train_matrix_cat = ohe_enc.fit_transform(X_train[cols_cat])
X_test_matrix_cat = ohe_enc.transform(X_test[cols_cat])

In [None]:
# Newer category_encoders releases expose the sklearn-style
# get_feature_names_out() and deprecate get_feature_names(); use whichever
# this installation provides so the cell works across versions.
if hasattr(ohe_enc, 'get_feature_names_out'):
    ohe_feature_names = list(ohe_enc.get_feature_names_out())
else:
    ohe_feature_names = list(ohe_enc.get_feature_names())

# Label both encoded frames with the same encoder-provided column names.
X_train_matrix_cat = pd.DataFrame(X_train_matrix_cat)
X_train_matrix_cat.columns = ohe_feature_names

X_test_matrix_cat = pd.DataFrame(X_test_matrix_cat)
X_test_matrix_cat.columns = ohe_feature_names

In [None]:
# Final design matrices: scaled numeric block and one-hot block side by side.
# NOTE: this overwrites X_train / X_test, so the cell is not re-runnable
# without re-running the preprocessing above.
X_train = pd.concat((X_train_matrix, X_train_matrix_cat), axis='columns')
X_test = pd.concat((X_test_matrix, X_test_matrix_cat), axis='columns')

In [None]:
# Train and test must expose the same features in the same order. Comparing
# only the column count would miss renamed or reordered columns, which the
# models would silently consume as the wrong feature.
assert list(X_test.columns) == list(X_train.columns), 'train/test feature columns differ'

### Log Reg с L2 регуляризацией

In [None]:
# Split the data into training and validation parts (70% / 30%).
from sklearn.model_selection import train_test_split

# random_state makes the split (and every score below) reproducible;
# stratify keeps the class ratio identical in both parts.
# NOTE: X_train / y_train are overwritten, so this cell must run exactly once
# after the preprocessing cells.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train,
                                                  train_size=0.7,
                                                  random_state=42,
                                                  stratify=y_train)

In [None]:
# Train logistic regression (default L2 penalty) on the whole training split.
from sklearn.linear_model import LogisticRegression

# warm_start only re-uses the previous coefficients as the starting point of
# the next fit(); it does not accumulate data. Re-fitting chunk by chunk
# therefore ends with a model optimised for the LAST chunk only, so the model
# is fitted once on all training rows instead.
lr = LogisticRegression(max_iter=10000)

lr.fit(X_train, y_train)

In [None]:
# Class probabilities for the validation split (one column per class in
# lr.classes_).
pred_lr = lr.predict_proba(X_val)

In [None]:
# Baseline to beat: the score obtained when every row gets the same
# prediction (i.e. the majority class is always predicted).
from sklearn.metrics import roc_auc_score

# roc_auc_score expects numeric scores as its second argument; a constant
# numeric score expresses "always the same answer" (string labels such as
# '0' are not valid scores).
benchmark = np.zeros(len(y_val))

roc_auc_score(y_val, benchmark)

In [None]:
# ROC AUC of the fitted logistic regression on the validation split.
# The second predict_proba column is used as the fraud score.
y_val_pred_for_fraud = list(pred_lr[:, 1])

roc_auc_score(y_val, y_val_pred_for_fraud)

Как минимум первая моделька уже неплохо обучилась и превысила пороговое значение. Мы уже немного молодцы)

## Log Reg с l1-регуляризацией



In [None]:
# L1-regularised logistic regression fitted on the whole training split.
# The earlier chunk-by-chunk loop relied on warm_start, which the liblinear
# solver does not support: every fit() started from scratch and the final
# model had seen only the last chunk.
lr = LogisticRegression(max_iter=10000,
                        penalty='l1',
                        solver='liblinear'
                        )

lr.fit(X_train, y_train)

In [None]:
# Class probabilities of the L1 model for the validation split.
pred_lr = lr.predict_proba(X_val)

In [None]:
# Validation ROC AUC of the L1 model; the second probability column is the
# fraud score.
y_val_pred_for_fraud = list(pred_lr[:, 1])

roc_auc_score(y_val, y_val_pred_for_fraud)

In [None]:
import pickle
from datetime import datetime as dt

# Filesystem-safe timestamp: the default str(datetime) form contains a space
# and colons, which are awkward in shells and invalid on some filesystems.
model_name = f"best_model_{dt.today().strftime('%Y-%m-%d_%H-%M-%S')}.pkl"
with open(model_name, 'wb') as f:
    pickle.dump(lr, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Send the pickled model file to the browser for download. This uses the
# google.colab helper, so the cell only works when run inside Colab.
from google.colab import files

files.download(model_name) 

In [None]:
import os
import pickle

# Snapshot produced by an earlier session. It is optional: on a fresh run the
# file does not exist and the model trained above is kept, so Run All works.
saved_model_path = 'best_model_2022-12-23 16_38_11.565860.pkl'

# NOTE: pickle.load can execute arbitrary code; only load files you created.
if os.path.exists(saved_model_path):
    with open(saved_model_path, 'rb') as f:
        lr = pickle.load(f)
else:
    print(f'{saved_model_path} not found - keeping the model trained above')

In [None]:
# Positions of the features the L1 model actually uses, i.e. those with a
# NON-zero coefficient. The previous loop skipped non-zero weights and thus
# collected exactly the features that L1 had eliminated.
important_features = np.flatnonzero(lr.coef_[0]).tolist()

In [None]:
# Translate the selected positions into column names.
important_cols = [X_train.columns[position] for position in important_features]

## Подбираем параметры для Log Reg

In [None]:
# C = [i/100 for i in range(1, 100, 10)]
# C.extend([1, 3, 100])

# for val in tqdm(C):

#     lr = LogisticRegression(C=val, max_iter=500)
#     lr.fit(X_train, y_train)

#     pred_lr = lr.predict_proba(X_val)
#     y_val_pred_for_fraud = [x[1] for x in pred_lr]
#     print(val, roc_auc_score(y_val, y_val_pred_for_fraud))

In [None]:
# L1 logistic regression with the chosen regularisation strength, fitted on
# the whole training split. warm_start is not supported by liblinear, so the
# earlier chunk-by-chunk refits left a model trained on the last chunk only.
lr = LogisticRegression(max_iter=10000,
                        penalty='l1',
                        solver='liblinear',
                        C=3  # seems to be the best value found so far
                        )

lr.fit(X_train, y_train)

In [None]:
# Validation ROC AUC of the tuned L1 model.
pred_lr = lr.predict_proba(X_val)

# Second probability column = fraud score.
y_val_pred_for_fraud = list(pred_lr[:, 1])

roc_auc_score(y_val, y_val_pred_for_fraud)

In [None]:
import pickle
from datetime import datetime as dt

# Filesystem-safe timestamp: the default str(datetime) form contains a space
# and colons, which are awkward in shells and invalid on some filesystems.
model_name = f"best_model_{dt.today().strftime('%Y-%m-%d_%H-%M-%S')}.pkl"
with open(model_name, 'wb') as f:
    pickle.dump(lr, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Send the pickled model file to the browser for download (Colab only).
from google.colab import files

files.download(model_name) 

## SVM

In [None]:
# SVM training is very slow on the full feature set, so the dimensionality
# of the data is reduced first.
from sklearn.decomposition import PCA

# Two components for the 2-D scatter plot below. random_state pins the result
# when PCA picks its randomized solver.
pca = PCA(n_components=2, random_state=42)

X_train_new = pca.fit_transform(X_train[important_cols])
# X_val_new = pca.transform(X_val)
# X_test_new = pca.transform(X_test)

In [None]:
import seaborn as sns

X_train_df = pd.DataFrame(X_train_new)

# X_train_df has a fresh 0..n-1 index, while y_train still carries the
# shuffled index from train_test_split. seaborn aligns pandas inputs by index,
# so the labels are passed as a plain array to pair them by position.
fig, ax = plt.subplots()
sns.scatterplot(data=X_train_df, x=0, y=1, hue=y_train.to_numpy(), ax=ax)
ax.set(title='PCA projection of the training split',
       xlabel='component 0', ylabel='component 1')
ax.legend(title='isFraud')

plt.show()

Визуально, на основе разложения, можно выделить два класса.

In [None]:
# SVM training is very slow on the full feature set, so the dimensionality
# of the data is reduced first.
from sklearn.decomposition import PCA

# PCA cannot return more components than there are input features, so the
# target of 70 is capped by the number of selected columns. random_state pins
# the result when the randomized solver is used.
pca = PCA(n_components=min(70, len(important_cols)), random_state=42)

# Fitted on train only; validation and test are projected with the same axes.
X_train_new = pca.fit_transform(X_train[important_cols])
X_val_new = pca.transform(X_val[important_cols])
X_test_new = pca.transform(X_test[important_cols])

In [None]:
from sklearn.svm import SVC

# Compare the four SVC kernels on the PCA-reduced data. Each run is capped at
# 100 solver iterations and prints the kernel name with its validation ROC AUC.
for kernel in tqdm(('linear', 'rbf', 'poly', 'sigmoid')):
    svm = SVC(max_iter=100,
              verbose=True,
              probability=True,
              kernel=kernel)

    svm.fit(X_train_new, y_train)

    pred = svm.predict_proba(X_val_new)
    # second probability column = fraud score
    y_val_pred_for_fraud = list(pred[:, 1])

    print(kernel, roc_auc_score(y_val, y_val_pred_for_fraud))

In [None]:
# Sweep the polynomial degree. `degree` is only used by the 'poly' kernel
# (all other kernels ignore it), and predict_proba is only available when the
# model is built with probability=True.
for degree in tqdm(np.arange(2, 10)):
    svm = SVC(kernel='poly',
              degree=degree,
              probability=True,
              max_iter=1000)

    svm.fit(X_train_new, y_train)

    pred = svm.predict_proba(X_val_new)
    # second probability column = fraud score
    y_val_pred_for_fraud = list(pred[:, 1])

    print(degree, roc_auc_score(y_val, y_val_pred_for_fraud))

In [None]:
# Final linear SVM, trained on the full (non-PCA) feature matrix.
svm = SVC(kernel='linear',
          max_iter=1000,
          probability=True,
          verbose=True)

svm.fit(X_train, y_train)

# ROC AUC ranks rows by a continuous score, so the fraud probability is used;
# hard class labels from predict() are not a valid score.
pred = svm.predict_proba(X_val)
y_val_pred_for_fraud = list(pred[:, 1])

# The stale loop variable `degree` is no longer printed: it does not describe
# this model.
print(roc_auc_score(y_val, y_val_pred_for_fraud))

In [None]:
import pickle
from datetime import datetime as dt

# Filesystem-safe timestamp: the default str(datetime) form contains a space
# and colons, which are awkward in shells and invalid on some filesystems.
model_name = f"svm_best_model_{dt.today().strftime('%Y-%m-%d_%H-%M-%S')}.pkl"
with open(model_name, 'wb') as f:
    pickle.dump(svm, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Send the pickled SVM file to the browser for download (Colab only).
from google.colab import files

files.download(model_name) 

## Catboost

In [None]:
from catboost import CatBoostClassifier

# Gradient-boosting configuration: Logloss objective, AUC reported as the
# evaluation metric, progress logged every 100 iterations.
# NOTE(review): od_wait is an overfitting-detector setting, but fit() below
# receives no eval_set - confirm whether it has any effect here.
params = dict(
    iterations=2500,
    depth=8,
    loss_function='Logloss',
    eval_metric='AUC',
    learning_rate=.1,
    random_seed=42,
    od_wait=5,
    verbose=100,
)

model = CatBoostClassifier(**params)

model.fit(X_train, y_train)

In [None]:
# Validation ROC AUC of the CatBoost model.
pred = model.predict_proba(X_val)

# second probability column = fraud score
y_val_pred_for_fraud = list(pred[:, 1])

print(roc_auc_score(y_val, y_val_pred_for_fraud))

In [None]:
import pickle
from datetime import datetime as dt

# Filesystem-safe timestamp: the default str(datetime) form contains a space
# and colons, which are awkward in shells and invalid on some filesystems.
model_name = f"catboost_best_model_{dt.today().strftime('%Y-%m-%d_%H-%M-%S')}.pkl"
with open(model_name, 'wb') as f:
    pickle.dump(model, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Send the pickled CatBoost file to the browser for download (Colab only).
from google.colab import files

files.download(model_name)

## Тестовый сабмит 

In [None]:
# Class probabilities of the logistic regression for the test set.
# NOTE(review): the submission built below uses `pred` (CatBoost), not
# pred_lr - confirm this cell is still needed.
pred_lr = lr.predict_proba(X_test)

In [None]:
# Load the sample submission file and display its (rows, columns) shape.
# NOTE(review): `sample` is not used by the cells visible below.
sample = pd.read_csv('sample_submission.csv')
sample.shape

In [None]:
# Class probabilities of the CatBoost model for the test set; these feed the
# submission file built in the next cells.
pred = model.predict_proba(X_test)

In [None]:
# Submission frame: the transaction ids of the test table paired, by row
# position, with the predicted fraud probability (second predict_proba column).
result = test_transaction[['TransactionID']].reset_index()

result['isFraud'] = list(pred[:, 1])

In [None]:
# Write only the two submission columns, without the index column.
cols = ['TransactionID', 'isFraud']
result.to_csv('prediction.csv', columns=cols, index=False)

In [None]:
# Upload prediction.csv to the ieee-fraud-detection competition through the
# Kaggle CLI; -f is the file and -m the submission message.
!kaggle competitions submit -c ieee-fraud-detection -f  prediction.csv -m "Message"