First submission code by Rodrigo Veríssimo.

Note: Removed the calculated day of the week and the timedelta variable (in hours) between the most recent transaction in the training set and the observed transaction, because we only have 5 days' worth of data and I don't really believe in the timedelta one. They slightly improved validation results; I still have to confirm this by submitting in the portal again.

In [2]:
from typing import List, Tuple

import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin, ClassifierMixin
from sklearn.model_selection import cross_val_score
from category_encoders.target_encoder import TargetEncoder

# Seed handed to the classifier so runs are reproducible.
RANDOM_STATE: int = 1
# Suffix used in the name of the generated submission file.
SUBMISSION_VERSION: int = 1
# Columns handled as categorical features: the named identifier columns
# followed by the anonymised C15..C21 columns.
cat_columns: List[str] = [
    'product_id',
    'product_department',
    'product_category',
    'card_id',
    'user_id',
    *(f'C{idx}' for idx in range(15, 22)),
]

In [4]:
def load_data() -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Read the training and test CSV files from the ``data`` directory.

    Returns:
        A ``(train, test)`` tuple of DataFrames, loaded from
        ``data/train.csv`` and ``data/test.csv`` respectively.
    """
    df_train = pd.read_csv('data/train.csv')
    df_test = pd.read_csv('data/test.csv')
    return df_train, df_test

def submit_submission(df_transactions: pd.DataFrame,
                     model: Pipeline,
                     submission_version: int) -> None:
    """Score the transactions and write them out as a submission CSV.

    Args:
        df_transactions: Transactions to score; must contain an ``id`` column.
        model: Fitted model exposing ``predict_proba``.
        submission_version: Number embedded in the output file name
            (``submission_<version>.csv``).
    """
    # The 'id' column is looked up first, then the second column of the
    # predict_proba output is attached as 'isfraud'.
    (
        df_transactions['id']
        .to_frame()
        .assign(isfraud=model.predict_proba(df_transactions)[:, 1])
        .to_csv(f'submission_{submission_version}.csv', index=False)
    )

class ColumnTypeConverter(TransformerMixin):
    """Pipeline step that casts raw transaction columns to working dtypes.

    ``timestamp`` is parsed from a millisecond epoch value into a datetime
    column, and every column named in ``self.cat_columns`` is cast to the
    pandas ``category`` dtype.
    """

    def __init__(self) -> None:
        # Columns this transformer converts to the 'category' dtype.
        self.cat_columns = ['product_id', 'product_department',
       'product_category', 'card_id', 'user_id', 'C15', 'C16', 'C17', 'C18',
       'C19', 'C20', 'C21']
    
    def fit(self, df_transactions: pd.DataFrame, _=None):
        """Do nothing and return ``self``; the transformer holds no fitted state.

        The second (target) argument is ignored. It defaults to ``None`` so
        the step can also be fitted without a target.
        """
        return self
    
    def transform(self, df_transactions: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df_transactions`` with converted column dtypes.

        Args:
            df_transactions: Frame containing ``timestamp`` (milliseconds
                since the epoch) and all columns in ``self.cat_columns``.

        Returns:
            A new DataFrame; the input frame is left untouched.
        """
        new_df_transactions: pd.DataFrame = df_transactions.copy()
        new_df_transactions.timestamp = pd.to_datetime(new_df_transactions.timestamp, unit='ms')
        # Use the instance's own column list on both sides of the assignment
        # so the transformer does not depend on a module-level variable.
        new_df_transactions[self.cat_columns] = (
            df_transactions[self.cat_columns].astype('category')
        )
        return new_df_transactions
    
class ExtractDatetimeFeatures(TransformerMixin):
    """Pipeline step that derives time-of-day features from ``timestamp``.

    Adds ``HourDay`` (fractional hour of the day), ``DayWeek`` (day of the
    week) and the cyclical encodings ``HourDayCos`` / ``HourDaySin``.
    """

    def __init__(self) -> None:
        pass
    
    def fit(self, df_transactions: pd.DataFrame, _=None):
        """Do nothing and return ``self``; the transformer holds no fitted state.

        The second (target) argument is ignored. It defaults to ``None`` so
        the step can also be fitted without a target.
        """
        return self
    
    def transform(self, df_transactions: pd.DataFrame) -> pd.DataFrame:
        """Return ``df_transactions`` extended with all derived time features."""
        return(df_transactions.pipe(self.build_temporal_features)
                              .pipe(self.cyclical_transform))

    def build_temporal_features(
        self, df_transactions: pd.DataFrame
    ) -> pd.DataFrame:
        """Add the ``HourDay`` and ``DayWeek`` columns.

        Args:
            df_transactions: Frame whose ``timestamp`` column is datetime-typed.

        Returns:
            A new DataFrame with the two extra columns.
        """
        timestamps = df_transactions.timestamp
        return df_transactions.assign(
            # An hour has 60 minutes: dividing by 60 keeps HourDay in [0, 24)
            # (dividing by 59 would map 23:59 to 24.0).
            HourDay=timestamps.dt.hour + timestamps.dt.minute / 60,
            DayWeek=timestamps.dt.dayofweek,
        )

    def cyclical_transform(self, df_transactions) -> pd.DataFrame:
        """Add the sine/cosine encoding of ``HourDay``."""
        # The hour of the day wraps around every 24 hours; with a period of
        # 23, hour 0 and hour 23 would receive the same encoding.
        return (
            df_transactions.pipe(self.compute_cyclical, 'HourDay', 24)
        )

    def compute_cyclical(self, df_transactions: pd.DataFrame, 
                         feature_name: str, period: int) -> pd.DataFrame:
        """Encode a periodic feature as a point on the unit circle.

        Args:
            df_transactions: Frame containing the ``feature_name`` column.
            feature_name: Name of the periodic column to encode.
            period: Length of one full cycle, in the feature's own units.

        Returns:
            A new DataFrame with ``<feature_name>Cos`` and
            ``<feature_name>Sin`` added; the input frame is not modified.
        """
        angle = 2*np.pi*df_transactions[feature_name] / period
        # assign() returns a copy, so the caller's frame is never mutated.
        return df_transactions.assign(**{
            feature_name + "Cos": np.cos(angle),
            feature_name + "Sin": np.sin(angle),
        })

class Model(ClassifierMixin):
    """Logistic-regression classifier fitted on a fixed subset of columns.

    Only the columns listed in ``self.columns`` are passed to the wrapped
    estimator; any other column in the incoming frame is ignored.
    """

    def __init__(self, random_state: int) -> None:
        """Create the wrapped estimator.

        Args:
            random_state: Seed forwarded to ``LogisticRegression``.
        """
        # Feature columns fed to the estimator, in this order.
        self.columns: List[str] = ['product_id', 'product_department',
       'product_category', 'card_id', 'user_id', 'C15', 'C16', 'C17', 'C18',
       'C19', 'C20', 'C21', 'amount', 'HourDayCos', 'HourDaySin']
        self.model: LogisticRegression = LogisticRegression(random_state=random_state,
                                       solver='liblinear')

    def fit(self, df_transactions: pd.DataFrame, is_fraud: pd.Series):
        """Fit the wrapped estimator on ``self.columns`` and return ``self``."""
        self.model.fit(df_transactions[self.columns], is_fraud)
        return self
    
    def predict_proba(self, df_transactions: pd.DataFrame) -> np.ndarray:
        """Return the wrapped estimator's class probabilities.

        The result is whatever ``LogisticRegression.predict_proba`` returns
        for the ``self.columns`` subset: an array with one row per
        transaction and one column per class, not a DataFrame.
        """
        return self.model.predict_proba(df_transactions[self.columns])

In [5]:
# Load the training and test transaction tables returned by load_data().
df_train, df_test = load_data()

In [6]:
# Build the pipeline (dtype conversion -> datetime features -> target
# encoding of the categorical columns -> classifier) and fit it on the
# training frame, using 'isfraud' as the target.
pipeline: Pipeline = Pipeline(
    steps=[
        ('dtype_converter', ColumnTypeConverter()),
        ('datetime_extractor', ExtractDatetimeFeatures()),
        (
            'categorical_encoder',
            TargetEncoder(cols=cat_columns, min_samples_leaf=20),
        ),
        ('model', Model(random_state=RANDOM_STATE)),
    ]
).fit(
    df_train.drop(columns='isfraud'),
    df_train['isfraud'],
)

In [7]:
# Score the test set with the fitted pipeline and write the submission file.
submit_submission(df_test, pipeline, SUBMISSION_VERSION)