In [45]:
import pandas as pd
import datetime
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=False)

import datetime
import pandas as pd
import time

# Row label (in df_ori) of the transaction used as the running example.
TEST = 138696
df_ori = pd.read_csv('/kaggle/input/adyen-dataset/adyen-dataset.csv')

# The list-valued .loc keeps the selection a one-row DataFrame (not a Series).
df_test_original = df_ori.loc[[TEST]]

# Independent copy restricted to the columns the feature functions work with.
df_test = df_test_original[
    ["psp_reference", "eur_amount", "card_number", "email_address", "ip_address", "ip_country","zip_code","has_fraudulent_dispute"]
].copy()

def date(row):
    """Turn a positional record into the calendar day it describes.

    ``row[3]`` is read as the year and ``row[6]`` as the 1-based day of the
    year (truncated to an integer). The result is a ``datetime.datetime`` at
    midnight of that day.
    """
    calendar_year, ordinal_day = row[3], int(row[6])
    # Day 1 is January 1st, hence the offset of ``ordinal_day - 1`` days.
    return datetime.datetime(calendar_year, 1, 1) + datetime.timedelta(days=ordinal_day - 1)


def get_tx_datetime(transaction):
    """Assemble the timestamp of a transaction from its calendar fields.

    Reads the attributes ``year``, ``day_of_year`` (1-based, truncated to an
    integer), ``hour_of_day`` and ``minute_of_hour`` from ``transaction`` and
    returns the matching ``datetime.datetime`` (seconds are zero).
    """
    calendar_year = transaction.year
    ordinal_day = int(transaction.day_of_year)
    clock = (transaction.hour_of_day, transaction.minute_of_hour)

    # Midnight of the transaction day: January 1st plus the elapsed days.
    midnight = datetime.datetime(calendar_year, 1, 1) + datetime.timedelta(days=ordinal_day - 1)
    # combine() keeps only the date part of ``midnight`` and attaches the clock time.
    return datetime.datetime.combine(midnight, datetime.time(*clock))


def is_weekend(tx_datetime):
    """Return 1 when ``tx_datetime`` falls on a Saturday or Sunday, else 0."""
    # weekday() numbers the days from Monday (0) to Sunday (6).
    saturday = 5
    return int(tx_datetime.weekday() >= saturday)


def is_night(tx_datetime):
    """Return 1 for a night-time transaction, else 0.

    A transaction counts as night-time when the hour of ``tx_datetime`` is 6
    or lower, i.e. from 00:00 up to and including 06:59.
    """
    last_night_hour = 6
    return int(tx_datetime.hour <= last_night_hour)


def get_card_spending_behaviour_features(transactions, windows_size_in_days=(1, 7, 30)):
    """Write rolling card-spending features for the current transaction into ``df_test``.

    For each window size ``n`` (in days) the number of transactions and their
    average ``eur_amount`` over the window that ends at the current transaction
    are stored in the global one-row frame ``df_test`` as
    ``card_nb_tx_<n>day_window`` and ``card_avg_amount_<n>day_window``.

    Parameters
    ----------
    transactions : pandas.DataFrame
        Transactions to aggregate over. Must provide the columns
        ``tx_datetime``, ``eur_amount`` and ``psp_reference`` and must contain
        the row whose index label equals ``df_test.index[0]``. Row labels are
        assumed to be unique.
    windows_size_in_days : iterable of int, optional
        Window lengths in days.

    Returns
    -------
    pandas.DataFrame
        ``transactions`` sorted by ``tx_datetime`` and indexed by
        ``psp_reference``. The frame passed in is left untouched.

    Raises
    ------
    KeyError
        If the current transaction is not part of ``transactions``.
    """
    global df_test
    # Row label of the current transaction; get_diff_tx_time and
    # is_diff_previous identify it the same way.
    row_label = df_test.index[0]

    # Chronological order is required by the time-based rolling windows. The
    # stable sort keeps transactions that share a timestamp in input order.
    transactions = transactions.sort_values('tx_datetime', kind='mergesort')

    # Remember WHERE the current transaction sits before the index is replaced.
    # Looking it up by timestamp afterwards is unsafe: timestamps can repeat,
    # and a lookup on a duplicated timestamp yields a Series that does not
    # align with df_test, so the features would silently become NaN.
    position = transactions.index.get_loc(row_label)

    # A datetime index is what enables offset-based ("<n>D") rolling windows.
    transactions.index = transactions.tx_datetime

    for window_size in windows_size_in_days:
        window = transactions['eur_amount'].rolling(str(window_size) + 'D')
        sum_amount_tx_window = window.sum().iloc[position]
        nb_tx_window = window.count().iloc[position]

        # Each window includes the row it ends on, so for a current
        # transaction with a non-missing amount the count is at least 1.
        avg_amount_tx_window = sum_amount_tx_window / nb_tx_window

        df_test['card_nb_tx_' + str(window_size) + 'day_window'] = nb_tx_window
        df_test['card_avg_amount_' + str(window_size) + 'day_window'] = avg_amount_tx_window

    # Hand the frame back indexed by psp_reference.
    transactions.index = transactions.psp_reference

    return transactions


def get_diff_tx_time(transactions):
    """Store the hours elapsed since the previous transaction in ``df_test``.

    ``transactions`` is ordered by ``tx_datetime`` and the gap between the
    current transaction (the row labelled ``df_test.index[0]``) and the one
    before it is written to ``df_test["diff_tx_time_in_hours"]``. The earliest
    transaction has no predecessor, so its gap is a missing value.
    """
    global df_test
    row_label = df_test.index[0]
    ordered_times = transactions.sort_values('tx_datetime').tx_datetime
    gap_in_seconds = ordered_times.diff().dt.total_seconds()
    df_test["diff_tx_time_in_hours"] = gap_in_seconds.loc[row_label] / 3600


def is_diff_previous(transactions, feature):
    """Flag whether ``feature`` changed compared with the previous transaction.

    Transactions are ordered by ``tx_datetime`` and rows with a missing
    ``feature`` are ignored, so the comparison is made against the most recent
    earlier transaction that has a value. The result for the current
    transaction (the row labelled ``df_test.index[0]``) is written to the
    global ``df_test`` as ``is_diff_previous_<feature>``.

    Parameters
    ----------
    transactions : pandas.DataFrame
        Must provide ``tx_datetime`` and ``feature`` and contain the current
        transaction. Row labels are assumed to be unique.
    feature : str
        Name of the column to compare.

    Raises
    ------
    KeyError
        If the current transaction is not part of ``transactions``.

    Notes
    -----
    The earliest transaction with a value is flagged True: it is compared with
    the missing value shifted in before it, and that comparison is unequal.
    """
    global df_test
    row_label = df_test.index[0]
    column = "is_diff_previous_" + feature

    ordered = transactions.sort_values('tx_datetime')

    # The current transaction itself may lack the feature. Its row is then
    # dropped below and the label lookup would raise KeyError.
    # NOTE(review): writing False ("no change detected") for that case is an
    # assumption - confirm it matches how the model's training data was built.
    if pd.isna(ordered.loc[row_label, feature]):
        df_test[column] = False
        return

    # Compare only against transactions that actually carry a value.
    known_values = ordered.dropna(subset=[feature])[feature]
    changed = known_values != known_values.shift(periods=1)
    df_test[column] = changed.loc[row_label]


def get_count_risk_rolling_window(transactions, feature, delay_period=7, windows_size_in_days=(1, 7, 30)):
    """Write delayed transaction-count and fraud-risk features into ``df_test``.

    For each window size ``n`` (in days) two values are stored in the global
    one-row frame ``df_test``:

    * ``<feature>_nb_tx_<n>day_window``: number of transactions counted in the
      last ``delay_period + n`` days minus the number counted in the last
      ``delay_period`` days, both windows ending at the current transaction;
    * ``<feature>_risk_<n>day_window``: the same difference taken on the sum of
      ``has_fraudulent_dispute``, divided by that number of transactions
      (0 when the ratio is undefined).

    If any value of ``feature`` in ``transactions`` is missing, all of these
    features are set to 0.

    Parameters
    ----------
    transactions : pandas.DataFrame
        Transactions sharing the current transaction's ``feature`` value. Must
        provide ``tx_datetime``, ``has_fraudulent_dispute``, ``psp_reference``
        and ``feature`` and, unless ``feature`` is missing, contain the row
        labelled ``df_test.index[0]``. Row labels are assumed to be unique.
    feature : str
        Column the transactions were grouped on; used as the feature prefix.
    delay_period : int, optional
        Number of most recent days excluded from the counts.
    windows_size_in_days : iterable of int, optional
        Window lengths in days.

    Returns
    -------
    pandas.DataFrame
        The transactions indexed by ``psp_reference`` (sorted by
        ``tx_datetime`` unless ``feature`` is missing). The frame passed in is
        never modified.

    Raises
    ------
    KeyError
        If ``feature`` is present but the current transaction is not part of
        ``transactions``.
    """
    global df_test

    if transactions[feature].isnull().values.any():
        for window_size in windows_size_in_days:
            df_test[feature + '_nb_tx_' + str(window_size) + 'day_window'] = 0
            df_test[feature + '_risk_' + str(window_size) + 'day_window'] = 0
        # Re-index a copy so the caller's frame keeps its own index.
        reindexed = transactions.copy()
        reindexed.index = reindexed.psp_reference
        return reindexed

    row_label = df_test.index[0]

    # Stable chronological order; ties on the timestamp keep their input order.
    transactions = transactions.sort_values('tx_datetime', kind='mergesort')

    # Locate the current transaction by position before the index is replaced.
    # A timestamp lookup is ambiguous when several transactions share it: it
    # returns a Series that does not align with df_test (features become NaN).
    position = transactions.index.get_loc(row_label)

    transactions.index = transactions.tx_datetime
    disputes = transactions['has_fraudulent_dispute']

    def _totals_at_current_tx(days):
        """Fraud sum and transaction count over the last ``days`` days."""
        window = disputes.rolling(str(days) + 'D')
        return window.sum().iloc[position], window.count().iloc[position]

    nb_fraud_delay, nb_tx_delay = _totals_at_current_tx(delay_period)

    for window_size in windows_size_in_days:
        nb_fraud_delay_window, nb_tx_delay_window = _totals_at_current_tx(delay_period + window_size)

        nb_fraud_window = nb_fraud_delay_window - nb_fraud_delay
        nb_tx_window = nb_tx_delay_window - nb_tx_delay

        # The risk is undefined without transactions in the window; it is
        # reported as 0. Assigning the final value directly (instead of calling
        # fillna(inplace=True) on a column selection) guarantees df_test itself
        # is updated.
        risk_window = nb_fraud_window / nb_tx_window if nb_tx_window > 0 else 0.0
        if pd.isna(risk_window):
            risk_window = 0.0

        df_test[feature + '_nb_tx_' + str(window_size) + 'day_window'] = nb_tx_window
        df_test[feature + '_risk_' + str(window_size) + 'day_window'] = risk_window

    transactions.index = transactions.psp_reference

    return transactions


# Baseline table holding, for every transaction, the raw columns the feature
# functions need plus its timestamp. ``df_ori`` has already been loaded from
# this CSV at the top of the cell, so it is reused rather than parsed again.
# The columns are selected first so that only those eight are copied.
df_baseline_features = df_ori[
    ["psp_reference", "eur_amount", "card_number", "email_address", "ip_address", "ip_country","zip_code","has_fraudulent_dispute"]
].copy()

# Row-wise timestamp construction through pandarallel's parallel_apply.
df_baseline_features["tx_datetime"] = pd.to_datetime(
    df_ori.parallel_apply(get_tx_datetime, axis=1))
    
def _baseline_rows_sharing(column, value):
    """Rows of ``df_baseline_features`` whose ``column`` matches ``value`` (missing matches missing)."""
    values = df_baseline_features[column]
    # ``values == NaN`` is False everywhere, so a missing value needs isna().
    mask = values.isna() if pd.isna(value) else values == value
    return df_baseline_features[mask]


def time_feature_engineering(reference):
    """Build the features of one transaction in ``df_test`` and time the work.

    The global ``df_test`` is replaced by a one-row frame for the transaction
    labelled ``reference`` in ``df_ori`` and filled with: its timestamp, the
    night/weekend flags, the card-based features (time since the previous
    transaction, change of ``ip_country``, spending windows), the e-mail and IP
    count/risk windows and the ``no_ip`` / ``no_email`` flags.

    Parameters
    ----------
    reference : hashable
        Row label of the transaction in ``df_ori`` / ``df_baseline_features``.

    Returns
    -------
    float
        Seconds elapsed while the features were computed.

    Notes
    -----
    A transaction without e-mail or IP address is matched with the other
    transactions lacking that field, which lets
    ``get_count_risk_rolling_window`` write its zero-valued features. Filtering
    with ``==`` would select no rows for a missing value and the columns would
    be absent from ``df_test``. A transaction whose ``card_number`` is missing
    gets no card-based features.
    """
    global df_test

    df_test_original = df_ori.loc[[reference]]

    start = time.time()
    df_test = df_test_original[
        ["psp_reference", "eur_amount", "card_number", "email_address", "ip_address", "ip_country","zip_code","has_fraudulent_dispute"]
    ].copy()

    df_test["tx_datetime"] = pd.to_datetime(df_test_original.apply(get_tx_datetime, axis=1))
    df_test["is_night"] = df_test.tx_datetime.apply(is_night)
    df_test["is_weekend"] = df_test.tx_datetime.apply(is_weekend)

    current = df_test.iloc[0]

    # Every selection below holds a single group, so the feature functions are
    # called on it directly; the card history is filtered once and shared.
    card_transactions = df_baseline_features[df_baseline_features['card_number'] == current['card_number']]
    if not card_transactions.empty:
        get_diff_tx_time(card_transactions)
        is_diff_previous(card_transactions, feature="ip_country")
        get_card_spending_behaviour_features(card_transactions, windows_size_in_days=[1, 7, 30])

    get_count_risk_rolling_window(
        _baseline_rows_sharing('email_address', current['email_address']),
        feature="email_address", delay_period=7, windows_size_in_days=[1, 7, 30])

    get_count_risk_rolling_window(
        _baseline_rows_sharing('ip_address', current['ip_address']),
        feature="ip_address", delay_period=7, windows_size_in_days=[1, 7, 30])

    df_test['no_ip'] = df_test['ip_address'].isnull()
    df_test['no_email'] = df_test['email_address'].isnull()

    end = time.time()

    return end - start
# Disabled preview: the expression is wrapped in a string literal, so the table
# is not rendered. As the last expression of the cell, the string itself is
# what the cell echoes as its output.
"""
df_baseline_features
"""

INFO: Pandarallel will run on 2 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


'\ndf_baseline_features\n'

In [46]:
# Collect the elapsed time of every feature-engineering call: each of the
# 1000 row labels preceding 138696 is processed twice.
X = []
for element in range(138696 - 1000, 138696):
    for _ in range(2):
        X += [time_feature_engineering(element)]

X



[0.15735077857971191,
 0.11711668968200684,
 0.11406707763671875,
 0.11268782615661621,
 0.11445832252502441,
 0.11464691162109375,
 0.11958813667297363,
 0.11613821983337402,
 0.10043549537658691,
 0.11696887016296387,
 0.11939096450805664,
 0.11801719665527344,
 0.11472749710083008,
 0.13269376754760742,
 0.14082789421081543,
 0.12198257446289062,
 0.08433246612548828,
 0.08089923858642578,
 0.09738922119140625,
 0.09741640090942383,
 0.0943455696105957,
 0.09874844551086426,
 0.09684586524963379,
 0.09702181816101074,
 0.11703991889953613,
 0.11280465126037598,
 0.11458754539489746,
 0.1125938892364502,
 0.11526942253112793,
 0.11528587341308594,
 0.11560177803039551,
 0.11531710624694824,
 0.11516976356506348,
 0.1132352352142334,
 0.11752009391784668,
 0.1188814640045166,
 0.09830784797668457,
 0.0985269546508789,
 0.1170661449432373,
 0.1123971939086914,
 0.09584975242614746,
 0.09684276580810547,
 0.11549949645996094,
 0.1170356273651123,
 0.09971499443054199,
 0.101679801940917

In [47]:
import numpy as np

# Summary statistics of the timings gathered in X, one per output line.
print(f"Mean {np.mean(X)}", f"Standart Deviation {np.std(X)}", sep="\n")

Mean 0.10903204035758972
Standart Deviation 0.009779375564710332


Unnamed: 0,psp_reference,merchant,card_scheme,year,hour_of_day,minute_of_hour,day_of_year,is_credit,eur_amount,ip_country,issuing_country,device_type,ip_address,email_address,card_number,shopper_interaction,zip_code,card_bin,has_fraudulent_dispute,is_refused_by_adyen
138696,58957356926,Merchant D,Other,2021,10,6,312,True,35.13,BR,BR,Windows,947PJ1zh6yFwZxGOYG8Lnw,ReVzz-e9w8mNO63YA1cjFA,DJHwui3GH60rBpx_tAOZZw,Ecommerce,BZD,4920,True,False
