In [91]:
import pandas as pd
from math import radians

### Feature Functions

In [92]:
# Compute age at transaction.
def get_age_at_transaction(trans_df, profiles_df):
    """Add an ``age_at_transaction`` column to ``trans_df``.

    The age is the calendar-year difference between the transaction's
    ``datetime`` and the cardholder's ``birthdate`` (looked up in
    ``profiles_df`` via ``cc_num``); month and day are not taken into account.

    The birthdate is attached by card number rather than by row position, so
    the result lines up with ``trans_df`` whatever index it carries and
    however ``profiles_df`` is ordered.

    Args:
        trans_df: Transactions with ``cc_num`` and ``datetime``
            (``%Y-%m-%d %H:%M:%S``) columns. Modified in place; its
            ``datetime`` column is left untouched.
        profiles_df: Profiles with ``cc_num`` and ``birthdate``
            (``%Y-%m-%d``) columns. If a card number occurs more than once,
            the first row is used.

    Returns:
        ``trans_df`` itself, with the new column. Rows whose ``cc_num`` has
        no profile get NaN.
    """
    # One birthdate per card; the lookup below requires a unique index.
    birthdate_by_card = pd.to_datetime(
        profiles_df.drop_duplicates("cc_num").set_index("cc_num")["birthdate"],
        format="%Y-%m-%d",
    )

    # Parse into a temporary Series so the caller's "datetime" column keeps
    # its original dtype.
    transaction_times = pd.to_datetime(
        trans_df["datetime"], format="%Y-%m-%d %H:%M:%S"
    )

    # Mapping by cc_num keeps trans_df's own index, so the assignment below
    # cannot misalign rows.
    birth_years = trans_df["cc_num"].map(birthdate_by_card).dt.year
    trans_df["age_at_transaction"] = transaction_times.dt.year - birth_years
    return trans_df

# Compute days until card expires.
def get_days_until_card_expires(trans_df, credit_cards_df):
    """Add a ``days_until_card_expires`` column to ``trans_df``.

    The value is the number of whole days between the transaction's
    ``datetime`` and the card's ``expires`` date (looked up in
    ``credit_cards_df`` via ``cc_num``). ``expires`` is parsed with
    ``%m/%y``, which resolves to the first day of that month.

    The expiry date is attached by card number rather than by row position,
    so the result lines up with ``trans_df`` whatever index it carries and
    however ``credit_cards_df`` is ordered.

    Args:
        trans_df: Transactions with ``cc_num`` and ``datetime``
            (``%Y-%m-%d %H:%M:%S``) columns. Modified in place; its
            ``datetime`` column is left untouched.
        credit_cards_df: Cards with ``cc_num`` and ``expires`` (``%m/%y``)
            columns. If a card number occurs more than once, the first row
            is used.

    Returns:
        ``trans_df`` itself, with the new column. Rows whose ``cc_num`` has
        no card record get NaN.
    """
    # One expiry date per card; the lookup below requires a unique index.
    expiry_by_card = pd.to_datetime(
        credit_cards_df.drop_duplicates("cc_num").set_index("cc_num")["expires"],
        format="%m/%y",
    )

    # Parse into a temporary Series so the caller's "datetime" column keeps
    # its original dtype.
    transaction_times = pd.to_datetime(
        trans_df["datetime"], format="%Y-%m-%d %H:%M:%S"
    )

    # Mapping by cc_num keeps trans_df's own index, so the assignment below
    # cannot misalign rows.
    time_left = trans_df["cc_num"].map(expiry_by_card) - transaction_times
    trans_df["days_until_card_expires"] = time_left.dt.days
    return trans_df


# 1. Loading the data and feature engineering

### Loading the Data

In [93]:
# Read the three raw CSV tables used in the rest of the notebook.
transactions_df = pd.read_csv("datasource/transactions.csv")
profiles_df = pd.read_csv("datasource/profiles.csv")
credit_cards_df = pd.read_csv("datasource/credit_cards.csv")

## Feature Engineering

Fraudulent transactions can differ from regular ones in many different ways. Typical red flags would for instance be a large transaction volume/frequency in the span of a few hours. It could also be the case that elderly people in particular are targeted by fraudsters. To facilitate model learning you will create additional features based on these patterns. In particular, you will create two types of features:
1. **Features that aggregate data from different data sources**. This could for instance be the age of a customer at the time of a transaction, which combines the `birthdate` feature from `profiles.csv` with the `datetime` feature from `transactions.csv`.
2. **Features that aggregate data from multiple time steps**. An example of this could be the transaction frequency of a credit card in the span of a few hours, which is computed using a window function.

Let's start with the first category.

In [94]:
# Enrich the transactions with features derived from the other tables:
# the age at transaction (from the profiles) followed by the days until the
# card expires (from the credit cards).
transactions_df = (
    transactions_df
    .pipe(get_age_at_transaction, profiles_df)
    .pipe(get_days_until_card_expires, credit_cards_df)
)

In [95]:
# Preview the first rows of the two feature columns added above.
transactions_df.loc[:, ["age_at_transaction", "days_until_card_expires"]].head(3)

Unnamed: 0,age_at_transaction,days_until_card_expires
0,98,1460
1,98,1459
2,98,1459


In [96]:
# (rows, columns) before the de-duplication on `datetime` in the next cell.
transactions_df.shape

(106020, 12)

In [97]:
# Keep only the first row for each distinct `datetime` value.
# NOTE(review): the key is the timestamp alone, so rows belonging to different
# cards that share a timestamp are collapsed as well — confirm this is intended.
transactions_df = transactions_df.drop_duplicates(subset=["datetime"], keep="first")

In [98]:
# (rows, columns) after the de-duplication on `datetime` in the previous cell.
transactions_df.shape

(105092, 12)

In [None]:
# Sort transactions_df by "datetime" in ascending order so that, within each
# card, rows follow one another in time.
transactions_df = transactions_df.sort_values("datetime")

# Convert "longitude" and "latitude" with math.radians (assumes the CSV stores
# them in degrees — TODO confirm). Series.map is used per column because
# DataFrame.applymap is deprecated in recent pandas releases.
# NOTE: this overwrites the columns, so re-running the cell converts them twice.
transactions_df[["longitude", "latitude"]] = (
    transactions_df[["longitude", "latitude"]].apply(lambda column: column.map(radians))
)

# Create "loc_delta" by applying transactions_fraud.haversine to each card's
# coordinates (presumably the haversine distance between consecutive
# transactions of that card); missing values are replaced with 0.
# NOTE(review): `transactions_fraud` is not imported in any cell visible here —
# confirm it is imported before this cell runs.
transactions_df["loc_delta"] = (
    transactions_df.groupby("cc_num")
    .apply(lambda card: transactions_fraud.haversine(card["longitude"], card["latitude"]))
    .reset_index(level=0, drop=True)
    .fillna(0)
)