In [None]:
# Load the `nb_black` IPython extension (presumably auto-formats code cells with Black; the package must be installed — confirm).
%load_ext nb_black

In [None]:
import datetime as dt
import os.path as osp
import datetime as dt
from datetime import date, time
from functools import partial
from typing import List

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from eda.stats.categorical import (
    describe as describe_cat,
    get_value_counts,
    _get_contributions_of_features,
)
from eda.stats.continuous import describe as describe_continuous
from helpers.series_list import bag_of_words_series, get_n_elements, get_unique_values
from helpers.read_file import read_feather

In [None]:
# Apply the seaborn theme with a wide default figure size for all plots below.
sns.set_theme(rc={"figure.figsize": (15, 8)})

# Table of contents <a id="table_of_contents"></a>
- [Load data](#load_data)
- [Distributions](#distributions)
    - [Labels](#labels)
    - [Continuous](#continuous)
    - [Time related](#time_related)
    - [Categorical](#categorical)

# Load data <a  id="load_data"> </a>
- [Table of contents](#table_of_contents)

In [None]:
# Path to the training set in Feather format, relative to the notebook's working directory.
TRAINING_DATA_PATH = "../data/train.fth"
# Load the frame through the project's `read_feather` helper; every later cell works on `df`.
df = read_feather(TRAINING_DATA_PATH)

In [None]:
# Preview the first rows to sanity-check the load.
df.head()

## Getting the types of the columns

In [None]:
# Print column dtypes and non-null counts.
df.info()

## Checking for `NaN` values

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Fraction of missing values per column (mean of the boolean NaN mask).
df.isna().mean()

In [None]:
# Check whether `TransactionID` contains no duplicates.
df["TransactionID"].is_unique

## Errors?

In [None]:
# Keep the raw `Errors?` column under a dedicated name; it is the basis of the candidate label below.
errors_series = df["Errors?"]
errors_series

In [None]:
# Summary of the error column via the project's categorical `describe` helper.
describe_cat(errors_series)

In [None]:
# Frequency of each distinct error value (missing values are excluded by default).
errors_series.value_counts()

In [None]:
# Number of transactions that carry an error value, i.e. non-missing entries.
errors_series.notna().sum()

In [None]:
# Share of transactions that carry an error value, i.e. non-missing entries.
errors_series.notna().mean()

It seems that around $1.58\%$ of the transactions have at least one type of error.

In [None]:
# Value counts through the project helper from `eda.stats.categorical` (output format is defined there).
get_value_counts(errors_series)

In [None]:
# Distinct values found by the project helper `get_unique_values`.
# NOTE(review): presumably the individual error types across all entries — confirm against helpers.series_list.
unique_errors = get_unique_values(errors_series)
unique_errors

In [None]:
# Bag-of-words representation of the error column built by the project helper.
# NOTE(review): presumably one indicator/count column per error type — confirm against helpers.series_list.
bag_of_words_errors = bag_of_words_series(errors_series)
bag_of_words_errors

In [None]:
# Per-row element count from the project helper `get_n_elements`.
# NOTE(review): presumably the number of errors per transaction — confirm against helpers.series_list.
n_errors_series = get_n_elements(errors_series)
n_errors_series.head()

In [None]:
# Binary indicator: 1 where the per-row error count is positive, 0 otherwise.
# `np.where` returns a bare array, so the index of `n_errors_series` is passed
# on explicitly; a fresh 0..n-1 index could silently mismatch rows when the
# indicator is later assigned to a DataFrame column (assignment aligns on index).
errors_indicator_series = pd.Series(
    np.where(n_errors_series > 0, 1, 0),
    index=n_errors_series.index,
)
errors_indicator_series

In [None]:
# Store the 0/1 error indicator as the provisional label column used as `hue` in the plots below.
df["candidate_label"] = errors_indicator_series

We notice that, out of all transactions that have at least one error, `Insufficient balance` is the prevalent category (it also appears in the combined categories that contain two or more kinds of errors).

# Create a datetime field from `Year`, `Month`, `Day` 

In [None]:
# Assemble the calendar date from its components in one vectorised call.
# `pd.to_datetime` accepts a frame with `year`/`month`/`day` columns, which
# avoids building one Python `datetime` per row with `df.apply(axis=1)`.
df["date"] = pd.to_datetime(
    df[["Year", "Month", "Day"]].rename(
        columns={"Year": "year", "Month": "month", "Day": "day"}
    )
)

# Distributions <a id="distributions"></a>
- [Table of contents](#table_of_contents)

## Card

In [None]:
# Feature examined in this section; the plotting cell below reuses `feature_name`.
feature_name = "Card"
describe_cat(df[feature_name])

In [None]:
# Per-class distribution of the feature: each `candidate_label` group is
# normalised on its own (`common_norm=False`) and drawn side by side.
ax = sns.histplot(
    data=df,
    x=feature_name,
    hue="candidate_label",
    common_norm=False,
    multiple="dodge",
    stat="percent",
)

# Label the axes the same way as the other per-feature histograms in this notebook.
ax.set_ylabel("Percentage")
ax.set_xlabel(feature_name)

# Rotate the x tick labels; the trailing semicolon hides the returned tick objects.
plt.xticks(rotation=80);


## Time related (cyclical) features

### Year

### Month

In [None]:
# Abbreviated month names in calendar order, used as tick labels for month codes 1-12.
MONTHS = "Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec".split()

In [None]:
feature_name = "Month"

# Months are integer codes (ticks 1-12), so use discrete bins: one unit-wide
# bin centred on each value. With `bins=24` over the 1-12 range the bins were
# 11/24 wide and the bars did not line up with the integer tick positions.
ax = sns.histplot(
    data=df,
    x=feature_name,
    hue="candidate_label",
    common_norm=False,
    multiple="dodge",
    stat="percent",
    discrete=True,
)

# One tick per month, labelled with its abbreviated name.
ax.set_xticks(np.arange(1, 13, 1))
ax.set_xticklabels(MONTHS)

ax.set_ylabel("Percentage")
ax.set_xlabel(f"{feature_name}");



### Day of week

In [None]:
# Tick labels for the day-of-week codes, listed Monday first.
DAYS = "Mon Tue Wed Thu Fri Sat Sun".split()

# Integer weekday derived from the `date` column.
weekday_codes = df["date"].dt.dayofweek
df["day_of_week"] = weekday_codes

In [None]:
feature_name = "day_of_week"

# Weekdays are integer codes (ticks 0-6), so use discrete bins: one unit-wide
# bin centred on each value. With `bins=14` over the 0-6 range the bars did not
# line up with the integer tick positions.
ax = sns.histplot(
    data=df,
    x=feature_name,
    hue="candidate_label",
    common_norm=False,
    multiple="dodge",
    stat="percent",
    discrete=True,
)

# One tick per weekday, labelled with its abbreviated name.
ax.set_xticks(np.arange(0, 7, 1))
ax.set_xticklabels(DAYS)

ax.set_ylabel("Percentage")
ax.set_xlabel(f"{feature_name}");


### Hour

In [None]:
# `Time` is parsed with an "HH:MM" format; only the hour component is kept.
TIME_FORMAT = "%H:%M"
# Vectorised parse instead of calling `strptime` once per row through `apply`.
df["hour"] = pd.to_datetime(df["Time"], format=TIME_FORMAT).dt.hour

In [None]:
feature_name = "hour"

# Hours are integer codes (ticks 0-23), so use discrete bins: one unit-wide bin
# centred on each value. With `bins=48` over the 0-23 range the bars did not
# line up with the integer tick positions.
ax = sns.histplot(
    data=df,
    x=feature_name,
    hue="candidate_label",
    common_norm=False,
    multiple="dodge",
    stat="percent",
    discrete=True,
)
ax.set_xticks(np.arange(0, 24, 1))
ax.set_ylabel("Percentage")
ax.set_xlabel("Time (Hour) in a Day");


We can see that at hours 7, 14, 15, 17 and 19 there are more transactions (as a ratio) related to some kind of error.

## Amount

In [None]:
# Feature examined in this section; the cells below reuse `feature_name`.
feature_name = "Amount"

In [None]:
# Convert the amount strings to floats by dropping the first character
# (presumably a currency symbol — confirm against the raw data).
# The dtype guard makes the cell safe to re-run once the column is numeric,
# which the former bare `except: pass` handled by hiding every error,
# including real parsing failures.
if not pd.api.types.is_numeric_dtype(df[feature_name]):
    df[feature_name] = df[feature_name].str[1:].astype(float)

In [None]:
# Summary of the numeric amounts via the project's continuous `describe` helper, with a four-decimal format string.
describe_continuous(df[feature_name], fmt=".4f")

In [None]:
# Rows whose amount is strictly negative.
is_negative_amount = df["Amount"].lt(0)
negative_amounts_df = df[is_negative_amount]

In [None]:
# Per-class distribution of the negative amounts only, each class normalised separately.
sns.histplot(
    data=negative_amounts_df,
    x=feature_name,
    hue="candidate_label",
    stat="percent",
    multiple="dodge",
    common_norm=False,
)

There is an indication that transactions with an amount between -100 and 0 are more likely to report an error.

In [None]:
# Zoom in on amounts within [-100, 0] (both bounds included by `between`).
sns.histplot(
    data=df.loc[df[feature_name].between(-100, 0)],
    x=feature_name,
    hue="candidate_label",
    stat="percent",
    multiple="dodge",
    common_norm=False,
)

In [None]:
# 99th percentile of the amounts.
np.percentile(df[feature_name].to_numpy(), q=99)

## Use Chip

In [None]:
feature_name = "Use Chip"

# Per-class distribution of the feature, each class normalised separately
# and drawn side by side.
ax = sns.histplot(
    df,
    x=feature_name,
    hue="candidate_label",
    stat="percent",
    bins=3,
    multiple="dodge",
    common_norm=False,
)

ax.set_ylabel("Percentage")
ax.set_xlabel(feature_name)

### Merchant Name

In [None]:
# Feature examined in this section; the cells below reuse `feature_name`.
feature_name = "Merchant Name"

In [None]:
# Summary of the feature via the project's categorical `describe` helper.
describe_cat(df[feature_name])

In [None]:
# Value counts of the feature through the project helper.
get_value_counts(df[feature_name])

In [None]:
# NOTE(review): redundant re-assignment — `feature_name` already holds this value from earlier in this section.
feature_name = "Merchant Name"

In [None]:
# Bind the frame and the error indicator once, so that every later section
# only has to pass the feature name.
# NOTE(review): `_get_contributions_of_features` is a private (underscore) helper
# of `eda.stats.categorical`; consider exposing a public entry point there.
get_contributions_of_features = partial(
    _get_contributions_of_features,
    df=df,
    errors_indicator_series=errors_indicator_series,
)
# Pass `feature_name` (set to "Merchant Name" above) instead of repeating the
# literal, consistent with the calls in the following sections.
get_contributions_of_features(feature_name)

<div class='alert alert-info'>
    
We want to examine which specific `Merchant Name` values appear more often in transactions with errors than in transactions without. To do so, we evaluate the contribution (ratio) of each Merchant Name among transactions with errors and among transactions without errors.
    
</div>

### Merchant City

In [None]:
# Feature examined in this section; the cells below reuse `feature_name`.
feature_name = "Merchant City"

In [None]:
# Summary of the feature via the project's categorical `describe` helper.
describe_cat(df[feature_name])

In [None]:
# Value counts of the feature through the project helper.
get_value_counts(df[feature_name])

In [None]:
# Contribution of each value of the feature, using the partial bound to `df` and the error indicator in the Merchant Name section.
get_contributions_of_features(feature_name)

We want to examine which specific `Merchant City` values appear more often in transactions with errors than in transactions without. To do so, we evaluate the contribution (ratio) of each Merchant City among transactions with errors and among transactions without errors. We notice that ONLINE transactions tend to be associated a bit more with transactions with errors, but the ratio is still quite low.

## Merchant State

In [None]:
# Feature examined in this section; the cells below reuse `feature_name`.
feature_name = "Merchant State"

In [None]:
# Summary of the feature via the project's categorical `describe` helper.
describe_cat(df[feature_name])

In [None]:
# Contribution of each value of the feature, using the partial bound to `df` and the error indicator in the Merchant Name section.
get_contributions_of_features(feature_name)

We want to examine which specific `Merchant State` values appear more often in transactions with errors than in transactions without. To do so, we evaluate the contribution (ratio) of each Merchant State among transactions with errors and among transactions without errors. We notice that ONLINE tends to be associated a bit more with transactions with errors, but the ratio is still quite low.

## MCC

In [None]:
# Feature examined in this section; the cell below reuses `feature_name`.
feature_name = "MCC"

In [None]:
# Contribution of each value of the feature, using the partial bound to `df` and the error indicator in the Merchant Name section.
get_contributions_of_features(feature_name)

# <font color='green'> TEST </font>