## Configure Notebook

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import json
from IPython.display import Markdown

## Load & Preview Data

In [2]:
# Read the dataset's metadata file and render a readable summary of it:
# title, subtitle, keyword list, description, and a table of the fields.
# JSON is UTF-8 by specification and the description contains non-ASCII
# text, so the encoding is pinned instead of using the platform default.
with open('../data/raw/dataset-metadata.json', 'rt', encoding='utf-8') as f:
    meta = json.load(f)

Markdown('\n'.join([
    f"# {meta['title']}",
    '',
    f"{meta['subtitle']}",
    '',
    "**Keywords:**",
    '\n'.join(f"- {kw}" for kw in meta['keywords']),
    '',
    f"{meta['description']}",
    '',
    "## Fields",
    '',
    "| Name | Type | Description |",
    "| ---- | ---- | ----------- |",
    # One table row per field of the first resource's schema; a falsy
    # title (None or empty) is rendered as an empty cell.
    '\n'.join(f"| {fld['name']} | {fld['type']} | {fld['title'] or ''} |"
              for fld in meta['resources'][0]['schema']['fields']),
]))

# Credit Card Fraud Detection

Anonymized credit card transactions labeled as fraudulent or genuine

**Keywords:**
- finance
- crime
- medium
- featured

Context
---------

It is important that credit card companies are able to recognize fraudulent credit card transactions so that customers are not charged for items that they did not purchase.

Content
---------

The datasets contains transactions made by credit cards in September 2013 by european cardholders. 
This dataset presents transactions that occurred in two days, where we have 492 frauds out of 284,807 transactions. The dataset is highly unbalanced, the positive class (frauds) account for 0.172% of all transactions.

It contains only numerical input variables which are the result of a PCA transformation. Unfortunately, due to confidentiality issues, we cannot provide the original features and more background information about the data. Features V1, V2, ... V28 are the principal components obtained with PCA, the only features which have not been transformed with PCA are 'Time' and 'Amount'. Feature 'Time' contains the seconds elapsed between each transaction and the first transaction in the dataset. The feature 'Amount' is the transaction Amount, this feature can be used for example-dependant cost-senstive learning. Feature 'Class' is the response variable and it takes value 1 in case of fraud and 0 otherwise. 

Inspiration
---------

Identify fraudulent credit card transactions.

Given the class imbalance ratio, we recommend measuring the accuracy using the Area Under the Precision-Recall Curve (AUPRC). Confusion matrix accuracy is not meaningful for unbalanced classification.

Acknowledgements
---------

The dataset has been collected and analysed during a research collaboration of Worldline and the Machine Learning Group (http://mlg.ulb.ac.be) of ULB (Université Libre de Bruxelles) on big data mining and fraud detection.
More details on current and past projects on related topics are available on http://mlg.ulb.ac.be/BruFence and http://mlg.ulb.ac.be/ARTML

Please cite: Andrea Dal Pozzolo, Olivier Caelen, Reid A. Johnson and Gianluca Bontempi. Calibrating Probability with Undersampling for Unbalanced Classification. In Symposium on Computational Intelligence and Data Mining (CIDM), IEEE, 2015





## Fields

| Name | Type | Description |
| ---- | ---- | ----------- |
| Time | Numeric | Number of seconds elapsed between this transaction and the first transaction in the dataset |
| V1 | Numeric |  |
| V2 | Numeric |  |
| V3 | Numeric |  |
| V4 | Numeric |  |
| V5 | Numeric |  |
| V6 | Numeric |  |
| V7 | Numeric |  |
| V8 | Numeric |  |
| V9 | Numeric |  |
| V10 | Numeric |  |
| V11 | Numeric |  |
| V12 | Numeric |  |
| V13 | Numeric |  |
| V14 | Numeric |  |
| V15 | Numeric |  |
| V16 | Numeric |  |
| V17 | Numeric |  |
| V18 | Numeric |  |
| V19 | Numeric |  |
| V20 | Numeric |  |
| V21 | Numeric |  |
| V22 | Numeric |  |
| V23 | Numeric |  |
| V24 | Numeric |  |
| V25 | Numeric |  |
| V26 | Numeric |  |
| V27 | Numeric |  |
| V28 | Numeric | abc |
| Amount | Numeric | Transaction amount |
| Class | Boolean | 1 for fraudulent transactions, 0 otherwise |

In [3]:
# Load the raw transaction table, then show pandas' default summary
# statistics for it as the cell output.
raw = pd.read_csv(filepath_or_buffer='../data/raw/creditcard.csv')
raw.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,1.16598e-15,3.416908e-16,-1.37315e-15,2.086869e-15,9.604066e-16,1.490107e-15,-5.556467e-16,1.177556e-16,-2.406455e-15,...,1.656562e-16,-3.44485e-16,2.578648e-16,4.471968e-15,5.340915e-16,1.687098e-15,-3.666453e-16,-1.220404e-16,88.349619,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.5,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84692.0,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.5,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


**Missing Values**  
There are no missing values. Whether this is because of the nature of the data / how the data was collected or because rows with missing values were removed is unknown.

**Data Types**  
The anonymous variables all appear to be continuous quantities, since each of them takes decimal values.

**Central Tendency**  
The anonymous variables appear to have already been centered: each has a mean of zero, up to what is likely numerical error.

**Spread**  
The standard deviation of the anonymous variables ranges from ~1.96 (V1) down to ~0.33 (V28). The anonymous variables are numbered in descending order of standard deviation.

In [4]:
# Break the largest 'Time' value (in whole seconds) into days / hours /
# minutes / seconds by peeling off one unit at a time.
total_seconds = raw['Time'].astype(int).max()
total_minutes, seconds = total_seconds // 60, total_seconds % 60
total_hours, minutes = total_minutes // 60, total_minutes % 60
days, hours = total_hours // 24, total_hours % 24

display(Markdown("**Time Span of Data Set**"))
# Single unnamed column, one row per time unit.
pd.DataFrame(
    {'': [days, hours, minutes, seconds]},
    index=['days', 'hours', 'minutes', 'seconds'],
)

**Time Span of Data Set**

Unnamed: 0,Unnamed: 1
days,1
hours,23
minutes,59
seconds,52


## Structure DataFrame & Graph Basic Timeline

In [5]:
# Build the working frame: index the rows by elapsed time and replace the
# numeric 'Class' label with two complementary boolean columns.
df = raw.set_index('Time')
# 'Time' holds seconds; pd.to_timedelta is the supported way to convert with
# a unit (the `unit` argument of the TimedeltaIndex constructor is deprecated).
df.index = pd.to_timedelta(df.index, unit='s')
df['Fraudulent'] = df['Class'].astype(bool)
df['Legitimate'] = ~df['Fraudulent']
df = df.drop(columns=['Class'])

# Release the raw frame. NOTE: after this, cells that read `raw` (including
# this one) need the CSV to be loaded again before they can be re-run.
del raw

df.info()

<class 'pandas.core.frame.DataFrame'>
TimedeltaIndex: 284807 entries, 0 days 00:00:00 to 1 days 23:59:52
Data columns (total 31 columns):
V1            284807 non-null float64
V2            284807 non-null float64
V3            284807 non-null float64
V4            284807 non-null float64
V5            284807 non-null float64
V6            284807 non-null float64
V7            284807 non-null float64
V8            284807 non-null float64
V9            284807 non-null float64
V10           284807 non-null float64
V11           284807 non-null float64
V12           284807 non-null float64
V13           284807 non-null float64
V14           284807 non-null float64
V15           284807 non-null float64
V16           284807 non-null float64
V17           284807 non-null float64
V18           284807 non-null float64
V19           284807 non-null float64
V20           284807 non-null float64
V21           284807 non-null float64
V22           284807 non-null float64
V23           284807 non-n

In [7]:
# Rolling window length: 1/200th (0.5%) of the time span covered by the index.
span = df.index.max() - df.index.min()
ofs = span // 200
# Sum of each boolean flag column over the time-based rolling window.
df_winsum = df[['Fraudulent', 'Legitimate']].rolling(window=ofs).sum()

In [None]:
# Both rolling-sum series on one shared axis.
df_winsum.plot(figsize=(10, 5), title='Rolling Count of Transactions by Type');

In [None]:
# Divide every column by its own maximum so both series peak at 1; the
# y ticks are hidden because the rescaled values carry no absolute meaning.
(df_winsum
 .div(df_winsum.max(), axis='columns')
 .plot(figsize=(10, 5),
       yticks=(),
       title='Scaled Rolling Count of Transactions by Type'));

In [None]:
# Running totals of the two boolean flag columns, in index order.
df_cumsum = df.loc[:, ['Fraudulent', 'Legitimate']].cumsum()

In [None]:
# Both cumulative series on one shared axis.
df_cumsum.plot(figsize=(10, 5), title='Cumulative Count of Transactions by Type');

In [None]:
# Divide every column by its own maximum so both curves end at 1; the
# y ticks are hidden because the rescaled values carry no absolute meaning.
(df_cumsum
 .div(df_cumsum.max(), axis='columns')
 .plot(figsize=(10, 5),
       yticks=(),
       title='Scaled Cumulative Count of Transactions by Type'));

## Explore Clusters & Correlations

In [None]:
# NOTE(review): this cell only constructs a FeatureAgglomeration with its
# default arguments and displays it; nothing is fitted and the instance is
# not stored. TODO: either fit it on the feature columns or remove the cell.
from sklearn.cluster import FeatureAgglomeration

FeatureAgglomeration()

In [None]:
from sklearn.decomposition import PCA

# Fit a PCA on every column except the two boolean label columns, then
# chart the share of variance explained by each component.
pca = PCA()
fit = pca.fit(df.drop(columns=['Fraudulent', 'Legitimate']))
evr = fit.explained_variance_ratio_

# One bar per component, positioned at the component's index.
plt.bar(np.arange(len(evr)), evr)
plt.show()

### Investigate 'Amount'

In [None]:
# Name of the feature with the largest loading on the first principal
# component. The sign of a component vector is arbitrary, so loadings are
# compared by magnitude. Labels are taken from the columns the PCA was
# fitted on (the frame without the two boolean label columns).
fitted_columns = df.columns.drop(['Fraudulent', 'Legitimate'])
fitted_columns[np.argmax(np.abs(fit.components_[0]))]

In [None]:
# Histogram of 'Amount' over its full range, with matplotlib's default bins.
plt.hist(df['Amount'])
plt.title('Histogram of \'Amount\'')
plt.show()

In [None]:
amt = df['Amount']

# Three stacked histograms, each keeping only the amounts strictly below
# the given quantile so the bulk of the distribution becomes visible.
fig, axes = plt.subplots(3, 1)
fig.set_figheight(15)
for ax, qnt in zip(axes, [0.997, 0.95, 0.68]):
    cutoff = np.quantile(amt, qnt, axis=None)
    ax.hist(amt[amt < cutoff])
    ax.set_title(f'Histogram of \'Amount\' Through Quantile {qnt}')

plt.show()

### Continue Exploration

In [None]:
# Repeat the PCA with 'Amount' excluded as well as the two label columns,
# then chart the share of variance explained by each component.
pca2 = PCA()
fit2 = pca2.fit(df.drop(columns=['Fraudulent', 'Legitimate', 'Amount']))
evr2 = fit2.explained_variance_ratio_

# One bar per component, positioned at the component's index.
plt.bar(np.arange(len(evr2)), evr2)
plt.show()

In [None]:
# Cumulative explained-variance chart for the second PCA. The bars at which
# the running total first reaches each threshold in `qnts` are highlighted.
evr2_cumsum = evr2.cumsum()

qnts = [0.68, 0.95, 0.997]
# Index of the first component whose cumulative ratio reaches each threshold.
x_idxs = [np.argwhere(evr2_cumsum >= qnt).min() for qnt in qnts]
# Mean bar height, rounded to two decimals; shown on the chart as "AUC".
auc = round((evr2_cumsum.sum() / evr2_cumsum.shape[0])*100)/100
X = list(range(len(evr2_cumsum)))

plt.bar(X, evr2_cumsum, color=['C9' if idx in x_idxs else 'C0' for idx in X])
plt.title('Cumulative Explained Variance Ratio\nby Principal Component')
# Tick marks only at the highlighted components and at the thresholds.
plt.xticks(x_idxs)
plt.yticks(qnts)
plt.grid(True, which='major', axis='y')
plt.annotate(f'AUC ≅ {auc}', (0.0,0.85))
plt.show()