In [None]:
import numpy as np
import pandas as pd
import gc
import matplotlib.pylab as plt
import seaborn as sns

# 1. Descriptive Statistics

## 1.1 Load Datasets

In [None]:
# Raw input tables: one transaction file and one identity file per split.
train_transaction, train_identity = (
    pd.read_csv('input/train_transaction.csv'),
    pd.read_csv('input/train_identity.csv'),
)

test_transaction, test_identity = (
    pd.read_csv('input/test_transaction.csv'),
    pd.read_csv('input/test_identity.csv'),
)

Reduce memory usage, since the raw datasets would otherwise take up a lot of memory.

In [None]:
## Function to reduce the DF size
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    Integer columns are narrowed to int8/int16/int32/int64 and float columns to
    float16/float32/float64, chosen from each column's minimum and maximum.
    Columns whose dtype is not listed in ``numerics`` are left untouched.

    The target dtype is selected from the value range only, so float columns
    narrowed to float16/float32 keep their range but lose precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with its numeric columns downcast.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes, narrowest first; the first one whose limits contain the
    # column's [min, max] wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype limit still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # float64 is the fallback (also reached when min/max are NaN or inf).
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a frame that reports zero memory.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
## Reducing memory of train sets
train_transaction, train_identity = [
    reduce_mem_usage(table) for table in (train_transaction, train_identity)
]

In [None]:
## Reducing memory of test sets
test_transaction, test_identity = [
    reduce_mem_usage(table) for table in (test_transaction, test_identity)
]

## 1.2 Preparing Data

Merge *train_transaction* with *train_identity*, and *test_transaction* with *test_identity*.

In [None]:
# Left-join identity columns onto every transaction row, keyed by TransactionID.
train = train_transaction.merge(train_identity, how='left', on='TransactionID')
test = test_transaction.merge(test_identity, how='left', on='TransactionID')

Data description of *train* and *test*

In [99]:
# Print the index range, column count, dtype breakdown and memory use of each merged frame.
for _frame in (train, test):
    _frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 590540 entries, 0 to 590539
Columns: 434 entries, TransactionID to DeviceInfo
dtypes: float16(354), float32(45), int16(1), int32(2), int8(1), object(31)
memory usage: 650.5+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 506691 entries, 0 to 506690
Columns: 433 entries, TransactionID to DeviceInfo
dtypes: float16(346), float32(53), int16(1), int32(2), object(31)
memory usage: 565.4+ MB


output the new csv files

In [101]:
# index=False: the frames carry only the default integer index, and writing it
# would come back from pd.read_csv as an extra unnamed column.
train.to_csv('data/train.csv', index=False)
test.to_csv('data/test.csv', index=False)

In [None]:
# The merged frames are all that is needed from here on: drop the four source
# tables and ask the garbage collector to reclaim their memory.
del train_transaction, train_identity
del test_transaction, test_identity
gc.collect()

## 1.3 View Datasets

- load datasets

In [None]:
# Shortcut for a fresh session: read the merged files written in 1.2.
# If you have run 1.1 and 1.2, you don't need to run this cell
train, test = pd.read_csv('data/train.csv'), pd.read_csv('data/test.csv')

- train

In [None]:
# Lift the column display limit (a global pandas option, so later cells are
# affected too), then render the training frame.
pd.set_option('display.max_columns', None)
train

- test

In [None]:
# Render the test frame.
test

## 1.4 Shape of Datasets

In [None]:
# (rows, columns) of each frame, one per line.
print(
    'train shape is {}'.format(train.shape),
    'test shape is {}'.format(test.shape),
    sep='\n',
)

## 1.5 Features Overview
### 1.5.1 types of features

In [None]:
# dtype of every training column. Ending the cell with the Series displays it;
# assigning alone produced no output.
# Tip: pd.set_option('display.max_rows', None) shows every row untruncated.
sm = train.dtypes
sm

In [None]:
# Show one decimal place in rendered frames (global option), then summarise
# the training columns with describe().
pd.set_option("display.precision", 1)
train.describe()

### 1.5.2 Statistical Description and Boxplots
#### 1.5.2.1 TransactionDT

In [None]:
# Report the storage dtype of the TransactionDT column.
print('type of TransactionDT is ' + str(train.TransactionDT.dtype))

> The *TransactionDT* feature is a timedelta from a given reference datetime (not an actual timestamp).

In [None]:
# Side-by-side summary statistics of TransactionDT for the train and test sets.
train_dt, test_dt = train['TransactionDT'], test['TransactionDT']
sm = pd.DataFrame(
    [
        ['count', train_dt.count(), test_dt.count()],
        ['min', train_dt.min(), test_dt.min()],
        ['max', train_dt.max(), test_dt.max()],
        ['range', train_dt.max() - train_dt.min(), test_dt.max() - test_dt.min()],
        # Mid-range is the midpoint of the extremes, (max + min) / 2 -- not half the range.
        ['mid-range', (train_dt.max() + train_dt.min()) / 2, (test_dt.max() + test_dt.min()) / 2],
        ['mean', train_dt.mean(), test_dt.mean()],
        ['median', train_dt.median(), test_dt.median()],
        ['std.dev', train_dt.std(), test_dt.std()],
        ['variance', train_dt.var(), test_dt.var()],
        # mode() returns a Series (ties are possible); store plain lists so the
        # table cell holds values rather than a nested Series object.
        ['mode', train_dt.mode().tolist(), test_dt.mode().tolist()],
        ['25%', train_dt.quantile(0.25), test_dt.quantile(0.25)],
        ['50%', train_dt.quantile(0.50), test_dt.quantile(0.50)],
        ['75%', train_dt.quantile(0.75), test_dt.quantile(0.75)],
        ['IQR', train_dt.quantile(0.75) - train_dt.quantile(0.25), test_dt.quantile(0.75) - test_dt.quantile(0.25)],
    ],
    columns=['property', 'value_of_train', 'value_of_test']
)
sm

In [None]:
# Horizontal box plot of the training TransactionDT values.
train.boxplot(column=['TransactionDT'], vert=False)
plt.show()

In [None]:
# Smallest and largest TransactionDT in each split.
print("train: min = ", train.TransactionDT.min(), ", train: max = ", train.TransactionDT.max(), sep='')
print("test: min = ", test.TransactionDT.min(), ", test: max = ", test.TransactionDT.max(), sep='')

In [None]:
# Distance from the earliest train TransactionDT to the latest test TransactionDT,
# divided by 60 * 60 * 24 (i.e. treating TransactionDT as seconds) to express it in days.
time_span = (test['TransactionDT'].max() - train['TransactionDT'].min()) / 60 / 60 / 24
print("time span of the total datasets is: ", round(time_span), 'days', sep='')

#### 1.5.2.2 ProductCD

In [None]:
# Report the storage dtype of the ProductCD column.
print('type of ProductCD is ' + str(train.ProductCD.dtype))

> Products in this dataset come under five broad categories: W, H, C, S, and R.

We will visualize this feature in relation with the target, *isFraud*.

In [None]:
# TransactionAmt per product category, split by the isFraud target.
# Only rows with TransactionAmt < 500 are plotted.
fig, ax = plt.subplots(figsize=(5, 5))
sns.boxplot(
    data=train.query("TransactionAmt < 500"),
    x="ProductCD",
    y="TransactionAmt",
    hue="isFraud",
    palette=["blue", "yellow"],
    ax=ax,
)
plot = ax.set_title('TransactionAmt', fontsize=16)
plt.show()

Except for products *C* and *S*, the median, first quartile, and third quartile of *TransactionAmt* are higher for non-fraudulent transactions than for fraudulent ones.

#### 1.5.2.3 TransactionAmt

In [None]:
# Report the storage dtype of the TransactionAmt column.
print('type of TransactionAmt is ' + str(train.TransactionAmt.dtype))

This is the amount of money transferred during the transaction, which is a continuous variable. We will visualize this feature in relation with the target - *isFraud*.

In [None]:
# Summary statistics of TransactionAmt on the training set.
amt = train['TransactionAmt']
sm = pd.DataFrame(
    [
        ['count', amt.count()],
        ['min', amt.min()],
        ['max', amt.max()],
        # Range uses the minimum of the same column (previously it subtracted
        # the TransactionDT minimum by mistake).
        ['range', amt.max() - amt.min()],
        # Mid-range is the midpoint of the extremes, (max + min) / 2 -- not half the range.
        ['mid-range', (amt.max() + amt.min()) / 2],
        ['mean', amt.mean()],
        ['median', amt.median()],
        ['std.dev', amt.std()],
        ['variance', amt.var()],
        # mode() returns a Series (ties are possible); store a plain list so the
        # table cell holds values rather than a nested Series object.
        ['mode', amt.mode().tolist()],
        ['25%', amt.quantile(0.25)],
        ['50%', amt.quantile(0.50)],
        ['75%', amt.quantile(0.75)],
        ['IQR', amt.quantile(0.75) - amt.quantile(0.25)],
    ],
    columns=['property', 'value']
)
sm

In [None]:
# TransactionAmt for each value of the isFraud target.
# Only rows with TransactionAmt < 500 are plotted.
fig, ax = plt.subplots(figsize=(5, 5))
sns.boxplot(
    data=train.query("TransactionAmt < 500"),
    x="isFraud",
    y="TransactionAmt",
    palette=["blue", "yellow"],
    ax=ax,
)
plot = ax.set_title('TransactionAmt', fontsize=16)
plt.show()

The above box plot suggests that transactions with higher amounts are more likely to be fraudulent. The median (the line inside the box) of the yellow box is higher than that of the blue box. In addition, the third quartile of the yellow box is significantly higher than that of the blue box, even though the first quartiles of the two distributions are very similar. This difference is further evidence that the higher the transaction amount, the more likely the transaction is to be fraudulent.

#### 1.5.2.4 card brand (card4)

In [None]:
# Report the storage dtype of the card4 column.
print('type of card4 is ' + str(train.card4.dtype))

The *card4* feature is the brand of the card: discover, mastercard, visa, or american express.

In [None]:
# TransactionAmt per card brand, split by the isFraud target so that fraudulent
# and legitimate transactions can be compared within each brand. The two-colour
# palette maps onto the two hue levels.
# Only rows with TransactionAmt < 500 are plotted.
fig, ax = plt.subplots(figsize=(5, 5))
sns.boxplot(
    data=train.query("TransactionAmt < 500"),
    x="TransactionAmt",
    y="card4",
    hue="isFraud",
    palette=["blue", "yellow"],
    ax=ax,
)
plot = ax.set_title('TransactionAmt', fontsize=16)
plt.show()

The distributions for fraudulent and legitimate Mastercard and Visa cards are remarkably similar. Furthermore, it is evident from this box plot that Discover and American Express cards are frequently used for larger transaction amounts: their medians are far higher than those of the other two card brands.

#### 1.5.2.5 Card type (card6)

In [None]:
# Report the storage dtype of the card6 column.
print('type of card6 is ' + str(train.card6.dtype))

The *card6* feature is the type of the card, such as credit or debit.

In [None]:
# TransactionAmt per card type (card6).
# Only rows with TransactionAmt < 500 are plotted.
fig, ax = plt.subplots(figsize=(5, 5))
sns.boxplot(
    data=train.query("TransactionAmt < 500"),
    x="TransactionAmt",
    y="card6",
    palette=["blue", "yellow"],
    ax=ax,
)
plot = ax.set_title('TransactionAmt', fontsize=16)
plt.show()

The box plot above shows that the typical (median) transaction amount for credit cards is much higher than for debit cards.