### Fraud Test data 

Use only logistic regression.
Data from https://www.kaggle.com/datasets/kartik2112/fraud-detection

In [1]:
import torch
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt

In [None]:
'''
data = pd.read_csv('ieee-fraud-detection/train_transaction.csv')
data.shape # (590540, 394)
data.to_pickle('ieee_pickled.pkl')
'''

In [2]:
data = pd.read_pickle('ieee_pickled.pkl')

In [None]:
data.shape # size (590540, 394), last one is label

In [3]:
labels = data.iloc[:, 1]
n_positive = sum(labels)
print(n_positive)
print('percent positive = ', n_positive/len(labels))

20663
percent positive =  0.03499000914417313


### About missing values

In [None]:
nan_count = data.isna().sum()
plt.plot(nan_count)
plt.yscale('log')
plt.ylabel('number of missing values'), plt.xlabel('features')
plt.show()

In [None]:
bad_col = data.columns[nan_count> 1e5]
print(bad_col)
good_col = data.columns[nan_count < 100] 
print(len(good_col))

there are 19 features with no missing values, 52 features with fewer than 100 missing values, 94 features with fewer than 1000 missing values.<br>
In the case of < 100 missing values, all columns with missing values share the same 12 indices. 2 of these are fraud.<br>
In the case of < 1000 missing values, the missing values lie in 326 different rows. Among these, 19 of them are fraud. This ratio is a bit high.

In [None]:
total_missing = []
for col in good_col:
    missing_idx = data.index[data[col].isna()].tolist()
    total_missing.extend(missing_idx)

print('number of rows with missing values', len(set(total_missing)))    
print('fraudulent', sum(data.isFraud[list(set(total_missing))]))

data_nomissing = data.drop(set(total_missing))
data_label = data_nomissing['isFraud']
data_features = data_nomissing.drop(columns=['isFraud'])

### Keeping all features
Split into numerical and categorial columns. 
For categorical, use one hot or 1/0.
For numerical, normalize. 

In [4]:
data_label = data.isFraud
data_features = data.drop(columns=['isFraud'])

# Numerical columns
num_cols = data_features.select_dtypes(include=['int64', 'float64']).columns.tolist()
print("Numerical columns:", num_cols)

# Categorical columns
cat_cols = data_features.select_dtypes(include=['object', 'category']).columns.tolist()
print("Categorical columns:", cat_cols)

Numerical columns: ['TransactionID', 'TransactionDT', 'TransactionAmt', 'card1', 'card2', 'card3', 'card5', 'addr1', 'addr2', 'dist1', 'dist2', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10', 'D11', 'D12', 'D13', 'D14', 'D15', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51', 'V52', 'V53', 'V54', 'V55', 'V56', 'V57', 'V58', 'V59', 'V60', 'V61', 'V62', 'V63', 'V64', 'V65', 'V66', 'V67', 'V68', 'V69', 'V70', 'V71', 'V72', 'V73', 'V74', 'V75', 'V76', 'V77', 'V78', 'V79', 'V80', 'V81', 'V82', 'V83', 'V84', 'V85', 'V86', 'V87', 'V88', 'V89', 'V90', 'V91', 'V92', 'V93', 'V94', 'V95', 'V96', 'V97', '

### Preprocessing: categorical features
Among categorical columns, ProductCD has 5 values, card4 has 4, card6 has 4, ------> one hot <br>
P_emaildomain, R_emaildomain have a lot (59 and 60)<br> 
M1-M9 except M4 are True False --------> convert to 1/0 <br>
M4 has values M0, M2, M1 ----------> numerical 0, 2, 1

In [None]:
#for col in cat_cols:
#    print(data[col].nunique())

In [5]:
features = data_features.copy()

In [6]:
for col in cat_cols: 
    if col[0] == 'M': 
        if col[1] != 4:
            features[col] = features[col].map({'T': 1, 'F': 0})
        else:
            features[col] = features[col].map({'M0': 0, 'M1': 1, 'M2':2})
    else:
        features = pd.get_dummies(features, columns=[col], dtype=int)

features.head

<bound method NDFrame.head of         TransactionID  TransactionDT  TransactionAmt  card1  card2  card3  \
0             2987000          86400           68.50  13926    NaN  150.0   
1             2987001          86401           29.00   2755  404.0  150.0   
2             2987002          86469           59.00   4663  490.0  150.0   
3             2987003          86499           50.00  18132  567.0  150.0   
4             2987004          86506           50.00   4497  514.0  150.0   
...               ...            ...             ...    ...    ...    ...   
590535        3577535       15811047           49.00   6550    NaN  150.0   
590536        3577536       15811049           39.50  10444  225.0  150.0   
590537        3577537       15811079           30.95  12037  595.0  150.0   
590538        3577538       15811088          117.00   7826  481.0  150.0   
590539        3577539       15811131          279.95  15066  170.0  150.0   

        card5  addr1  addr2  dist1  ...  R_em

### Preprocessing: scaling and filling in Nan with mean

In [8]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [12]:
scaler = MinMaxScaler()
features_normalized=features.copy()
features_normalized[num_cols] = scaler.fit_transform(features[num_cols])

In [13]:
features_normalized.head()

Unnamed: 0,TransactionID,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,...,R_emaildomain_web.de,R_emaildomain_windstream.net,R_emaildomain_yahoo.co.jp,R_emaildomain_yahoo.co.uk,R_emaildomain_yahoo.com,R_emaildomain_yahoo.com.mx,R_emaildomain_yahoo.de,R_emaildomain_yahoo.es,R_emaildomain_yahoo.fr,R_emaildomain_ymail.com
0,0.0,0.0,0.002137,0.743044,,0.381679,0.306569,0.488636,0.836957,0.001847,...,0,0,0,0,0,0,0,0,0,0
1,2e-06,6.359409e-08,0.0009,0.100885,0.608,0.381679,0.014599,0.511364,0.836957,,...,0,0,0,0,0,0,0,0,0,0
2,3e-06,4.387992e-06,0.00184,0.210566,0.78,0.381679,0.481752,0.522727,0.836957,0.027902,...,0,0,0,0,0,0,0,0,0,0
3,5e-06,6.295815e-06,0.001558,0.984824,0.934,0.381679,0.124088,0.854545,0.836957,,...,0,0,0,0,0,0,0,0,0,0
4,7e-06,6.740974e-06,0.001558,0.201023,0.828,0.381679,0.014599,0.727273,0.836957,,...,0,0,0,0,0,0,0,0,0,0
