# Basic Setup, Downloading data, etc.

## Imports

In [None]:
import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pylab as plt
from cycler import cycler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score


### Read in data

In [None]:
# Seed NumPy's global random number generator before any sampling or
# splitting is done further down.
np.random.seed(314159)
# Load the train/test transaction tables and the training identity table
# from the local data/ directory.
train_txn = pd.read_csv('data/train_transaction.csv')
test_txn = pd.read_csv('data/test_transaction.csv')
train_id = pd.read_csv('data/train_identity.csv')

### Split training into train/validation set

In [None]:
#_RESPONSE = 'isFraud'
#y_df = train_txn[_RESPONSE]
#x_df = train_txn
#del x_df[_RESPONSE]

#train_txn, valid_txn = train_test_split(train_txn, test_size=0.25, stratify=train_txn['isFraud'])

# Very basic EDA

In [None]:
# Switch to the ggplot style and capture its colour cycle as a plain list so
# individual plots can pick a colour by index.
plt.style.use('ggplot')
color_pal = [x['color'] for x in plt.rcParams['axes.prop_cycle']]

# Cycler rebuilt from the same colours.
prop_cycle=(cycler('color', color_pal))

In [None]:
# (rows, columns) of the training transactions table.
train_txn.shape

In [None]:
#valid_txn.shape

In [None]:
# First ten column names of the training table.
list(train_txn.columns)[:10]

In [None]:
# Columns that exist in the training table but not in the test table.
set(train_txn.columns) - set(test_txn.columns)

In [None]:
# (rows, columns) of the test transactions table.
test_txn.shape

In [None]:
# Summary statistics of the training table (pandas `describe` defaults).
train_txn.describe()

In [None]:
# Split the label column by class and report the class balance.
isFraud = train_txn[train_txn['isFraud']==1]['isFraud']
isNotFraud = train_txn[train_txn['isFraud']==0]['isFraud']
num_fraud = isFraud.count()
num_not_fraud = isNotFraud.count()
# The fraud share is formatted with '%' so the printed value really is a
# percentage, matching its label.
print('Num fraud: {}\nNum non-fraud: {}\nPercent fraud: {:.2%}'.format(
    num_fraud, num_not_fraud, num_fraud / (num_not_fraud + num_fraud)))

In [None]:
# Two-bin histogram of the label: one bar per class.
train_txn['isFraud'].hist(bins=2)

In [None]:
# Number of transactions per ProductCD value, as a bar chart.
train_txn['ProductCD'].value_counts().plot(kind='bar')

In [None]:
def list_pcnt(feature, threshold=1, data=None):
  """Print the fraud rate for each distinct value of a feature.

  Args:
    feature: name of the column whose values are summarised.
    threshold: a value is reported only when its transaction count is
      strictly greater than this number.
    data: DataFrame with ``feature``, ``isFraud`` and ``TransactionID``
      columns. When omitted, the notebook-level ``train_txn`` is used,
      looked up at call time.

  Returns:
    None. One line is printed per reported value.
  """
  if data is None:
    data = train_txn
  # Transaction count for every (feature value, isFraud) pair.
  pe = data.groupby([feature, 'isFraud'])['TransactionID'].count()
  # NaN is the only value not equal to itself, so this drops missing values.
  values = [x for x in data[feature].unique() if x == x]
  for v in values:
    det = pe[v]
    # A value may occur in only one of the two classes; count the absent
    # class as zero instead of assuming the present one is non-fraud.
    nf = det[0] if 0 in det.index else 0
    f = det[1] if 1 in det.index else 0
    total = nf + f
    if total <= threshold:
      continue
    if f:
      print("Percent", v, "is fraud:", round(100*f/(total), 2), "% of", total)
    else:
      print("Percent", v, "is fraud: 0")

In [None]:
# Fraud rate per card4 value (default count threshold).
list_pcnt('card4')

In [None]:
# Fraud rate per purchaser e-mail domain, only for domains with more than
# 1000 transactions.
list_pcnt('P_emaildomain', 1000)

In [None]:
# Same report for the R_emaildomain column.
list_pcnt('R_emaildomain', 1000)

In [None]:
# Horizontal bar chart of how many transactions fall under each ProductCD.
product_counts = train_txn.groupby('ProductCD')['TransactionID'].count()
product_counts = product_counts.sort_index()
product_counts.plot(
    kind='barh',
    figsize=(15, 3),
    title='Count of Observations by ProductCD',
    color=color_pal[1],
)
plt.show()

In [None]:
# Fraud rate per ProductCD. The mean of the 0/1 label is a fraction, so it is
# scaled by 100 to agree with the "Percentage" wording of the title.
fraud_pct_by_product = train_txn.groupby('ProductCD')['isFraud'].mean().sort_index() * 100
fraud_pct_by_product.plot(kind='barh',
                          figsize=(15, 3),
                          title='Percentage of Fraud by ProductCD')
plt.show()

In [None]:
# Side-by-side bar charts of ProductCD counts: fraud on the left, non-fraud
# on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
panels = (
    (1, 'Product Category (fraud)', color_pal[1], ax1),
    (0, 'Product Category (non-fraud)', color_pal[2], ax2),
)
for fraud_flag, panel_title, panel_color, panel_ax in panels:
    class_rows = train_txn.loc[train_txn['isFraud'] == fraud_flag]
    class_counts = class_rows['ProductCD'].value_counts()
    class_counts.plot(kind='bar',
                      title=panel_title,
                      color=panel_color,
                      xlim=(-3, 10),
                      ax=panel_ax)
plt.show()

In [None]:
#drop categorical features with high cardinality
# Remove the two e-mail domain columns (high-cardinality categoricals).
new_df = train_txn.drop(columns=['P_emaildomain', 'R_emaildomain'])

# One-hot encode the remaining categoricals in a single pass; each dummy
# column is named "<source column>_<value>".
_one_hot_cols = ['ProductCD', 'card4', 'card6']
new_df = pd.get_dummies(new_df, columns=_one_hot_cols, prefix=_one_hot_cols)

# Row count of the raw training table; only the disabled dropna step uses it.
size = train_txn.shape[0]
# Disabled: drop sparse columns, then drop any rows still containing NaN.
#new_df = new_df.dropna(axis=1, thresh=(.25 * size))
#new_df = new_df.dropna(axis=0)


def encode(truth):
  """Return 1 for the flag "T" and 0 for anything else, missing included."""
  return 1 if truth == "T" else 0


# Binary-encode the flag columns M1..M9 in place.
for i in range(1, 10):
  label = "M" + str(i)
  new_df[label] = new_df[label].apply(encode)

In [None]:
# Summary statistics of the encoded frame.
new_df.describe()

In [None]:
def plot_corr(df,size=10):
    """Plot the correlation of selected columns against every column of ``df``.

    The figure is a matrix whose rows are the "interesting" columns listed
    below (those that exist in ``df``) and whose columns are all columns of
    ``df``. It is written to ``corr_plot.png`` in the working directory.

    Input:
        df: pandas DataFrame whose columns can all be correlated (numeric or
            boolean)
        size: vertical and horizontal size of the plot
    """
    import matplotlib.pyplot as plt

    inter_cols = [
      'TransactionID',
      'isFraud',
      'TransactionDT',
      'TransactionAmt',
      'ProductCD_W',
      'ProductCD_C',
      'ProductCD_H',
      'ProductCD_R',
      'ProductCD_S',
      ]
    # Only keep the rows that can actually be looked up in this frame.
    rows = [col for col in inter_cols if col in df.columns]
    # Full pairwise matrix of the frame passed in, restricted to those rows.
    corr = df.corr().loc[rows]
    fig, ax = plt.subplots(figsize=(size, size))
    ax.matshow(corr)
    # x runs over every column of df, y over the selected rows.
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
    plt.yticks(range(len(corr.index)), corr.index)

    plt.savefig("corr_plot.png", bbox_inches = 'tight')

#plot_corr(new_df, new_df.shape[1])

#from IPython.display import Image
#Image('corr_plot.png')

In [None]:
# Correlation of every column of the encoded frame with the fraud label.
lim_corr = new_df.corrwith(new_df['isFraud'])

In [None]:
# Highest correlations with the label, leaving out the V<number> columns.
# The pattern keeps every label that is not exactly "V" followed by digits.
lim_corr.filter(regex=r'^(?!V\d+$)', axis=0).sort_values(ascending=False).head(50)

In [None]:
# Lowest correlations with the label, leaving out the V<number> columns.
# The pattern keeps every label that is not exactly "V" followed by digits.
lim_corr.filter(regex=r'^(?!V\d+$)', axis=0).sort_values().head(50)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Overlay the TransactionDT histograms of the train and test tables on the
# same axes.
plt.hist(train_txn['TransactionDT'], label='train');
plt.hist(test_txn['TransactionDT'], label='test');
plt.legend();
plt.title('Distribution of transaction dates');
plt.show()

In [None]:
# Rows of the training table labelled as fraud.
frauds = train_txn.loc[train_txn['isFraud'] == 1]
frauds


In [None]:
# Undersample the majority class: draw exactly as many non-fraud rows as
# there are fraud rows, rather than relying on a hard-coded count.
notfraud = train_txn.loc[train_txn['isFraud'] == 0].sample(n=len(frauds))
# Stack both classes into one frame and drop the columns starting with 'V'.
even = pd.concat([frauds,notfraud], ignore_index=True)
even = even.loc[:,~even.columns.str.startswith('V')]
even

In [None]:
# Training table without the columns whose names start with 'V'.
trimmed = train_txn.loc[:,~train_txn.columns.str.startswith('V')]

In [None]:
import seaborn as sn
import matplotlib.pyplot as plt
%matplotlib inline
# Default figure size for the annotated heatmaps.
plt.rcParams['figure.figsize'] = [30, 30]

# Pairwise correlation heatmap of the `even` sample.
corrMatrix = even.corr()
sn.heatmap(corrMatrix, annot=True)
plt.show()

In [None]:
# Same heatmap for the full `trimmed` table.
corrMatrixTrim = trimmed.corr()
sn.heatmap(corrMatrixTrim, annot=True)
plt.show()

In [None]:
# Adapted from a Kaggle EDA notebook: bar chart of the transaction count per
# card3 value, with the fraud percentage per value overlaid on a second y-axis.

total = len(trimmed)

# Row-normalised crosstab: percentage of each class within every card3 value.
tmp = pd.crosstab(trimmed['card3'], trimmed['isFraud'], normalize='index') * 100
tmp = tmp.reset_index()
tmp.rename(columns={0:'NoFraud', 1:'Fraud'}, inplace=True)

plt.figure(figsize=(100,10))

g2 = sn.countplot(x='card3', data=trimmed, order=list(tmp.card3.values))
g22 = g2.twinx()
# Draw on the twin axis explicitly instead of relying on it being the
# "current" axes after twinx().
gg2 = sn.pointplot(x='card3', y='Fraud', data=tmp,
                    color='black', order=list(tmp.card3.values), ax=g22)
gg2.set_ylabel("% of Fraud Transactions", fontsize=16)
g2.set_title("Card 3 Values Distribution and % of Transaction Frauds", fontsize=20)
g2.set_xlabel("Card 3 Values", fontsize=18)
g2.set_ylabel("Count", fontsize=18)
# Label each bar with its share of all rows.
for p in g2.patches:
    height = p.get_height()
    g2.text(p.get_x()+p.get_width()/2.,
            height + 25,
            '{:1.2f}%'.format(height/total*100),
            ha="center")


In [None]:
# Label values for the rows whose card3 equals 125.
trimmed[["card3","isFraud"]][trimmed["card3"]==125]
# trimmed[["card3","isFraud"]][trimmed["card3"]==125]

In [None]:
# Adapted from a Kaggle EDA notebook: bar chart of the transaction count per
# card3 value in the `even` sample, with the fraud percentage per value
# overlaid on a second y-axis.

total = len(even)

# Row-normalised crosstab: percentage of each class within every card3 value.
tmp = pd.crosstab(even['card3'], even['isFraud'], normalize='index') * 100
tmp = tmp.reset_index()
tmp.rename(columns={0:'NoFraud', 1:'Fraud'}, inplace=True)

plt.figure(figsize=(50,10))

g2 = sn.countplot(x='card3', data=even, order=list(tmp.card3.values))
g22 = g2.twinx()
# Draw on the twin axis explicitly instead of relying on it being the
# "current" axes after twinx().
gg2 = sn.pointplot(x='card3', y='Fraud', data=tmp,
                    color='black', order=list(tmp.card3.values), ax=g22)
gg2.set_ylabel("% of Fraud Transactions", fontsize=16)
g2.set_title("Card 3 Values Distribution and % of Transaction Frauds", fontsize=20)
g2.set_xlabel("Card 3 Values", fontsize=18)
g2.set_ylabel("Count", fontsize=18)
# Label each bar with its share of all rows.
for p in g2.patches:
    height = p.get_height()
    g2.text(p.get_x()+p.get_width()/2.,
            height + 25,
            '{:1.2f}%'.format(height/total*100),
            ha="center")


In [None]:
# Label and card3 columns of the `even` sample.
even[["isFraud","card3"]]

# Modeling

In [None]:
from sklearn.linear_model import LogisticRegression
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import SGD

# Feature list for the single-feature baseline.
_KEEP_COLUMNS_MODEL_1 = ['TransactionAmt']

# Wider column list that includes the label; not referenced in the visible cells.
_KEEP_COLUMNS_MODEL_1_5 = ['ProductCD', 'TransactionAmt', 'card1', 'card2', 'card3', 'card4', 
                      'card5', 'card6', 'P_emaildomain', 'isFraud']

# Columns (label included) kept when building the modelling frame.
_KEEP_COLUMNS_MODEL_2 = ['TransactionID', 'isFraud', 'TransactionDT', 'TransactionAmt', 'ProductCD', 'card1', 'card2', 'card3', 'card4', 
                      'card5', 'card6', 'addr1', 'addr2', 'dist1', 'dist2', 'P_emaildomain', 'R_emaildomain']

# Name of the target column.
_RESPONSE = 'isFraud'

In [None]:
# Independent working frame holding only the Model 2 columns, so later
# in-place edits leave train_txn untouched.
train_txn_copy = train_txn[_KEEP_COLUMNS_MODEL_2].copy()

Separate the training data columns from the target column for easier use later in modeling.

In [None]:
# pop() returns the target column and removes it from the frame in one step;
# x_df is the same (now label-free) object as train_txn_copy.
y_df = train_txn_copy.pop(_RESPONSE)
x_df = train_txn_copy

In [None]:
# stratify on the target column to keep the approximate balance of positive examples since it's so imbalanced
# Hold out 25% of the rows; stratifying on the label keeps the class ratio
# the same in both parts.
x_train_df, x_test_df, y_train_df, y_test_df = \
  train_test_split(x_df, y_df, test_size=0.25, stratify=y_df)

In [None]:
# First rows of the training features.
x_train_df.head()

In [None]:
# First rows of the training labels.
y_train_df.head()

Let's create a dummy 'model' that always predicts "no fraud" -- zeroes for every response.

In [None]:
# One zero ("no fraud") per training row.
y_pred_dummy = np.zeros(y_train_df.shape[0])

Are we done here? 97% accuracy? We rule!

In [None]:
# Accuracy of the all-zeros prediction on the training labels.
accuracy_score(y_train_df, y_pred_dummy)

Not so fast... Remember the ROC-AUC metric?
[Let's recall it with this short YouTube video](https://www.youtube.com/watch?v=MUCo7NvB9SI), and if you want more detail, you can read [this good explanation](https://towardsdatascience.com/understanding-auc-roc-curve-68b2303cc9c5). There are also countless detailed YouTube videos on the topic.

In [None]:
# ROC-AUC of the same constant prediction.
roc_auc_score(y_train_df, y_pred_dummy)

## Very naive model on txn amount
Let's see how we can do with a very simple model on only transaction amount.

In [None]:
# Show the feature list used by the naive model.
_KEEP_COLUMNS_MODEL_1

In [None]:
# Narrow the training features to that list (rebinding x_train_df).
x_train_df = x_train_df[_KEEP_COLUMNS_MODEL_1]

Do some basic validation that the data looks like we think it should...

In [None]:
# First few training labels as a raw array.
y_train_df.head().values

In [None]:
# Number of training rows.
len(x_train_df)

Make sure there's no missing data...

In [None]:
# Count of missing TransactionAmt values in the training features.
x_train_df.TransactionAmt.isna().sum()

In [None]:
# Count of missing training labels.
y_train_df.isna().sum()

Train a simple logistic regression with all default parameters.

In [None]:
# Fit logistic regression on the single-feature training frame, then report
# its mean accuracy on that same training data.
naive_baseline_lr = LogisticRegression(solver='lbfgs').fit(x_train_df, y_train_df)  # random_state=0, 
naive_baseline_lr.score(x_train_df, y_train_df)

In [None]:
# Shape of the held-out features (all columns are still present here).
x_test_df.shape

Get a few predictions to make sure it's working

In [None]:
# Class probabilities for the training rows: one column per class.
y_pred = naive_baseline_lr.predict_proba(x_train_df)
# grab the predictions for the positive class...
y_pred = y_pred[:,1]
y_pred[:5]

In [None]:
# ROC-AUC of those probabilities, measured on the training data.
roc_auc_score(y_train_df, y_pred)

Oops, we tested on our training data. Let's try again with data the model hasn't already seen.

In [None]:
# Give the held-out rows the same single feature, then predict class
# probabilities for them.
x_test_simple_df = x_test_df[_KEEP_COLUMNS_MODEL_1]
y_pred = naive_baseline_lr.predict_proba(x_test_simple_df)
y_pred[:5]

Pull out just the positive-label score.

In [None]:
# Keep only the second column: the probability of the positive class.
y_pred = y_pred[:,1]
y_pred[:5]

In [None]:
# Accuracy needs hard labels, so threshold the scores at 0.5 into a separate
# variable. ROC-AUC ranks examples by score, so it gets the raw probabilities
# rather than the thresholded labels.
y_pred_bin = y_pred > 0.5
print('Our accuracy is: {} and ROC-AUC is: {}'.format(accuracy_score(y_test_df, y_pred_bin), roc_auc_score(y_test_df, y_pred)))

## Two features

In [None]:
# Start again from the full training table, this time keeping every column.
train_txn_copy = train_txn.copy()
y_df = train_txn_copy[_RESPONSE]
x_df = train_txn_copy
# remove the target label from our training dataframe...
del x_df[_RESPONSE]

# stratify on the target column to keep the approximate balance of positive examples since it's so imbalanced
x_train_df, x_test_df, y_train_df, y_test_df = \
  train_test_split(x_df, y_df, test_size=0.25, stratify=y_df)

_KEEP_COLUMNS_MODEL_TxnPcd = ['TransactionAmt', 'ProductCD']
# Take an explicit copy so the assignment below writes to an independent
# frame instead of a slice of x_train_df (avoids SettingWithCopyWarning).
x_train_norm_df = x_train_df[_KEEP_COLUMNS_MODEL_TxnPcd].copy()
# Standardise the amount with the training mean and standard deviation.
x_train_norm_df['TransactionAmt'] = (x_train_norm_df['TransactionAmt'] - x_train_df.TransactionAmt.mean()) / x_train_df.TransactionAmt.std()
x_train_norm_df[:10]

In [None]:
# One-hot encode the training features with pandas' default column selection.
x_train_oh_df = pd.get_dummies(x_train_norm_df)
x_train_oh_df

#### logistic version with two values...

In [None]:
# Fit on plain NumPy arrays, then report mean accuracy on the training data.
two_fr_lr = LogisticRegression(solver='lbfgs').fit(x_train_oh_df.values, y_train_df.values)  # random_state=0, 
two_fr_lr.score(x_train_oh_df.values, y_train_df.values)

In [None]:
# Take an explicit copy so the assignment below writes to an independent
# frame instead of a slice of x_test_df (avoids SettingWithCopyWarning).
x_test_norm_df = x_test_df[_KEEP_COLUMNS_MODEL_TxnPcd].copy()
# Scale the held-out amounts with the *training* mean and standard deviation
# so both splits share the same transformation.
x_test_norm_df['TransactionAmt'] = (x_test_df.TransactionAmt - x_train_df.TransactionAmt.mean()) / x_train_df.TransactionAmt.std()
x_test_norm_df

In [None]:
# One-hot encode the held-out features, then align them to the training
# columns: a category missing from this split becomes an all-zero column and
# the column order matches what the models were fitted on.
x_test_oh_df = pd.get_dummies(x_test_norm_df).reindex(columns=x_train_oh_df.columns, fill_value=0)

In [None]:
# The model was fitted on a plain array, so predict on one too.
y_pred = two_fr_lr.predict_proba(x_test_oh_df.values)
# grab the predictions for the positive class...
y_pred = y_pred[:,1]
# Hard labels at a 0.5 threshold, kept separate from the scores.
y_pred_bin = y_pred > 0.5

In [None]:
# Accuracy uses the thresholded labels; ROC-AUC uses the raw scores.
print('Our accuracy is: {} and ROC-AUC is: {}'.format(accuracy_score(y_test_df, y_pred_bin), roc_auc_score(y_test_df, y_pred)))

#### keras version with two values...

In [None]:
# A single sigmoid unit on the raw inputs: logistic regression in Keras.
inputs = Input(shape=(x_train_oh_df.values.shape[1],))
preds = Dense(1, activation='sigmoid')(inputs)
model = Model(inputs=inputs, outputs=preds)
model.compile(optimizer=SGD(), loss='binary_crossentropy', metrics=['accuracy'])
# Cast to float32 explicitly: when the dummy columns are boolean, `.values`
# on the mixed frame is an object array, which Keras cannot turn into a tensor.
model.fit(x_train_oh_df.values.astype('float32'), y_train_df.values, batch_size=512, epochs=10, shuffle=False)

In [None]:
# Cast to float32 explicitly: when the dummy columns are boolean, `.values`
# on the mixed frame is an object array, which Keras cannot turn into a tensor.
y_pred_k = model.predict(x_test_oh_df.values.astype('float32'))

In [None]:
# ROC-AUC of the Keras model's scores on the held-out labels.
roc_auc_score(y_test_df, y_pred_k)

In [None]:
# To get your project name
# Project name = bucket name

import google.auth

# Only the second element of the returned pair (the project id) is kept.
_, project_id = google.auth.default()

In [None]:
# Write the model to a bucket named after the project.
# NOTE(review): assumes that bucket exists and is writable — confirm.
model.save(f'gs://{project_id}/model1')