In [1]:
import plotly.express as px
import numpy as np
import pandas as pd
import plotly.figure_factory as ff

In [8]:
def load_data():
  # Set the path to the file you'd like to load
  file_path = "data/transactions.csv"

  # Load the latest version
  df = pd.read_csv(file_path)
  df['log_amount'] = np.log1p(df['amount'])
  return df

In [9]:
df = load_data()
# exploration
print("First 5 records:", df.sample(n=100))
print(df['isFraud'].value_counts())
print(f"\nFraud percentage: {df['isFraud'].mean() * 100:.4f}%")
print(df.info())
print(df.describe())


First 5 records:         step      type      amount     nameOrig  oldbalanceOrg  \
99521     38   CASH_IN    47377.73  C1925599637       21595.00   
52249    346   PAYMENT    15444.25  C1102262886       10397.00   
75222     10  CASH_OUT    13187.57  C1534114744       10356.00   
110115    44  CASH_OUT   165074.23  C1113315508        7056.00   
11122    251  CASH_OUT   163750.69   C921755263        6039.00   
...      ...       ...         ...          ...            ...   
9163     306   CASH_IN    22690.01   C845215376        8758.00   
83046     41  TRANSFER  1030541.74  C1001124945       11345.00   
185098   206  CASH_OUT   102398.19  C1704000439       51927.00   
117932   375  TRANSFER   435977.55   C274687261           0.00   
130029   408   CASH_IN   106063.51   C700511642     8479008.69   

        newbalanceOrig     nameDest  oldbalanceDest  newbalanceDest  isFraud  \
99521         68972.73  C1655370378      8960350.88      8912973.15        0   
52249             0.00  M14766

In [4]:
# transaction types
type_counts = df['type'].value_counts().reset_index()
print(type_counts)
type_counts.columns = ['Transaction Type', 'Count']
fig = px.bar(type_counts, x = 'Transaction Type', y = 'Count', title="Transaction Type Distribution", text='Count')
# fig.write_html("transaction_types.html")
fig.show()

       type  count
0  CASH_OUT  70571
1   PAYMENT  67562
2   CASH_IN  43919
3  TRANSFER  16630
4     DEBIT   1317


In [5]:
# fraud rate by transaction type
fraud_rate = df.groupby('type')['isFraud'].mean().reset_index()
fraud_rate.columns = ['Transaction Type', 'Fraud Rate']
fig = px.bar(fraud_rate, x = 'Transaction Type', y = 'Fraud Rate', title="Fraud Rate by Transaction Type", text='Fraud Rate')
# fig.write_html("fraud_rate.html")
fig.show()

In [6]:
# transaction amounts
df['log_amount'] = np.log1p(df['amount'])
fig = px.histogram(df, x='log_amount', nbins = 100, title="Log-Scaled Distribution of Transaction Amounts")
fig.update_layout(xaxis_title = 'Log(1 + Amount)', yaxis_title = 'Count')
# fig.write_html("transaction_amount.html")
fig.show()

In [7]:

# numeric correlation to fraud
correlation_with_fraud = df.corr(numeric_only=True)['isFraud'].drop('isFraud').sort_values(ascending=False).round(3)
z = [correlation_with_fraud.values.tolist()]
x = correlation_with_fraud.index.tolist()
y = ['isFraud']

fig = ff.create_annotated_heatmap(
  z=z,
  x=x,
  y=y,
  colorscale='Viridis',
  zmin=-1, zmax=1,
  showscale=True,
  annotation_text=[[f"{val:.2f}" for val in z[0]]]
)
fig.update_layout(title='Correlation Heatmap of Numeric Features')
# fig.write_html("correlation_heatmap.html")
fig.show()
