In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_recall_curve



In [2]:
df = pd.read_csv('train.csv')

In [84]:
df.sort_values(by='unix_time', inplace=True)
df_by_cc = df.groupby('cc_num')

df['avg_amt_sig'] = df_by_cc['amt'].transform(lambda x: abs((x - x.mean()) / x.std()) > 1.96)

df['prev_category_same'] = df_by_cc['category'].shift(-1) == df['category']
df['next_category_same'] = df_by_cc['category'].shift(1) == df['category']
df['both_category_different'] = df['prev_category_same'] | df['next_category_same']

# Averege time of the transaction per cc and category
# Likely hood of being in a cluster
df['avg_time'] = df.groupby(['cc_num', 'category'])['unix_time'].transform('mean')
df['cat_time_diff'] = df['unix_time'] - df['avg_time'] 

# Average time of the transaction per cc
df['avg_time_cc'] = df.groupby('cc_num')['unix_time'].diff().mean()
df['time_between_last_transaction'] = df['unix_time'] - df['unix_time'].shift(1)
df['time_between_last_transaction'].fillna(0, inplace=True)
df['diff_time'] = df['avg_time_cc'] - df['time_between_last_transaction']

df['Age'] = 2024 - df['dob'].str.split('-').str[0].astype(int)

df['log_amt'] = np.log1p(df['amt'])
df['v1'] = df['amt'] * df['cat_time_diff'] * df['diff_time']
df['v2'] = df['amt'] * df['diff_time']
df['v3'] = df['cat_time_diff'] * df['diff_time']

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['time_between_last_transaction'].fillna(0, inplace=True)


TypeError: unsupported operand type(s) for /: 'float' and 'str'

In [65]:
target_column = 'is_fraud'

X = df.drop(columns=[target_column, 'id', 'zip', 'avg_time_cc', 'state', 'long', 'lat', 'merch_lat', 'merch_long', 'first', 'last', 'street', 'city', 'dob', 'merchant', 'job', 'trans_num'])
y = df[target_column]

non_numeric_cols = X.select_dtypes(include=['object']).columns

# Apply Label Encoding for each non-numeric column
le = LabelEncoder()
for col in non_numeric_cols:
    X[col] = le.fit_transform(X[col].astype(str))

In [66]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',  # Handles class imbalance
    random_state=42
)
rf_model.fit(X_train, y_train)

# 5. Evaluate the Model
y_pred = rf_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Optional: Feature Importance
importances = rf_model.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
print(feature_importance_df.sort_values(by='Importance', ascending=False))

Accuracy: 0.9930402880997019
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00     65702
           1       0.98      0.95      0.97      8439

    accuracy                           0.99     74141
   macro avg       0.99      0.98      0.98     74141
weighted avg       0.99      0.99      0.99     74141

                          Feature  Importance
4                             amt    0.365382
1                      trans_time    0.140225
11        both_category_different    0.082234
13                  cat_time_diff    0.066301
2                       unix_time    0.056047
8                     avg_amt_sig    0.050869
9              prev_category_same    0.042437
0                      trans_date    0.041144
3                        category    0.034892
10             next_category_same    0.032646
12                       avg_time    0.028112
16                            Age    0.021477
14  time_between_last_t

In [83]:
from xgboost import XGBClassifier

# Calculate scale_pos_weight
class_0_count, class_1_count = np.bincount(y_train)
scale_pos_weight = class_0_count / class_1_count

# Initialize XGBClassifier with class imbalance handling
xgb_model = XGBClassifier(
    n_estimators=200,
    learning_rate=0.05,       # Learning rate
    max_depth=4,              # Limit tree depth
    min_child_weight=5,       # Minimum weight to split nodes
    gamma=0.2,                # Regularization for node splitting
    subsample=0.8,            # Randomly use 80% of samples for each tree
    colsample_bytree=0.8,     # Use 80% of features per tree
    scale_pos_weight=scale_pos_weight,  # Handle class imbalance
    random_state=42
)

# Train the model
xgb_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_xgb = xgb_model.predict(X_test)
from sklearn.metrics import classification_report
print("Classification Report with XGBoost:")
print(classification_report(y_test, y_pred_xgb))

Classification Report with XGBoost:
              precision    recall  f1-score   support

           0       1.00      0.98      0.99     65702
           1       0.84      0.97      0.90      8439

    accuracy                           0.98     74141
   macro avg       0.92      0.98      0.94     74141
weighted avg       0.98      0.98      0.98     74141



In [69]:
y_probs = rf_model.predict_proba(X_test)[:, 1]

# Calculate precision, recall, and thresholds
precision, recall, thresholds = precision_recall_curve(y_test, y_probs)

# Find the threshold where precision and recall are balanced
f1_scores = 2 * (precision * recall) / (precision + recall)
optimal_idx = np.argmax(f1_scores)  # Index of best F1-score
optimal_threshold = thresholds[optimal_idx]

print("Optimal Threshold:", optimal_threshold)

# Apply adjusted threshold
y_pred_adjusted = (y_probs >= optimal_threshold).astype(int)

# Evaluate
print("Classification Report with Balanced Threshold:")
print(classification_report(y_test, y_pred_adjusted))

Optimal Threshold: 0.41
Classification Report with Balanced Threshold:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     65702
           1       0.97      0.97      0.97      8439

    accuracy                           0.99     74141
   macro avg       0.99      0.98      0.98     74141
weighted avg       0.99      0.99      0.99     74141



In [70]:
final_train = pd.read_csv('train.csv')
final_test = pd.read_csv('test.csv')
final_df = pd.concat([final_train, final_test], axis=0)

In [71]:
final_df.sort_values(by='unix_time', inplace=True)
final_df_by_cc = final_df.groupby('cc_num')

final_df['avg_amt_sig'] = final_df_by_cc['amt'].transform(lambda x: abs((x - x.mean()) / x.std()) > 1.96)

final_df['prev_category_same'] = final_df_by_cc['category'].shift(-1) == final_df['category']
final_df['next_category_same'] = final_df_by_cc['category'].shift(1) == final_df['category']
final_df['both_category_different'] = final_df['prev_category_same'] | final_df['next_category_same']

# Averege time of the transaction per cc and category
# Likely hood of being in a cluster
final_df['avg_time'] = final_df.groupby(['cc_num', 'category'])['unix_time'].transform('mean')
final_df['cat_time_diff'] = final_df['unix_time'] - final_df['avg_time'] 

# Average time of the transaction per cc
final_df['avg_time_cc'] = final_df.groupby('cc_num')['unix_time'].diff().mean()
final_df['time_between_last_transaction'] = final_df['unix_time'] - final_df['unix_time'].shift(1)
final_df['time_between_last_transaction'].fillna(0, inplace=True)
final_df['diff_time'] = final_df['avg_time_cc'] - final_df['time_between_last_transaction']

final_df['Age'] = 2024 - final_df['dob'].str.split('-').str[0].astype(int)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  final_df['time_between_last_transaction'].fillna(0, inplace=True)


In [80]:
final = final_df[final_df['id'].isin(final_test['id'])]

X_final = final.drop(columns=[target_column, 'id', 'zip', 'avg_time_cc', 'state', 'long', 'lat', 'merch_lat', 'merch_long', 'first', 'last', 'street', 'city', 'dob', 'merchant', 'job', 'trans_num'])

non_numeric_cols = X_final.select_dtypes(include=['object']).columns

# Apply Label Encoding for each non-numeric column
le = LabelEncoder()
for col in non_numeric_cols:
    X_final[col] = le.fit_transform(X_final[col].astype(str))

# y_final_prob = rf_model.predict_proba(X_final)[:, 1]
# y_final = (y_final_prob >= optimal_threshold).astype(int)

y_final_xgb = xgb_model.predict(X_final)


In [82]:
output_df = pd.DataFrame({
    'id': final['id'],
    'is_fraud': y_final_xgb
})

# Writing to a CSV file
output_df.to_csv('sample_submission.csv', index=False)