In [None]:
import numpy as np
import pandas as pd 

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


In [3]:
df_identity = pd.read_csv(r"C:\Users\Admin\Desktop\ieee-fraud-detection\train_identity.csv")
df_transaction = pd.read_csv(r"C:\Users\Admin\Desktop\ieee-fraud-detection\train_transaction.csv")

df_test_identity = pd.read_csv(r"C:\Users\Admin\Desktop\ieee-fraud-detection\test_identity.csv")
df_test_transaction = pd.read_csv(r"C:\Users\Admin\Desktop\ieee-fraud-detection\test_transaction.csv")


In [None]:
train_data = df_identity.merge(df_transaction, on="TransactionID")

In [None]:
test_data = df_test_transaction.merge(df_test_identity, on="TransactionID", how="left")

In [6]:
del df_identity, df_transaction, df_test_identity , df_test_transaction

In [None]:
null_percentage_train = (train_data.isnull().mean()) * 100

with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print("Percentage of null values in each column:")
    print(null_percentage_train)

train_nan_counts = train_data.isna().sum()
print("NaN values in train_data:")
print(train_nan_counts)

columns_to_drop = null_percentage_train[null_percentage_train > 80].index
cleaned_train = train_data.drop(columns=columns_to_drop)

print("Cleaned DataFrame:")
print(cleaned_train)

Percentage of null values in each column:
TransactionID       0.000000
id_01               0.000000
id_02               2.330257
id_03              54.016071
id_04              54.016071
id_05               5.108401
id_06               5.108401
id_07              96.425922
id_08              96.425922
id_09              48.052110
id_10              48.052110
id_11               2.256765
id_12               0.000000
id_13              11.726165
id_14              44.503685
id_15               2.251912
id_16              10.325654
id_17               3.372321
id_18              68.722137
id_19               3.407681
id_20               3.447200
id_21              96.423149
id_22              96.416215
id_23              96.416215
id_24              96.708798
id_25              96.441868
id_26              96.420375
id_27              96.416215
id_28               2.256765
id_29               2.256765
id_30              46.222432
id_31               2.739318
id_32              46.207872
i

In [None]:
from sklearn.preprocessing import LabelEncoder
def fill_num_cal_nan(data):
    numerical_columns = data.select_dtypes(include=['number']).columns
    data[numerical_columns] = data[numerical_columns].fillna(data[numerical_columns].median())
    categorical_columns = data.select_dtypes(include=['object']).columns
    data[categorical_columns] = data[categorical_columns].fillna('unknown')
    label_encoder = LabelEncoder()
    for col in categorical_columns:
        data[col] = label_encoder.fit_transform(data[col])
    
    return data
train_data_filled = fill_num_cal_nan(train_data)
test_data_filled = fill_num_cal_nan(test_data)


In [9]:
test_data_filled.describe()
print(test_data_filled.shape)

(506691, 433)


In [10]:
train_data_filled.describe()
print(train_data_filled.shape)

(144233, 434)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
import xgboost as xgb
target_variable = 'isFraud'
X = train_data_filled.drop(columns=[target_variable])
y = train_data_filled[target_variable]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)

model.fit(X_train, y_train)
y_val_pred = model.predict_proba(X_val)[:, 1]
auc_score = roc_auc_score(y_val, y_val_pred)

test_data_filled.columns = test_data_filled.columns.str.replace('-', '_') 
X_test = test_data_filled[X_train.columns]  
y_test_pred = model.predict_proba(X_test)[:, 1]

submission_df = pd.DataFrame({
    'TransactionID': test_data_filled['TransactionID'],
    'isFraud': y_test_pred
})

submission_file_path = os.path.join(r"C:\Users\Admin\Desktop\AI FINAL REUSLG", 'submission.csv')
submission_df.to_csv(submission_file_path, index=False)
print(f"Submission CSV file generated successfully and saved to: {submission_file_path}")

print("\nClassification Report:")
y_val_pred_binary = (y_val_pred > 0.5).astype(int)
print(classification_report(y_val, y_val_pred_binary))

print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_val_pred_binary))


Submission CSV file generated successfully and saved to: C:\Users\Admin\Desktop\AI FINAL REUSLG\submission.csv

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99     26587
           1       0.93      0.73      0.82      2260

    accuracy                           0.97     28847
   macro avg       0.95      0.86      0.90     28847
weighted avg       0.97      0.97      0.97     28847


Confusion Matrix:
[[26454   133]
 [  611  1649]]


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
import xgboost as xgb

target_variable = 'isFraud'

X = train_data_filled.drop(columns=[target_variable])
y = train_data_filled[target_variable]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

class_weight_ratio = 15
class_weights = {0: 1, 1: class_weight_ratio}

model = xgb.XGBClassifier(objective="binary:logistic", random_state=42, scale_pos_weight=class_weight_ratio)

model.fit(X_train, y_train)

y_val_pred = model.predict_proba(X_val)[:, 1]

auc_score = roc_auc_score(y_val, y_val_pred)

test_data_filled.columns = test_data_filled.columns.str.replace('-', '_') 
X_test = test_data_filled[X_train.columns]  
y_test_pred = model.predict_proba(X_test)[:, 1]

print("\nClassification Report:")
y_val_pred_binary = (y_val_pred > 0.5).astype(int)
print(classification_report(y_val, y_val_pred_binary))

print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_val_pred_binary))



Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.95      0.97     26587
           1       0.61      0.88      0.72      2260

    accuracy                           0.95     28847
   macro avg       0.80      0.92      0.85     28847
weighted avg       0.96      0.95      0.95     28847


Confusion Matrix:
[[25332  1255]
 [  267  1993]]
