In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import IncrementalPCA
from tqdm import tqdm

In [8]:
train_data = pd.read_csv("fraudTrain.csv")

In [9]:
test_data = pd.read_csv("fraudTest.csv")

In [10]:
combined_data = pd.concat([train_data, test_data], axis=0)

In [11]:
def extract_datetime_features(df):
    df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
    df['day_of_week'] = df['trans_date_trans_time'].dt.dayofweek
    df['hour_of_day'] = df['trans_date_trans_time'].dt.hour
    df.drop('trans_date_trans_time', axis=1, inplace=True)
    return df

In [12]:
combined_data = extract_datetime_features(combined_data)

In [13]:
columns_to_drop = ["first", "last", "job", "dob", "trans_num", "street"]
combined_data.drop(columns_to_drop, axis=1, inplace=True)

In [14]:
X_combined = combined_data.drop("is_fraud", axis=1)
y_combined = combined_data["is_fraud"]

In [15]:
label_encoder = LabelEncoder()
X_combined["merchant"] = label_encoder.fit_transform(X_combined["merchant"])
X_combined["category"] = label_encoder.fit_transform(X_combined["category"])

In [16]:
from sklearn.preprocessing import OneHotEncoder

categorical_columns = ["gender", "city", "state"]
onehot_encoder = OneHotEncoder(drop="first", handle_unknown='ignore', sparse_output=False)
X_combined_categorical = onehot_encoder.fit_transform(X_combined[categorical_columns])

In [17]:
scaler = StandardScaler()
X_combined_numeric = scaler.fit_transform(X_combined.drop(categorical_columns, axis=1))

In [18]:
X_combined_encoded = np.hstack((X_combined_numeric, X_combined_categorical))

In [19]:
X_train = X_combined_encoded[:len(train_data)]
X_test = X_combined_encoded[len(train_data):]
y_train = y_combined[:len(train_data)]
y_test = y_combined[len(train_data):]

In [20]:
sm = SMOTE(random_state=42)
X_resampled, y_resampled = sm.fit_resample(X_train, y_train)

In [21]:
n_components = 100  # Adjust the number of components as needed
ipca = IncrementalPCA(n_components=n_components)

In [22]:
for batch in tqdm(np.array_split(X_resampled, 10), desc="Applying Incremental PCA"):
    ipca.partial_fit(batch)

Applying Incremental PCA: 100%|██████████| 10/10 [10:52<00:00, 65.28s/it]


In [23]:
X_resampled_pca = ipca.transform(X_resampled)
X_test_pca = ipca.transform(X_test)

In [24]:
rf_classifier = RandomForestClassifier(random_state=42)

In [25]:
rf_classifier.fit(X_resampled_pca, y_resampled)

In [26]:
y_pred = rf_classifier.predict(X_test_pca)

In [27]:
accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [28]:
print(f"Accuracy: {accuracy}")
print(f"Confusion Matrix:\n{confusion}")
print(f"Classification Report:\n{report}")

Accuracy: 0.9948717948717949
Confusion Matrix:
[[2134    0]
 [  11    0]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      2134
           1       0.00      0.00      0.00        11

    accuracy                           0.99      2145
   macro avg       0.50      0.50      0.50      2145
weighted avg       0.99      0.99      0.99      2145

