In [23]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer, OrdinalEncoder

# Load your data from the CSV file
data = pd.read_csv('feature_engineered_data.csv')

# Assuming recency, frequency, monetary, and seniority columns are derived from your existing columns
data['recency'] = np.random.randint(1, 100, size=len(data))  # Example logic
data['frequency'] = data['TransactionCount']
data['monetary'] = data['TotalTransactionAmount']
data['seniority'] = 2025 - data['TransactionYear']

data['rfms_score'] = data['recency'] * 0.2 + data['frequency'] * 0.3 + data['monetary'] * 0.4 + data['seniority'] * 0.1

# Establish a boundary for classification
boundary = data['rfms_score'].median()

# Assign good and bad labels
data['label'] = np.where(data['rfms_score'] > boundary, 'good', 'bad')


# Function to calculate Weight of Evidence (WoE)
def calc_woe(df, feature, target):
    df_woe = df.groupby(feature)[target].agg(['count', 'sum'])
    df_woe.columns = ['n', 'bad']
    df_woe['good'] = df_woe['n'] - df_woe['bad']
    df_woe['bad_prop'] = df_woe['bad'] / df_woe['bad'].sum()
    df_woe['good_prop'] = df_woe['good'] / df_woe['good'].sum()
    df_woe['woe'] = np.log(df_woe['good_prop'] / df_woe['bad_prop'])
    return df_woe['woe']

# Discretize the features into bins
est = KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')
data_binned = est.fit_transform(data[['recency', 'frequency', 'monetary', 'seniority']])
data_binned = pd.DataFrame(data_binned, columns=['recency_bin', 'frequency_bin', 'monetary_bin', 'seniority_bin'])

# Calculate WoE for each binned feature
default_column = 'FraudResult'  # Adjust the column name if needed
for feature in ['recency_bin', 'frequency_bin', 'monetary_bin', 'seniority_bin']:
    data_binned[f'{feature}_woe'] = calc_woe(data_binned.join(data[[default_column]]), feature, default_column)

# Combine the original and WoE-transformed features
data_combined = pd.concat([data, data_binned[['recency_bin_woe', 'frequency_bin_woe', 'monetary_bin_woe', 'seniority_bin_woe']]], axis=1)



In [24]:
# Check if 'TransactionStartTime' column exists
if 'TransactionStartTime' in data_combined.columns:
    # Convert `TransactionStartTime` to datetime format
    data_combined['TransactionStartTime'] = pd.to_datetime(data_combined['TransactionStartTime'])

    # Extract features from `TransactionStartTime`
    data_combined['TransactionHour'] = data_combined['TransactionStartTime'].dt.hour
    data_combined['TransactionDay'] = data_combined['TransactionStartTime'].dt.day
    data_combined['TransactionMonth'] = data_combined['TransactionStartTime'].dt.month
    data_combined['TransactionYear'] = data_combined['TransactionStartTime'].dt.year

    # Drop the original `TransactionStartTime` column
    data_combined.drop('TransactionStartTime', axis=1, inplace=True)
else:
    print("The 'TransactionStartTime' column is not present in the dataset.")


In [29]:
# Identify categorical features
categorical_features = ['CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId', 'ProductCategory', 'ChannelId', 'PricingStrategy']

# Apply ordinal encoding to categorical features
encoder = OrdinalEncoder()
data_combined[categorical_features] = encoder.fit_transform(data_combined[categorical_features])

# Display columns for verification
print(data_combined.columns)



Index(['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId',
       'CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId',
       'ProductCategory', 'ChannelId', 'Amount', 'Value', 'PricingStrategy',
       'FraudResult', 'TransactionHour', 'TransactionDay', 'TransactionMonth',
       'TransactionYear', 'ChannelIdEncoded', 'ProductCategory_airtime',
       'ProductCategory_data_bundles', 'ProductCategory_financial_services',
       'ProductCategory_movies', 'ProductCategory_other',
       'ProductCategory_ticket', 'ProductCategory_transport',
       'ProductCategory_tv', 'ProductCategory_utility_bill',
       'TotalTransactionAmount', 'AvgTransactionAmount', 'TransactionCount',
       'StdDevTransactionAmount', 'recency', 'frequency', 'monetary',
       'seniority', 'rfms_score', 'label', 'recency_bin_woe',
       'frequency_bin_woe', 'monetary_bin_woe', 'seniority_bin_woe'],
      dtype='object')


In [31]:
from sklearn.impute import SimpleImputer

# Define the features (X) and the target variable (y)
X = data_combined.drop(['label', 'TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId'], axis=1)
y = data_combined['label']

# Replace infinite values with NaN
X.replace([np.inf, -np.inf], np.nan, inplace=True)

# Initialize the SimpleImputer with median strategy
imputer = SimpleImputer(strategy='median')

# Apply the imputer to the feature set
X_imputed = imputer.fit_transform(X)

# Convert the result back to a DataFrame
X = pd.DataFrame(X_imputed, columns=X.columns)



In [37]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Make sure these steps only apply to training data
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)



print(f'Training set size: {X_train.shape}')
print(f'Testing set size: {X_test.shape}')


Training set size: (61223, 37)
Testing set size: (19133, 37)


In [38]:
# Apply transformations only to training data
# For example: WOE or scaling
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Initialize the SimpleImputer with median strategy
imputer = SimpleImputer(strategy='median')

# Apply the imputer only to the training set
X_train_imputed = imputer.fit_transform(X_train)
X_val_imputed = imputer.transform(X_val)
X_test_imputed = imputer.transform(X_test)

# Apply scaling to the training set and then apply the same scaler to validation and test sets
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_val_scaled = scaler.transform(X_val_imputed)
X_test_scaled = scaler.transform(X_test_imputed)


In [39]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import pandas as pd

# Initialize a simpler Logistic Regression model
log_reg_simple = LogisticRegression(C=0.01, penalty='l2', max_iter=1000)
log_reg_simple.fit(X_train_scaled, y_train)

# Predict on the validation set
y_pred_log_reg_simple = log_reg_simple.predict(X_val_scaled)

# Compute evaluation metrics for Logistic Regression
accuracy_log_reg_simple = accuracy_score(y_val, y_pred_log_reg_simple)
precision_log_reg_simple = precision_score(y_val, y_pred_log_reg_simple, pos_label='good')
recall_log_reg_simple = recall_score(y_val, y_pred_log_reg_simple, pos_label='good')
f1_log_reg_simple = f1_score(y_val, y_pred_log_reg_simple, pos_label='good')
roc_auc_log_reg_simple = roc_auc_score(y_val, log_reg_simple.predict_proba(X_val_scaled)[:, 1])

# Display evaluation results
print("\nSimple Logistic Regression Performance:")
print(f"Accuracy: {accuracy_log_reg_simple}")
print(f"Precision: {precision_log_reg_simple}")
print(f"Recall: {recall_log_reg_simple}")
print(f"F1 Score: {f1_log_reg_simple}")
print(f"ROC-AUC: {roc_auc_log_reg_simple}")

# Combine X_val with the actual and predicted labels
results = pd.DataFrame(X_val_scaled, columns=X.columns)
results['Actual'] = y_val
results['LogReg_Predicted'] = y_pred_log_reg_simple

# Save the DataFrame to a CSV file
results.to_csv('simple_model_predictions_with_features.csv', index=False)
print("Predictions saved to 'simple_model_predictions_with_features.csv'")



Simple Logistic Regression Performance:
Accuracy: 0.8820723899124526
Precision: 0.9585927770859277
Recall: 0.8000519682993374
F1 Score: 0.872176191487855
ROC-AUC: 0.9708676839733886
Predictions saved to 'simple_model_predictions_with_features.csv'


In [41]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
import pandas as pd


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Replace infinite values with NaN
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
X_val.replace([np.inf, -np.inf], np.nan, inplace=True)
X_test.replace([np.inf, -np.inf], np.nan, inplace=True)

# Initialize the SimpleImputer with median strategy
imputer = SimpleImputer(strategy='median')

# Apply the imputer only to the training set
X_train_imputed = imputer.fit_transform(X_train)
X_val_imputed = imputer.transform(X_val)
X_test_imputed = imputer.transform(X_test)

# Apply scaling to the training set and then apply the same scaler to validation and test sets
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_val_scaled = scaler.transform(X_val_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# Initialize and train a simpler Logistic Regression model
log_reg_simple = LogisticRegression(C=0.01, penalty='l2', max_iter=1000)
log_reg_simple.fit(X_train_scaled, y_train)

# Predict on the validation set for Logistic Regression
y_pred_log_reg_simple = log_reg_simple.predict(X_val_scaled)

# Compute evaluation metrics for Logistic Regression
accuracy_log_reg_simple = accuracy_score(y_val, y_pred_log_reg_simple)
precision_log_reg_simple = precision_score(y_val, y_pred_log_reg_simple, pos_label='good')
recall_log_reg_simple = recall_score(y_val, y_pred_log_reg_simple, pos_label='good')
f1_log_reg_simple = f1_score(y_val, y_pred_log_reg_simple, pos_label='good')
roc_auc_log_reg_simple = roc_auc_score(y_val, log_reg_simple.predict_proba(X_val_scaled)[:, 1])

# Display evaluation results for Logistic Regression
print("\nSimple Logistic Regression Performance:")
print(f"Accuracy: {accuracy_log_reg_simple}")
print(f"Precision: {precision_log_reg_simple}")
print(f"Recall: {recall_log_reg_simple}")
print(f"F1 Score: {f1_log_reg_simple}")
print(f"ROC-AUC: {roc_auc_log_reg_simple}")

# Initialize and train a simpler Random Forest model with Grid Search for Hyperparameter Tuning
rf_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [10, 20],
    'min_samples_leaf': [10, 20]
}
grid_search_rf = GridSearchCV(RandomForestClassifier(), rf_params, cv=5, n_jobs=-1, verbose=1)
grid_search_rf.fit(X_train_scaled, y_train)

# Best Random Forest model
best_rf = grid_search_rf.best_estimator_

# Predict on the validation set for Random Forest
y_pred_best_rf = best_rf.predict(X_val_scaled)

# Compute evaluation metrics for Best Random Forest
accuracy_best_rf = accuracy_score(y_val, y_pred_best_rf)
precision_best_rf = precision_score(y_val, y_pred_best_rf, pos_label='good')
recall_best_rf = recall_score(y_val, y_pred_best_rf, pos_label='good')
f1_best_rf = f1_score(y_val, y_pred_best_rf, pos_label='good')
roc_auc_best_rf = roc_auc_score(y_val, best_rf.predict_proba(X_val_scaled)[:, 1])

# Display evaluation results for Random Forest
print("\nBest Random Forest (with Grid Search) Performance:")
print(f"Accuracy: {accuracy_best_rf}")
print(f"Precision: {precision_best_rf}")
print(f"Recall: {recall_best_rf}")
print(f"F1 Score: {f1_best_rf}")
print(f"ROC-AUC: {roc_auc_best_rf}")

# Combine X_val with the actual and predicted labels
results = pd.DataFrame(X_val_scaled, columns=X.columns)
results['Actual'] = y_val
results['LogReg_Predicted'] = y_pred_log_reg_simple
results['RandomForest_Predicted'] = y_pred_best_rf

# Save the DataFrame to a CSV file
results.to_csv('model_predictions_with_features.csv', index=False)
print("Predictions saved to 'model_predictions_with_features.csv'")



Simple Logistic Regression Performance:
Accuracy: 0.8815497190644191
Precision: 0.9566904687985098
Recall: 0.800701572041055
F1 Score: 0.8717731098380367
ROC-AUC: 0.9705720711575034
Fitting 5 folds for each of 36 candidates, totalling 180 fits

Best Random Forest (with Grid Search) Performance:
Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1 Score: 1.0
ROC-AUC: 1.0
Predictions saved to 'model_predictions_with_features.csv'
