In [1]:
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
import pandas as pd
import numpy as np
from joblib import dump

In [2]:
# Load the datasets
train_data = pd.read_csv("Train.csv")
test_data = pd.read_csv("Test.csv")
sample_submission = pd.read_csv("sample_submission.csv")

In [3]:
# Drop unnecessary columns
columns_to_drop = ['transaction_status', 'user_segment', 'customer_ip_location']
train_data.drop(columns_to_drop, axis=1, inplace=True)
test_data.drop(columns_to_drop, axis=1, inplace=True)

In [4]:
# Define feature groups
numeric_features = ['cart_size', 'transaction_amount', 'session_duration', 
                   'time_spent_on_payment_page', 'card_expiration_delay', 
                   'time_since_account_creation', 'ip_address_previous_transactions', 
                   'product_views_during_session']

categorical_features = ['transaction_type', 'customer_age_group', 'payment_method', 
                       'login_status', 'visit_origin', 'device_type']

In [5]:
# Impute numeric missing values
numeric_imputer = SimpleImputer(strategy='mean')
train_data[numeric_features] = numeric_imputer.fit_transform(train_data[numeric_features])
test_data[numeric_features] = numeric_imputer.transform(test_data[numeric_features])

# Impute categorical missing values
categorical_imputer = SimpleImputer(strategy='most_frequent')
train_data[categorical_features] = categorical_imputer.fit_transform(train_data[categorical_features])
test_data[categorical_features] = categorical_imputer.transform(test_data[categorical_features])

In [6]:
# Verify no missing values
print("Missing values after imputation:")
print("\nTrain data:")
print(train_data.isnull().sum().sum())
print("\nTest data:")
print(test_data.isnull().sum().sum())

Missing values after imputation:

Train data:
0

Test data:
0


In [7]:
# Prepare features and target
X = train_data.drop('flag', axis=1)
y = train_data['flag']

In [8]:
# Prepare features and target
X = train_data.drop('flag', axis=1)
y = train_data['flag']

# First encode categorical variables
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_train_encoded = encoder.fit_transform(X[categorical_features])
X_test_encoded = encoder.transform(test_data[categorical_features])

# Convert encoded arrays to DataFrames
encoded_columns = encoder.get_feature_names_out(categorical_features)
X_train_encoded = pd.DataFrame(X_train_encoded, columns=encoded_columns, index=X.index)
X_test_encoded = pd.DataFrame(X_test_encoded, columns=encoded_columns, index=test_data.index)

# Combine numeric and encoded categorical features
X_train_final = pd.concat([X[numeric_features], X_train_encoded], axis=1)
X_test_final = pd.concat([test_data[numeric_features], X_test_encoded], axis=1)

# Scale features before SMOTE
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_final)
X_test_scaled = scaler.transform(X_test_final)

# Now apply SMOTE on the scaled and encoded data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y)

In [10]:
# Initialize and train the model
model = LogisticRegression(random_state=42, class_weight='balanced', max_iter=1000)
model.fit(X_train_resampled, y_train_resampled)

In [11]:
# Get cross-validation scores
cv_scores = cross_val_score(model, X_train_resampled, y_train_resampled, cv=5, scoring='roc_auc')
print(f"\nCross-validation ROC-AUC scores: {cv_scores}")
print(f"Mean ROC-AUC: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")


Cross-validation ROC-AUC scores: [0.64476241 0.65185227 0.65997862 0.66490716 0.65493098]
Mean ROC-AUC: 0.6553 (+/- 0.0138)


In [16]:
# Make predictions and get probability scores
probability_scores = model.predict_proba(X_test_scaled)[:, 1]
# Use a default threshold of 0.5 or adjust based on business requirements
threshold = 0.5
predictions = (probability_scores >= threshold).astype(int)

In [17]:
# Create submission DataFrame
submission_df = pd.DataFrame({
    'transaction_id': test_data['transaction_id'],
    'predicted_score': probability_scores,
    'predicted_flag': predictions
})

In [18]:
# Ensure same order as sample submission
submission_df = submission_df.sort_values('transaction_id').reset_index(drop=True)

In [19]:
# Print prediction statistics
print("\nPrediction Statistics:")
print(f"Average predicted score: {submission_df['predicted_score'].mean():.4f}")
print(f"Number of flagged transactions: {submission_df['predicted_flag'].sum()}")
print(f"Percentage of flagged transactions: {(submission_df['predicted_flag'].sum() / len(submission_df)) * 100:.2f}%")


Prediction Statistics:
Average predicted score: 0.4718
Number of flagged transactions: 2677
Percentage of flagged transactions: 44.62%


In [20]:
# Compare with sample submission
comparison = pd.merge(
    submission_df,
    sample_submission,
    on='transaction_id',
    suffixes=('_model', '_sample')
) 

print("\nComparison with sample submission:")
print(f"Matching predictions: {(comparison['predicted_flag_model'] == comparison['predicted_flag']).mean():.2%}") 


Comparison with sample submission:


KeyError: 'predicted_flag'