In [1]:
import os
import datetime
import re

# analysis
import numpy as np
import pandas as pd



In [2]:
# Location of the raw credit-card transactions CSV. Set the CREDITCARD_CSV
# environment variable to load a different copy without editing the notebook;
# the previous hard-coded path remains the default.
fpath = os.environ.get('CREDITCARD_CSV', '/data/raw/creditcard.csv')
df = pd.read_csv(fpath)

In [4]:
# Show the column labels of the loaded frame.
df.columns

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')

In [5]:
# Preview the first five rows of the loaded frame.
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [8]:
# Inspect the first 20 values of the 'Time' column.
df['Time'].head(20)

0      0.0
1      0.0
2      1.0
3      1.0
4      2.0
5      2.0
6      4.0
7      7.0
8      7.0
9      9.0
10    10.0
11    10.0
12    10.0
13    11.0
14    12.0
15    12.0
16    12.0
17    13.0
18    14.0
19    15.0
Name: Time, dtype: float64

In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score, precision_recall_curve, average_precision_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline
import xgboost as xgb
import optuna

ModuleNotFoundError: No module named 'imblearn'

In [None]:
class ImbalancedTimeSeriesClassifier:
    """XGBoost classifier for a time-ordered, heavily imbalanced binary target.

    The workflow is: `prepare_features` derives time/amount features,
    `create_balanced_folds` builds forward-chaining CV folds, and `train`
    tunes (via Optuna) and fits one XGBoost model per fold on a resampled
    (under-sampled + SMOTE) training window.

    Parameters
    ----------
    random_state : int, default 42
        Seed shared by the resamplers, the Optuna sampler and XGBoost.
    n_trials : int, default 50
        Number of Optuna trials run per fold by `optimize_xgboost`.

    Attributes
    ----------
    models : list
        One fitted `xgb.XGBClassifier` per usable fold (filled by `train`).
    scalers : list
        The `StandardScaler` fitted on each fold's training window, aligned
        with `models`.
    scaler : StandardScaler
        The scaler of the most recently trained fold.
    cv_scores : dict
        Per-fold validation scores under the keys 'pr_auc' and 'roc_auc'.
    """

    # Look-back horizons (seconds) for the rolling amount features.
    WINDOWS_SECONDS = (60, 300, 900, 3600, 3600 * 24)  # 1min, 5min, 15min, 1hr, 1day

    def __init__(self, random_state=42, n_trials=50):
        self.random_state = random_state
        self.n_trials = n_trials
        self.scaler = StandardScaler()
        self.scalers = []
        self.models = []
        self.cv_scores = {'pr_auc': [], 'roc_auc': []}

    def prepare_features(self, df):
        """Return a copy of `df` with time- and amount-based features added.

        Parameters
        ----------
        df : pandas.DataFrame
            Must contain a 'time' column (seconds) and an 'amount' column.
            Rows may be in any order; the input frame is not modified.

        Returns
        -------
        pandas.DataFrame
            Copy of `df` plus 'time_hours', 'time_days', and for every horizon
            `w` in `WINDOWS_SECONDS`: 'amount_to_mean_{w}s',
            'amount_to_max_{w}s' and 'transaction_frequency_{w}s', followed by
            'time_since_last_tx' and 'transaction_velocity'. Remaining NaNs
            are replaced with 0.

        Raises
        ------
        KeyError
            If 'time' or 'amount' is missing.
        """
        missing = [col for col in ('time', 'amount') if col not in df.columns]
        if missing:
            raise KeyError(
                f"prepare_features requires columns {missing}; "
                "rename the source columns to lower-case 'time'/'amount' first"
            )

        # Work on a copy so the caller's frame is left untouched.
        out = df.copy()
        time_values = out['time'].to_numpy()
        amount_values = out['amount'].to_numpy()

        out['time_hours'] = out['time'] / 3600
        out['time_days'] = out['time'] / (3600 * 24)

        # Time-based rolling windows need a monotonic datetime-like index, so
        # compute on a time-sorted view and map the results back to row order.
        order = np.argsort(time_values, kind='stable')
        row_to_sorted = np.empty(len(order), dtype=np.intp)
        row_to_sorted[order] = np.arange(len(order))
        amount_by_time = pd.Series(
            amount_values[order],
            index=pd.to_datetime(time_values[order], unit='s'),
        )

        for window in self.WINDOWS_SECONDS:
            window_name = f'{window}s'
            # Offset window: all transactions whose time lies in (t - window, t].
            rolling = amount_by_time.rolling(window_name)
            window_mean = rolling.mean().to_numpy()[row_to_sorted]
            window_max = rolling.max().to_numpy()[row_to_sorted]
            window_count = rolling.count().to_numpy()[row_to_sorted]

            # Ratio features; the epsilon guards against division by zero.
            out[f'amount_to_mean_{window_name}'] = amount_values / (window_mean + 1e-6)
            out[f'amount_to_max_{window_name}'] = amount_values / (window_max + 1e-6)
            out[f'transaction_frequency_{window_name}'] = window_count / window

        # Transaction velocity: gap to the previous row (0 for the first row).
        out['time_since_last_tx'] = out['time'].diff().fillna(0)
        out['transaction_velocity'] = 1 / (out['time_since_last_tx'] + 1)

        return out.fillna(0)  # Replace NaN with 0 for all features

    def create_balanced_folds(self, X, y, n_splits=5):
        """Create forward-chaining folds that are usable for training and scoring.

        Folds come from `TimeSeriesSplit`, so every training window precedes
        its validation window. A fold is dropped when its training window has
        no positive sample or its validation window lacks either class;
        shrinking or shifting the windows cannot create the missing class and
        would mix future rows into training.

        Parameters
        ----------
        X : array-like
            Feature matrix; only its length is used for splitting.
        y : array-like
            Binary target (1 = positive), positionally aligned with `X`.
        n_splits : int, default 5
            Number of splits requested from `TimeSeriesSplit`.

        Returns
        -------
        list of (train_idx, val_idx) tuples of positional index arrays.

        Raises
        ------
        ValueError
            If no fold satisfies the class requirements.
        """
        y_values = np.asarray(y)
        tscv = TimeSeriesSplit(n_splits=n_splits)
        balanced_folds = []

        for fold, (train_idx, val_idx) in enumerate(tscv.split(X), start=1):
            train_pos = int(np.sum(y_values[train_idx] == 1))
            val_pos = int(np.sum(y_values[val_idx] == 1))
            val_neg = len(val_idx) - val_pos

            if train_pos == 0 or val_pos == 0 or val_neg == 0:
                print(
                    f"Skipping fold {fold}: train positives={train_pos}, "
                    f"val positives={val_pos}, val negatives={val_neg}"
                )
                continue

            balanced_folds.append((train_idx, val_idx))

        if not balanced_folds:
            raise ValueError(
                "No time-series fold has positives in both the training and "
                "validation windows; reduce n_splits or provide more data"
            )

        return balanced_folds

    def _make_sampler(self):
        """Build the seeded resampling pipeline shared by tuning and training."""
        return ImbPipeline([
            ('undersample', RandomUnderSampler(sampling_strategy=0.1, random_state=self.random_state)),
            ('smote', SMOTE(sampling_strategy=0.5, random_state=self.random_state))
        ])

    def optimize_xgboost(self, X_train, y_train, X_val, y_val):
        """Tune XGBoost hyper-parameters with Optuna, maximising validation PR-AUC.

        The training data is resampled once (the resampling is seeded, so it
        is identical for every trial); validation data is never resampled.

        Returns
        -------
        dict
            Best trial's parameters, with 'n_estimators' replaced by the number
            of trees kept by early stopping in that trial, so a refit without
            early stopping reproduces the model that was scored.
        """
        # Apply sampling only to training data
        X_resampled, y_resampled = self._make_sampler().fit_resample(X_train, y_train)

        def objective(trial):
            params = {
                'max_depth': trial.suggest_int('max_depth', 3, 10),
                'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
                'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
                'min_child_weight': trial.suggest_int('min_child_weight', 1, 7),
                'subsample': trial.suggest_float('subsample', 0.6, 1.0),
                'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
                # NOTE(review): this range was chosen for a ~0.1% positive rate,
                # but the model is fit on resampled data - confirm it is intended.
                'scale_pos_weight': trial.suggest_float('scale_pos_weight', 100, 1000)
            }

            # early_stopping_rounds is an estimator parameter; recent XGBoost
            # releases no longer accept it as a fit() argument.
            model = xgb.XGBClassifier(
                **params,
                random_state=self.random_state,
                eval_metric='aucpr',  # PR-AUC suits imbalanced data better than ROC-AUC
                early_stopping_rounds=50
            )

            model.fit(
                X_resampled, y_resampled,
                eval_set=[(X_val, y_val)],
                verbose=False
            )

            # Remember how many trees early stopping actually kept.
            trial.set_user_attr('best_iteration', int(model.best_iteration))

            # Use average precision score (PR-AUC) instead of ROC-AUC
            return average_precision_score(y_val, model.predict_proba(X_val)[:, 1])

        study = optuna.create_study(
            direction='maximize',
            sampler=optuna.samplers.TPESampler(seed=self.random_state)
        )
        study.optimize(objective, n_trials=self.n_trials)

        best_params = dict(study.best_params)
        best_params['n_estimators'] = study.best_trial.user_attrs['best_iteration'] + 1
        return best_params

    def train(self, X, y):
        """Tune, fit and score one model per time-series fold.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature frame; a 'time' column, if present, is excluded.
        y : pandas.Series
            Binary target positionally aligned with `X`.

        Side effects
        ------------
        Fills `self.models`, `self.scalers` and `self.cv_scores`, sets
        `self.scaler` to the last fold's scaler, and prints per-fold and
        overall PR-AUC / ROC-AUC.

        Notes
        -----
        Hyper-parameters are tuned on the same validation window that is then
        scored, so the reported metrics are optimistic estimates.
        """
        feature_cols = [col for col in X.columns if col != 'time']
        X_features = X[feature_cols]

        balanced_folds = self.create_balanced_folds(X_features, y)

        self.models = []
        self.scalers = []
        self.cv_scores = {'pr_auc': [], 'roc_auc': []}

        for fold, (train_idx, val_idx) in enumerate(balanced_folds):
            # Fit the scaler on the training window only, so validation
            # statistics never leak into training.
            fold_scaler = StandardScaler()
            X_train = pd.DataFrame(
                fold_scaler.fit_transform(X_features.iloc[train_idx]), columns=feature_cols
            )
            X_val = pd.DataFrame(
                fold_scaler.transform(X_features.iloc[val_idx]), columns=feature_cols
            )
            y_train = y.iloc[train_idx].reset_index(drop=True)
            y_val = y.iloc[val_idx].reset_index(drop=True)

            train_pos = int((y_train == 1).sum())
            val_pos = int((y_val == 1).sum())
            print(f"\nFold {fold + 1}:")
            print(f"Positive samples - Train: {train_pos} ({train_pos/len(y_train):.4%})")
            print(f"Positive samples - Val: {val_pos} ({val_pos/len(y_val):.4%})")

            best_params = self.optimize_xgboost(X_train, y_train, X_val, y_val)
            model = xgb.XGBClassifier(**best_params, random_state=self.random_state)

            # Same seeded resampling as used during tuning.
            X_resampled, y_resampled = self._make_sampler().fit_resample(X_train, y_train)

            model.fit(X_resampled, y_resampled)

            # Calculate both PR-AUC and ROC-AUC
            y_pred_proba = model.predict_proba(X_val)[:, 1]
            pr_auc = average_precision_score(y_val, y_pred_proba)
            roc_auc = roc_auc_score(y_val, y_pred_proba)

            self.cv_scores['pr_auc'].append(pr_auc)
            self.cv_scores['roc_auc'].append(roc_auc)
            self.models.append(model)
            self.scalers.append(fold_scaler)
            self.scaler = fold_scaler

            print(f"PR-AUC: {pr_auc:.4f}")
            print(f"ROC-AUC: {roc_auc:.4f}")

        print("\nOverall Performance:")
        print(f"Mean PR-AUC: {np.mean(self.cv_scores['pr_auc']):.4f} ± {np.std(self.cv_scores['pr_auc']):.4f}")
        print(f"Mean ROC-AUC: {np.mean(self.cv_scores['roc_auc']):.4f} ± {np.std(self.cv_scores['roc_auc']):.4f}")

In [None]:
# The raw frame `df` has columns 'Time', 'V1'-'V28', 'Amount' and 'Class'.
# The classifier expects lower-case 'time' / 'amount', so rename on a new
# frame and leave `df` untouched for the earlier cells.
transactions = df.rename(columns={'Time': 'time', 'Amount': 'amount'})

# Initialize the classifier
clf = ImbalancedTimeSeriesClassifier()

# Prepare features
features_df = clf.prepare_features(transactions)

# Split features and target: every column except the label is a candidate
# feature ('time' itself is dropped inside `train`).
target_col = 'Class'
feature_cols = [col for col in features_df.columns if col != target_col]
X = features_df[feature_cols]
y = features_df[target_col]

# Train the model
clf.train(X, y)

# Get feature importance: average the per-fold XGBoost importances.
model_features = [col for col in X.columns if col != 'time']
importance_df = (
    pd.DataFrame({
        'feature': model_features,
        'importance': np.mean([model.feature_importances_ for model in clf.models], axis=0),
    })
    .sort_values('importance', ascending=False)
    .reset_index(drop=True)
)
print("\nTop 10 most important features:")
importance_df.head(10)