# IEEE-CIS Fraud Detection

Hi :)

This is one of my portfolio projects: predicting whether a transaction is fraudulent.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from category_encoders.binary import BinaryEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from joblib import Memory
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from data_cleaner import DataCleaner
from logging_config import setup_logging
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.model_selection import GridSearchCV

# Gzip-compressed training CSVs; relative paths resolve against the current
# working directory.
identity_train_filepath = '../data/train_identity.csv.gz'
transaction_train_filepath = '../data/train_transaction.csv.gz'
# The original name misspelled "transaction"; keep it as an alias so any cell
# that still uses the old spelling keeps working.
transction_train_filepath = transaction_train_filepath

In [2]:
# Create this notebook's logger up front so every later cell can report progress.
logger = setup_logging(__name__)
logger.info("Reading data")


In [3]:
# Read the gzip-compressed identity table into a DataFrame.
identity_df = pd.read_csv(
    identity_train_filepath,
    compression="gzip",
)
logger.info("Loaded identity.csv data as DataFrame")

In [4]:
# Read the gzip-compressed transaction table into a DataFrame.
transaction_df = pd.read_csv(
    transction_train_filepath,
    compression="gzip",
)
logger.info("Loaded transaction.csv data as DataFrame")

In [5]:
# Left-join the identity columns onto the transactions: every transaction row
# is kept, and transactions without a matching identity row get NaN in the
# identity columns. validate="one_to_one" makes pandas raise if TransactionID
# is duplicated in either table.
logger.info("Merging the two DataFrames by TransactionID")
df = transaction_df.merge(
    identity_df,
    how="left",
    on="TransactionID",
    validate="one_to_one",
)
logger.info("DataFrames merged.")

In [6]:
# Run the project's cleaning step, passing 'isFraud' as the target column.
# NOTE(review): what clean_dataset drops or rewrites is defined in the
# data_cleaner module and is not visible from this notebook.
data_cleaner = DataCleaner(target_col="isFraud")
cleaned_df = data_cleaner.clean_dataset(df)

In [7]:
# Target vector: the fraud label column.
y = cleaned_df["isFraud"]

# Feature matrix: every cleaned column except the label.
X = cleaned_df.drop(columns="isFraud")

# Column groups that the later ColumnTransformer uses to route features to the
# categorical and numeric preprocessors.
cat_cols, num_cols = data_cleaner.determine_data_type(X)

In [8]:
# Disk cache (./cache) handed to the pipelines below so fitted transformers can
# be reused when the same step is fitted again on the same data.
memory = Memory(location='cache', verbose=0)

# Categorical branch: binary-encode, then mode-impute.
# NOTE(review): the imputer runs after the encoder, so it only ever sees the
# encoder's output — confirm this ordering is intended.
cat_preprocessor = Pipeline(
    [
        ('encoder', BinaryEncoder(drop_invariant=True)),
        ('imputer', SimpleImputer(strategy='most_frequent')),
    ],
    memory=memory,
)

# Numeric branch: mean-impute, then standardise.
num_preprocessor = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ],
    memory=memory,
)

# Route each column group to its own branch. Columns in neither list are not
# passed to any branch.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_preprocessor, cat_cols),
        ('num', num_preprocessor, num_cols),
    ]
)

# Reduce the preprocessed feature matrix to 50 principal components.
pca = PCA(
    n_components=50,
    random_state=42,
)

# Random forest used as the member estimator of the ensemble below.
base_estimator = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    class_weight='balanced',
)

# Ensemble of 10 copies of the random forest, trained in parallel on all cores.
model = EasyEnsembleClassifier(
    estimator=base_estimator,
    n_estimators=10,
    random_state=42,
    n_jobs=-1,
)

pipeline = Pipeline(
    [
        ('preprocessor', preprocessor),
        ('pca', pca),
        ('model', model),
    ],
    memory=memory,
)


# Hold out 20% for validation. Stratify on the label so both splits keep the
# same fraud rate; a plain random split can skew the share of the rare class.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# This fits the whole pipeline (preprocessing, PCA and the ensemble), so the
# log messages say "pipeline" rather than "preprocessor".
logger.info("train-test-split complete. Fitting pipeline.")
pipeline.fit(X_train, y_train)
logger.info("Pipeline fitted.")

  X, fitted_transformer = fit_transform_one_cached(
  X, fitted_transformer = fit_transform_one_cached(
  X, fitted_transformer = fit_transform_one_cached(
  X, fitted_transformer = fit_transform_one_cached(


KeyboardInterrupt: 

In [None]:
# Evaluate the fitted pipeline on the held-out validation split.
# NOTE(review): Pipeline.score defers to the final estimator's default metric
# (presumably plain accuracy here) — consider reporting ROC AUC for a fraud label.
logger.info("Scoring Easy Ensemble Classifier.")
score = pipeline.score(X=X_val, y=y_val)
logger.info(f"Easy Ensemble Classifier scored {score}.")




In [None]:
# Parameter grid for an exhaustive grid search over the random forest nested
# inside the ensemble (pipeline step 'model' -> its 'estimator').
# NOTE(review): 3 * 5 * 3 * 3 * 2 = 270 combinations x 5 folds = 1350 pipeline
# fits (plus the final refit); consider RandomizedSearchCV or a smaller grid if
# this is too slow.
param_grid = {
    'model__estimator__n_estimators': [100, 200, 300],
    'model__estimator__max_depth': [10, 20, 30, 40, 50],
    'model__estimator__min_samples_split': [2, 5, 10],
    'model__estimator__min_samples_leaf': [1, 2, 4],
    'model__estimator__bootstrap': [True, False]
}

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    # Rank candidates by ROC AUC rather than the default accuracy; the fraud
    # label is presumably heavily imbalanced, which makes accuracy a poor
    # selection criterion.
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=1
)

# The search only runs inside fit(), so the start/complete messages bracket it.
logger.info("Starting GridSearchCV")
grid_search.fit(X_train, y_train)
logger.info("GridSearchCV complete.")

# score() uses the scorer configured above, evaluated on the refitted best
# estimator against the held-out validation split.
logger.info("Scoring GridSearchCV")
score = grid_search.score(X_val, y_val)
logger.info("Validation score:")
logger.info(score)
logger.info("Best parameters:")
logger.info(grid_search.best_params_)
logger.info("Best score:")
logger.info(grid_search.best_score_)

NameError: name 'logger' is not defined