# Build the FRAUD MODEL (Offline ML Development)**

### Tasks:

* Create a sample dataset (transactions + labels)
* Train baseline fraud model (XGBoost, RandomForest, or LightGBM)
* Save model to **MLflow Model Registry**
* Log:

  * parameters
  * metrics
  * artifacts
  * models

### Test:

* Load model from MLflow and run prediction on sample data
* Validate latency (<5 ms per transaction expected)


In [33]:
import pandas as pd
import numpy as np
import datetime as dt


In [5]:
df_customer = pd.read_csv("../data/customer.csv")
df_transaction = pd.read_csv("../data/transactions.csv")

print(df_customer.columns)
print(df_transaction.columns)

Index(['cc_num', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'job', 'dob'],
      dtype='object')
Index(['cc_num', 'first', 'last', 'trans_num', 'trans_date', 'trans_time',
       'unix_time', 'category', 'merchant', 'amt', 'merch_lat', 'merch_long',
       'is_fraud'],
      dtype='object')


In [14]:
# merge the dataset
df_transaction = df_transaction.drop(columns=['first','last'],axis=1)
df = pd.merge(df_transaction, df_customer, how='left', on='cc_num')
df.columns

Index(['cc_num', 'trans_num', 'trans_date', 'trans_time', 'unix_time',
       'category', 'merchant', 'amt', 'merch_lat', 'merch_long', 'is_fraud',
       'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat',
       'long', 'job', 'dob'],
      dtype='object')

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11871 entries, 0 to 11870
Data columns (total 22 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   cc_num      11871 non-null  int64  
 1   trans_num   11871 non-null  object 
 2   trans_date  11871 non-null  object 
 3   trans_time  11871 non-null  object 
 4   unix_time   11871 non-null  int64  
 5   category    11871 non-null  object 
 6   merchant    11871 non-null  object 
 7   amt         11871 non-null  int64  
 8   merch_lat   11871 non-null  float64
 9   merch_long  11871 non-null  float64
 10  is_fraud    11871 non-null  int64  
 11  first       11871 non-null  object 
 12  last        11871 non-null  object 
 13  gender      11871 non-null  object 
 14  street      11871 non-null  object 
 15  city        11871 non-null  object 
 16  state       11871 non-null  object 
 17  zip         11871 non-null  int64  
 18  lat         11871 non-null  float64
 19  long        11871 non-nul

In [16]:
df.head()

Unnamed: 0,cc_num,trans_num,trans_date,trans_time,unix_time,category,merchant,amt,merch_lat,merch_long,...,last,gender,street,city,state,zip,lat,long,job,dob
0,180094108369013,80f5177be11f0bcd768e06a0b1b294c8,2012-01-01T00:00:00.000+05:30,00:12:15,1325376735,personal_care,Hills-Boyer,64,39.011566,-119.937831,...,Holland,M,630 Christina Harbor,Zephyr Cove,NV,89448,39.0204,-119.9114,Geophysical data processor,1949-12-28 13:30:00
1,4368593032190508,7933d389bf8ef8a11a8a301da6e6bc6c,2012-01-01T00:00:00.000+05:30,00:16:58,1325377018,gas_transport,Friesen-DAmore,133,40.149071,-75.589697,...,Fleming,F,9667 Brown Club Suite 507,Spring City,PA,19475,40.1765,-75.5697,"Engineer, chemical",1961-05-20 14:30:00
2,4361355512072,1467c318b5d73d22d6741f575db42f3f,2012-01-01T00:00:00.000+05:30,00:36:42,1325378202,entertainment,Larson-Moen,119,47.297797,-96.819362,...,Nelson,M,13242 Jackson Landing,Hendrum,MN,56550,47.2689,-96.7988,"Programme researcher, broadcasting/film/video",1987-07-13 14:30:00
3,4037295225657274,4a3848719d72daaa32c938ce43e0eb03,2012-01-01T00:00:00.000+05:30,00:37:59,1325378279,shopping_pos,Lynch Ltd,62,40.078781,-102.373954,...,Garcia,M,2145 Chad Viaduct,Wray,CO,80758,40.0685,-102.393,Advertising account executive,1978-09-16 14:30:00
4,4515092388857440,02d27e94f279e1013a4c323eb8bb6811,2012-01-01T00:00:00.000+05:30,00:39:18,1325378358,shopping_pos,"Baumbach, Strosin and Nicolas",198,41.549359,-83.044403,...,Silva,M,1339 Matthew View,Port Clinton,OH,43452,41.5583,-83.0502,"Psychologist, counselling",1939-04-28 13:07:00


In [17]:
df.describe()

Unnamed: 0,cc_num,unix_time,amt,merch_lat,merch_long,is_fraud,zip,lat,long
count,11871.0,11871.0,11871.0,11871.0,11871.0,11871.0,11871.0,11871.0,11871.0
mean,3133233000000000.0,1433198000.0,150.597675,39.767213,-92.629963,0.042204,53321.23629,39.768711,-92.635258
std,2361177000000000.0,60805650.0,195.814267,5.567138,12.671022,0.201062,25557.65004,5.562462,12.671775
min,675985200000.0,1325377000.0,11.0,26.319252,-125.437401,0.0,5476.0,27.4295,-123.7945
25%,4483019000000.0,1451912000.0,71.0,36.944888,-102.397248,0.0,39746.0,36.9576,-102.393
50%,4170243000000000.0,1463510000.0,103.0,41.870607,-92.162653,0.0,59323.0,41.8798,-92.1644
75%,5157436000000000.0,1471882000.0,163.0,44.759263,-82.980032,0.0,78344.0,44.769,-82.9878
max,6546851000000000.0,1481233000.0,2786.0,48.709398,-71.001628,1.0,97302.0,47.2689,-72.6905


In [18]:
df.shape

(11871, 22)

In [19]:
df.isna().sum()

cc_num        0
trans_num     0
trans_date    0
trans_time    0
unix_time     0
category      0
merchant      0
amt           0
merch_lat     0
merch_long    0
is_fraud      0
first         0
last          0
gender        0
street        0
city          0
state         0
zip           0
lat           0
long          0
job           0
dob           0
dtype: int64

In [24]:
len(df['trans_num'].unique())

11871

### drop columns:
`first, last name of user,
street name, trans_num`

In [None]:
df = df.drop(columns=['first','last', 'street','trans_num'])
df.columns

Index(['cc_num', 'trans_date', 'trans_time', 'unix_time', 'category',
       'merchant', 'amt', 'merch_lat', 'merch_long', 'is_fraud', 'gender',
       'city', 'state', 'zip', 'lat', 'long', 'job', 'dob'],
      dtype='object')

### Feature Engineering

In [None]:
categorical_col = ['cc_num','category', 'merchant','is_fraud', 'gender', 'state', 'job']
date_col = ['trans_date', 'dob']

In [36]:
df['dob'] = pd.to_datetime(df['dob']).dt.normalize()
df['trans_date'] = pd.to_datetime(df['trans_date']).dt.normalize()
df['age'] = df['trans_date'].dt.year - df['dob'].dt.year

categorical_col +=['age']

In [None]:
df['trans_time'] = pd.to_datetime(df['trans_time'])
df['trans_hour'] = df['trans_time'].dt.hour

categorical_col+=['trans_hour']

In [42]:
## find distance between user location and transaction location
# User coordinates
user_lat = df['lat']
user_long = df['long']

# Merchant coordinates
merch_lat = df['merch_lat']
merch_long = df['merch_long']

# Convert degrees to radians
lat1 = np.radians(user_lat)
lon1 = np.radians(user_long)
lat2 = np.radians(merch_lat)
lon2 = np.radians(merch_long)

# Haversine formula
dlat = lat2 - lat1
dlon = lon2 - lon1

a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
c = 2 * np.arcsin(np.sqrt(a))

R = 6371  # Earth radius in km

df['distance_km'] = R * c
df['distance_km'].describe()


count    11871.000000
mean         7.059771
std         28.558318
min          0.015227
25%          1.021778
50%          1.452968
75%          1.878533
max        244.447039
Name: distance_km, dtype: float64

In [45]:
for col in categorical_col:
    print(df[col].value_counts())

cc_num
4641003399120410    2016
675985166411        1971
4170242670039985    1340
5157595343543285    1308
6011779269963768    1023
4483018920250        996
4361646620879135     329
5421885738881170     198
4037295225657274     163
4354778868658084     156
370763211656868      147
180094108369013      127
6011537727192499     126
3593533875650654     120
5157436163845247     105
6011165671471311     102
3526015186182660     101
3561758568398109      97
38535403302699        97
4006862159277         97
4765125568595799      70
349326734419590       70
4361355512072         70
30157941709315        69
4368593032190508      69
30300776786251        69
4515092388857440      69
180036251237802       69
348933408404369       67
4738555317386146      65
4550801576257         64
30260722249657        46
5132731018032805      42
30021746099829        41
374115112731710       39
30170394853324        39
5264302655249852      39
30405027360515        39
4092259246729         38
5590294502817012  

### Analysis

### Model Training

1. Feature selection

2. Model training

3. Evaluation

4. store model

In [46]:
df.columns

Index(['cc_num', 'trans_date', 'trans_time', 'unix_time', 'category',
       'merchant', 'amt', 'merch_lat', 'merch_long', 'is_fraud', 'gender',
       'city', 'state', 'zip', 'lat', 'long', 'job', 'dob', 'age',
       'trans_hour', 'distance_km'],
      dtype='object')

In [48]:
selected_column = ['category', 'gender', 'state','trans_hour', 'amt', 'distance_km', 'age','is_fraud']
df_final = df[selected_column]

In [49]:
### Train test split
from sklearn.model_selection import train_test_split

X = df_final.drop(['is_fraud'],axis=1)
y = df_final['is_fraud']

X_train, X_test, y_train, y_test = train_test_split(X, 
    y, 
    test_size=0.2,      # 20% test
    random_state=42,    # reproducible
    stratify=y          # keep fraud balance
)
X_train.shape

(9496, 7)

In [None]:
cat_cols = ['category', 'gender', 'state' ]
num_cols = ['amt', 'distance_km', 'age']
target_col = ['is_fraud']

In [60]:
# convert category into one hot encoding - gender, catrgory, stage

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
import joblib

cat_cols = ['category', 'gender', 'state']

# 1. Create the encoder
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# 2. Build the ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('ohe', ohe, cat_cols)
    ],
    remainder='passthrough'   # keep other columns as is
)

# 3. Fit and transform the data
# train_encoded = preprocessor.fit_transform(X_train[cat_cols])
# X_train = X_train.drop(cat_cols)
# X_train = pd.concat([X_train, train_encoded])

preprocessor.fit(df_final[cat_cols])
X_train_encoded = preprocessor.transform(X_train)
X_test_encoded = preprocessor.transform(X_test)


joblib.dump(preprocessor, "../artifacts/onehot_encoder.pkl")


['../artifacts/onehot_encoder.pkl']

In [61]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, f1_score

# --------------------------------------
# TRAIN MULTIPLE MODELS
# --------------------------------------

models = {
    "LogisticRegression": LogisticRegression(max_iter=300),
    "RandomForest": RandomForestClassifier(n_estimators=300, random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "XGBoost": XGBClassifier(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=5,
        subsample=0.9,
        colsample_bytree=0.9,
        eval_metric="logloss",
        random_state=42
    )
}

results = {}
best_model = None
best_score = -np.inf
best_model_name = ""

# --------------------------------------
# TRAIN + EVALUATE MODELS
# --------------------------------------
for name, model in models.items():
    print(f"Training: {name}")
    model.fit(X_train_encoded, y_train)

    y_pred = model.predict(X_test_encoded)

    acc = accuracy_score(y_test, y_pred)
    f1  = f1_score(y_test, y_pred)

    results[name] = {"accuracy": acc, "f1_score": f1}

    print(f"{name} → Accuracy: {acc:.4f}, F1: {f1:.4f}")

    # Select best by F1 (recommended for fraud)
    if f1 > best_score:
        best_model = model
        best_score = f1
        best_model_name = name

# --------------------------------------
# SHOW RESULTS
# --------------------------------------
print("\n=== Model Comparison ===")
for name, metrics in results.items():
    print(f"{name}: {metrics}")

print(f"\nBest Model: {best_model_name}  (F1 = {best_score:.4f})")

# --------------------------------------
# SAVE BEST MODEL
# --------------------------------------
joblib.dump(best_model, "../artifacts/best_model.pkl")
print(f"\nSaved best model as best_model.pkl")


Training: LogisticRegression
LogisticRegression → Accuracy: 0.9583, F1: 0.1081
Training: RandomForest
RandomForest → Accuracy: 0.9579, F1: 0.2857
Training: GradientBoosting
GradientBoosting → Accuracy: 0.9592, F1: 0.2362
Training: XGBoost
XGBoost → Accuracy: 0.9575, F1: 0.2406

=== Model Comparison ===
LogisticRegression: {'accuracy': 0.9583157894736842, 'f1_score': 0.10810810810810811}
RandomForest: {'accuracy': 0.9578947368421052, 'f1_score': 0.2857142857142857}
GradientBoosting: {'accuracy': 0.9591578947368421, 'f1_score': 0.23622047244094488}
XGBoost: {'accuracy': 0.9574736842105264, 'f1_score': 0.24060150375939848}

Best Model: RandomForest  (F1 = 0.2857)

Saved best model as best_model.pkl
