In [22]:
import pandas as pd
import numpy as np
import geohash2
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor, early_stopping, log_evaluation
from sentence_transformers import SentenceTransformer
from sklearn.neighbors    import NearestNeighbors
from sklearn.metrics      import mean_absolute_error
import joblib
from sklearn.metrics import f1_score, accuracy_score, classification_report

In [2]:
# Location of the input CSV, then read it in full into a DataFrame.
file = 'creditcard.csv'
df = pd.read_csv(filepath_or_buffer=file)

In [3]:
# Preview the first three rows to check the column layout.
df.head(3)

Unnamed: 0,id,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-0.260648,-0.469648,2.496266,-0.083724,0.129681,0.732898,0.519014,-0.130006,0.727159,...,-0.110552,0.217606,-0.134794,0.165959,0.12628,-0.434824,-0.08123,-0.151045,17982.1,0
1,1,0.9851,-0.356045,0.558056,-0.429654,0.27714,0.428605,0.406466,-0.133118,0.347452,...,-0.194936,-0.605761,0.079469,-0.577395,0.19009,0.296503,-0.248052,-0.064512,6531.37,0
2,2,-0.260272,-0.949385,1.728538,-0.457986,0.074062,1.419481,0.743511,-0.095576,-0.261297,...,-0.00502,0.702906,0.945045,-1.154666,-0.605564,-0.312895,-0.300258,-0.244718,2513.54,0


In [4]:
# Preview the last three rows as well.
df.tail(3)

Unnamed: 0,id,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
568627,568627,-0.311997,-0.004095,0.137526,-0.035893,-0.042291,0.121098,-0.070958,-0.019997,-0.122048,...,0.140788,0.536523,-0.2111,-0.448909,0.540073,-0.755836,-0.48754,-0.268741,23572.85,1
568628,568628,0.636871,-0.51697,-0.300889,-0.14448,0.131042,-0.294148,0.580568,-0.207723,0.893527,...,-0.060381,-0.195609,-0.175488,-0.554643,-0.099669,-1.434931,-0.159269,-0.076251,10160.83,1
568629,568629,-0.795144,0.433236,-0.64914,0.374732,-0.244976,-0.603493,-0.347613,-0.340814,0.253971,...,0.534853,-0.291514,0.157303,0.93103,-0.349423,-1.090974,-1.575113,0.722936,21493.92,1


In [5]:
# Both classes have exactly the same number of samples,
# so the dataset is balanced and neither class is rare.
df['Class'].value_counts()

Class
0    284315
1    284315
Name: count, dtype: int64

In [6]:
# Summarise column dtypes and non-null counts to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568630 entries, 0 to 568629
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   id      568630 non-null  int64  
 1   V1      568630 non-null  float64
 2   V2      568630 non-null  float64
 3   V3      568630 non-null  float64
 4   V4      568630 non-null  float64
 5   V5      568630 non-null  float64
 6   V6      568630 non-null  float64
 7   V7      568630 non-null  float64
 8   V8      568630 non-null  float64
 9   V9      568630 non-null  float64
 10  V10     568630 non-null  float64
 11  V11     568630 non-null  float64
 12  V12     568630 non-null  float64
 13  V13     568630 non-null  float64
 14  V14     568630 non-null  float64
 15  V15     568630 non-null  float64
 16  V16     568630 non-null  float64
 17  V17     568630 non-null  float64
 18  V18     568630 non-null  float64
 19  V19     568630 non-null  float64
 20  V20     568630 non-null  float64
 21  V21     56

In [None]:
# The dataset appears to be clean already: df.info() reports no missing values.
# It is also not too big, so a simple model can be trained on it directly.

In [8]:
# Print the smallest and largest value of every column, one column per line
# (lines appear in column order).
for i, values in df.items():
    print(values.min(), values.max())

0 568629
-3.495583516386668 2.22904613004356
-49.96657153869079 4.361865196721416
-3.1837603416948093 14.125833911866232
-4.951222429093022 3.201535546069201
-9.952785617741023 42.716890639914205
-21.11110792759147 26.168402294404643
-4.351839315074907 217.873038474627
-10.75634230545734 5.958040147327273
-3.751918738076145 20.270062075837107
-3.163275761885778 31.72270910795672
-5.954723293982779 2.5135727491214537
-2.020399318328518 17.913556111364983
-5.955226700040189 7.187485954748435
-2.1074168038580363 19.169544406102982
-3.8618127653411154 14.532202180325108
-2.214512888665661 46.6529060440468
-2.484938386554947 6.994124024684426
-2.4219487219161318 6.783716009168727
-7.80498794807604 3.8316716979071006
-78.14783856605457 29.872812160323736
-19.382523087206284 8.087080028016498
-7.734798174224937 12.63251122579015
-30.295450154840687 31.707626578253517
-4.067967795102357 12.965638661146754
-13.612633178980907 14.621509105774306
-8.226969338778424 5.623285408193404
-10.498633077

In [11]:
# Drop the row-identifier column before modelling.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after 'id' is already gone no longer raises
# a KeyError.
df = df.drop(columns=['id'], errors='ignore')

In [12]:
# Confirm the 'id' column is no longer present.
df.head(1)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-0.260648,-0.469648,2.496266,-0.083724,0.129681,0.732898,0.519014,-0.130006,0.727159,0.637735,...,-0.110552,0.217606,-0.134794,0.165959,0.12628,-0.434824,-0.08123,-0.151045,17982.1,0


In [18]:
# Separate the target label from the feature columns.
Y = df['Class']
X = df.drop(columns='Class')
X.shape, Y.shape

((568630, 29), (568630,))

In [25]:
# Hold out 20% of the rows for testing. Stratifying on Y keeps the class ratio
# the same in both partitions, and the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    shuffle=True,
    stratify=Y,
    random_state=42,
)
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((454904, 29), (113726, 29), (454904,), (113726,))

In [26]:
# Peek at one held-out row; its index shows the rows were shuffled.
x_test.head(1)

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
510747,0.701467,-0.062754,0.405692,0.374299,0.425804,0.476822,0.34643,-0.405724,1.292818,0.41937,...,-0.522569,0.115249,-1.062692,0.098032,-0.49847,0.464774,-0.360343,-0.258456,-0.025794,2056.65


In [27]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, cross_val_score

In [28]:
# LightGBM binary classifier used as the only estimator in the pipeline.
model = LGBMClassifier(
    # learning task and the loss LightGBM reports for it
    objective='binary',
    metric='binary_logloss',
    # boosting schedule: up to 1000 trees at a 0.1 learning rate
    learning_rate=0.1,
    n_estimators=1000,
    # tree shape: a depth-5 tree has at most 2**5 = 32 leaves, so
    # num_leaves=31 sits just under that cap
    max_depth=5,
    num_leaves=31,
)

In [29]:
# Single-step pipeline: the classifier is registered under the name "cls".
pipeline = Pipeline(steps=[("cls", model)])

In [30]:
# Train the pipeline on the training partition.
pipeline.fit(X=x_train, y=y_train)

[LightGBM] [Info] Number of positive: 227452, number of negative: 227452
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.019632 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7395
[LightGBM] [Info] Number of data points in the train set: 454904, number of used features: 29
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


In [31]:
# Predict class labels for the held-out rows.
pred = pipeline.predict(X=x_test)

In [32]:
# F1 score of the predictions against the true test labels.
f1_score(y_true=y_test, y_pred=pred)

0.9975936204594955