In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.impute import SimpleImputer
from xgboost import XGBClassifier

In [2]:
data = pd.read_csv("fraudTrain.csv")
data_test = pd.read_csv("fraudTest.csv")
pd.set_option('display.max_columns', None)
data.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,state,zip,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,Moravian Falls,NC,28654,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,Orient,WA,99160,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,Malad City,ID,83252,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,Boulder,MT,59632,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,Doe Hill,VA,24433,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [3]:
data.columns

Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')

In [4]:
data=data.drop(columns=["Unnamed: 0","cc_num","merchant","first","last","gender","city","state","trans_num","street","job"])
data_test=data_test.drop(columns=["Unnamed: 0","cc_num","merchant","first","last","gender","city","state","trans_num","street","job"])
data.head()

Unnamed: 0,trans_date_trans_time,category,amt,zip,lat,long,city_pop,dob,unix_time,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:18,misc_net,4.97,28654,36.0788,-81.1781,3495,1988-03-09,1325376018,36.011293,-82.048315,0
1,2019-01-01 00:00:44,grocery_pos,107.23,99160,48.8878,-118.2105,149,1978-06-21,1325376044,49.159047,-118.186462,0
2,2019-01-01 00:00:51,entertainment,220.11,83252,42.1808,-112.262,4154,1962-01-19,1325376051,43.150704,-112.154481,0
3,2019-01-01 00:01:16,gas_transport,45.0,59632,46.2306,-112.1138,1939,1967-01-12,1325376076,47.034331,-112.561071,0
4,2019-01-01 00:03:06,misc_pos,41.96,24433,38.4207,-79.4629,99,1986-03-28,1325376186,38.674999,-78.632459,0


In [5]:
data.isnull().sum()

trans_date_trans_time    0
category                 0
amt                      0
zip                      0
lat                      0
long                     0
city_pop                 0
dob                      0
unix_time                0
merch_lat                0
merch_long               0
is_fraud                 0
dtype: int64

In [6]:
data['unix_time'] = pd.to_datetime(data['unix_time'])
data['dob'] = pd.to_datetime(data['dob'])
data['trans_date_trans_time'] = pd.to_datetime(data['trans_date_trans_time'])

In [7]:
data['trans_date_trans_time_int'] = data['trans_date_trans_time'].astype('int64') // 10**9
data['unix_time_int'] = data['unix_time'].astype('int64')
data['dob_int'] = data['dob'].astype('int64') // 10 ** 9
data.head()

Unnamed: 0,trans_date_trans_time,category,amt,zip,lat,long,city_pop,dob,unix_time,merch_lat,merch_long,is_fraud,trans_date_trans_time_int,unix_time_int,dob_int
0,2019-01-01 00:00:18,misc_net,4.97,28654,36.0788,-81.1781,3495,1988-03-09,1970-01-01 00:00:01.325376018,36.011293,-82.048315,0,1546300818,1325376018,573868800
1,2019-01-01 00:00:44,grocery_pos,107.23,99160,48.8878,-118.2105,149,1978-06-21,1970-01-01 00:00:01.325376044,49.159047,-118.186462,0,1546300844,1325376044,267235200
2,2019-01-01 00:00:51,entertainment,220.11,83252,42.1808,-112.262,4154,1962-01-19,1970-01-01 00:00:01.325376051,43.150704,-112.154481,0,1546300851,1325376051,-250905600
3,2019-01-01 00:01:16,gas_transport,45.0,59632,46.2306,-112.1138,1939,1967-01-12,1970-01-01 00:00:01.325376076,47.034331,-112.561071,0,1546300876,1325376076,-93744000
4,2019-01-01 00:03:06,misc_pos,41.96,24433,38.4207,-79.4629,99,1986-03-28,1970-01-01 00:00:01.325376186,38.674999,-78.632459,0,1546300986,1325376186,512352000


In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 15 columns):
 #   Column                     Non-Null Count    Dtype         
---  ------                     --------------    -----         
 0   trans_date_trans_time      1296675 non-null  datetime64[ns]
 1   category                   1296675 non-null  object        
 2   amt                        1296675 non-null  float64       
 3   zip                        1296675 non-null  int64         
 4   lat                        1296675 non-null  float64       
 5   long                       1296675 non-null  float64       
 6   city_pop                   1296675 non-null  int64         
 7   dob                        1296675 non-null  datetime64[ns]
 8   unix_time                  1296675 non-null  datetime64[ns]
 9   merch_lat                  1296675 non-null  float64       
 10  merch_long                 1296675 non-null  float64       
 11  is_fraud                   1296675 no

In [9]:
data = pd.get_dummies(data,drop_first=True)
data.head()

Unnamed: 0,trans_date_trans_time,amt,zip,lat,long,city_pop,dob,unix_time,merch_lat,merch_long,is_fraud,trans_date_trans_time_int,unix_time_int,dob_int,category_food_dining,category_gas_transport,category_grocery_net,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel
0,2019-01-01 00:00:18,4.97,28654,36.0788,-81.1781,3495,1988-03-09,1970-01-01 00:00:01.325376018,36.011293,-82.048315,0,1546300818,1325376018,573868800,False,False,False,False,False,False,False,True,False,False,False,False,False
1,2019-01-01 00:00:44,107.23,99160,48.8878,-118.2105,149,1978-06-21,1970-01-01 00:00:01.325376044,49.159047,-118.186462,0,1546300844,1325376044,267235200,False,False,False,True,False,False,False,False,False,False,False,False,False
2,2019-01-01 00:00:51,220.11,83252,42.1808,-112.262,4154,1962-01-19,1970-01-01 00:00:01.325376051,43.150704,-112.154481,0,1546300851,1325376051,-250905600,False,False,False,False,False,False,False,False,False,False,False,False,False
3,2019-01-01 00:01:16,45.0,59632,46.2306,-112.1138,1939,1967-01-12,1970-01-01 00:00:01.325376076,47.034331,-112.561071,0,1546300876,1325376076,-93744000,False,True,False,False,False,False,False,False,False,False,False,False,False
4,2019-01-01 00:03:06,41.96,24433,38.4207,-79.4629,99,1986-03-28,1970-01-01 00:00:01.325376186,38.674999,-78.632459,0,1546300986,1325376186,512352000,False,False,False,False,False,False,False,False,True,False,False,False,False


In [10]:
x_train=data.drop(columns=["is_fraud","trans_date_trans_time","unix_time","dob"])
y_train=data['is_fraud']

In [11]:
log = LogisticRegression(max_iter=1000)
log.fit(x_train,y_train) 

In [12]:
data = data_test
data['unix_time'] = pd.to_datetime(data['unix_time'])
data['dob'] = pd.to_datetime(data['dob'])
data['trans_date_trans_time'] = pd.to_datetime(data['trans_date_trans_time'])
data['trans_date_trans_time_int'] = data['trans_date_trans_time'].astype('int64') // 10**9
data['unix_time_int'] = data['unix_time'].astype('int64')
data['dob_int'] = data['dob'].astype('int64') // 10 ** 9
data = pd.get_dummies(data,drop_first=True)
x_test=data.drop(columns=["is_fraud","trans_date_trans_time","unix_time","dob"])
y_test=data['is_fraud']

In [13]:
y_pred = log.predict(x_test)
accuracy_score(y_test,y_pred)

0.9955175187459849