In [39]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [40]:
df = pd.read_csv(r'C:\Users\JOE\Desktop\task 2\credit_card_purchases.csv')

In [41]:
df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,merch_zipcode
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0,28705.0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0,
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0,83236.0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0,
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0,22844.0


In [42]:
missing_values = df.isnull().sum()
print("Missing values in each column:\n", missing_values)

Missing values in each column:
 Unnamed: 0                    0
trans_date_trans_time         0
cc_num                        0
merchant                      0
category                      0
amt                           0
first                         0
last                          0
gender                        0
street                        0
city                          0
state                         0
zip                           0
lat                           0
long                          0
city_pop                      0
job                           0
dob                           0
trans_num                     0
unix_time                     0
merch_lat                     0
merch_long                    0
is_fraud                      0
merch_zipcode            195973
dtype: int64


In [43]:
numeric_cols = df.select_dtypes(include=[np.number]).columns
print("Numeric columns:", numeric_cols)

Numeric columns: Index(['Unnamed: 0', 'cc_num', 'amt', 'zip', 'lat', 'long', 'city_pop',
       'unix_time', 'merch_lat', 'merch_long', 'is_fraud', 'merch_zipcode'],
      dtype='object')


In [44]:
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())


In [45]:
categorical_cols = df.select_dtypes(include=[object]).columns
print("Categorical columns:", categorical_cols)
df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])


Categorical columns: Index(['trans_date_trans_time', 'merchant', 'category', 'first', 'last',
       'gender', 'street', 'city', 'state', 'job', 'dob', 'trans_num'],
      dtype='object')


In [46]:
Q1 = df[numeric_cols].quantile(0.25)
Q3 = df[numeric_cols].quantile(0.75)
IQR = Q3 - Q1
outliers = ((df[numeric_cols] < (Q1 - 1.5 * IQR)) | (df[numeric_cols] > (Q3 + 1.5 * IQR))).sum()
print("Outliers in each column:\n", outliers)

Outliers in each column:
 Unnamed: 0            0
cc_num           118789
amt               67290
zip                   0
lat                4679
long              49922
city_pop         242674
unix_time             0
merch_lat          4967
merch_long        41994
is_fraud           7506
merch_zipcode         0
dtype: int64


In [47]:
df = df[~((df[numeric_cols] < (Q1 - 1.5 * IQR)) | (df[numeric_cols] > (Q3 + 1.5 * IQR))).any(axis=1)]

In [48]:
scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

In [49]:
X = df.drop('is_fraud', axis=1)  # Drop the target column for features
y = df['is_fraud']


In [50]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [51]:
print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

Training set shape: (703953, 23)
Testing set shape: (175989, 23)


In [52]:
print("Processed Data Information:")
print(df.info())

print("Processed Data Description:")
print(df.describe())

Processed Data Information:
<class 'pandas.core.frame.DataFrame'>
Index: 879942 entries, 0 to 1296673
Data columns (total 24 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             879942 non-null  float64
 1   trans_date_trans_time  879942 non-null  object 
 2   cc_num                 879942 non-null  float64
 3   merchant               879942 non-null  object 
 4   category               879942 non-null  object 
 5   amt                    879942 non-null  float64
 6   first                  879942 non-null  object 
 7   last                   879942 non-null  object 
 8   gender                 879942 non-null  object 
 9   street                 879942 non-null  object 
 10  city                   879942 non-null  object 
 11  state                  879942 non-null  object 
 12  zip                    879942 non-null  float64
 13  lat                    879942 non-null  float64
 14  long        