In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [8]:
def haversine(lat1, lon1, lat2, lon2):
    """
    Calculate the great-circle distance in kilometers between two points 
    on the earth (specified in decimal degrees)
    """
    # Convert decimal degrees to radians
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    # Haversine formula
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    r = 6371 # Radius of earth in kilometers.
    return c * r

In [9]:
# --- 1. Load Data ---
print("Loading 'fraudTrain.csv'...")
df = pd.read_csv("fraudTrain.csv")
print(f"Original shape: {df.shape}")
df.head()

Loading 'fraudTrain.csv'...
Original shape: (1296675, 23)


Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [10]:
# --- 2. Drop Unnecessary/Identifier Columns ---
# Drop IDs, PII, and high-cardinality columns
cols_to_drop = [
    'Unnamed: 0',
    'cc_num',
    'first',
    'last',
    'street',
    'city',
    'job',
    'trans_num',
    'merchant'
]
df.drop(columns=cols_to_drop, inplace=True, errors='ignore')
print(f"Dropped {len(cols_to_drop)} unnecessary columns.")
print(f"Shape after dropping: {df.shape}")

Dropped 9 unnecessary columns.
Shape after dropping: (1296675, 14)


In [11]:
# --- 3. Handle Duplicates ---
dup_count_before = df.duplicated().sum()
if dup_count_before > 0:
    df.drop_duplicates(inplace=True)
    print(f"Removed {dup_count_before} duplicate rows.")
else:
    print("No duplicate rows found.")

# --- 4. Handle Missing Values (Robustness) ---
for col in df.columns:
    if df[col].dtype == 'object':
        df[col].fillna(df[col].mode()[0], inplace=True)
    else:
        df[col].fillna(df[col].median(), inplace=True)
print("Missing values check complete.")

No duplicate rows found.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)


Missing values check complete.


In [12]:
# --- 5. Feature Engineering ---
print("Starting feature engineering...")

# A: Time & Date Features
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
df['hour'] = df['trans_date_trans_time'].dt.hour
df['day_of_week'] = df['trans_date_trans_time'].dt.dayofweek
df['month'] = df['trans_date_trans_time'].dt.month
print("   - Time features created.")

# B: Age Feature
df['dob'] = pd.to_datetime(df['dob'])
df['age'] = (df['trans_date_trans_time'] - df['dob']).dt.days / 365.25
df['age'] = df['age'].astype(int)
print("   - Age feature created.")

# C: Location Distance Feature
df['distance_km'] = haversine(df['lat'], df['long'], df['merch_lat'], df['merch_long'])
print("   - Transaction distance (km) feature created.")

# D: Drop original columns used for engineering
df.drop(columns=['trans_date_trans_time', 'dob', 'lat', 'long', 'merch_lat', 'merch_long'], inplace=True)
print("   - Original engineering columns dropped.")

Starting feature engineering...
   - Time features created.
   - Age feature created.
   - Transaction distance (km) feature created.
   - Original engineering columns dropped.


In [13]:
# --- 6. Encode Categorical Data ---
print("Encoding categorical features...")

# Binary feature: 'gender'
le = LabelEncoder()
df['gender'] = le.fit_transform(df['gender'])
print("   - 'gender' label encoded.")

# Multi-class nominal features: 'category', 'state'
df = pd.get_dummies(df, columns=['category', 'state'], drop_first=True, dtype=int)
print("   - 'category' and 'state' one-hot encoded.")

Encoding categorical features...
   - 'gender' label encoded.
   - 'category' and 'state' one-hot encoded.


In [15]:
# --- 7. Final Check & Save ---
print("\n" + "="*30)
print("    FINAL CLEANING COMPLETE")
print("="*30)

# Note: Scaling (StandardScaler) should be done *after*
# train/test split in the modeling notebook.

df.to_csv("fraud_Train_cleaned_BEST.csv", index=False)
print(f"Cleaned dataset saved as 'fraud_Train_cleaned_.csv'")
print(f"Final shape: {df.shape}")

print("\nFinal DataFrame head:")
print(df.head())


    FINAL CLEANING COMPLETE
Cleaned dataset saved as 'fraud_Train_cleaned_.csv'
Final shape: (1296675, 74)

Final DataFrame head:
      amt  gender    zip  city_pop   unix_time  is_fraud  hour  day_of_week  \
0    4.97       0  28654      3495  1325376018         0     0            1   
1  107.23       0  99160       149  1325376044         0     0            1   
2  220.11       1  83252      4154  1325376051         0     0            1   
3   45.00       1  59632      1939  1325376076         0     0            1   
4   41.96       1  24433        99  1325376186         0     0            1   

   month  age  ...  state_SD  state_TN  state_TX  state_UT  state_VA  \
0      1   30  ...         0         0         0         0         0   
1      1   40  ...         0         0         0         0         0   
2      1   56  ...         0         0         0         0         0   
3      1   51  ...         0         0         0         0         0   
4      1   32  ...         0      