In [6]:
import sys
import os

# Force the path to the root
project_root = r'C:\Users\HP\PycharmProjects\fraud-detection'
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Check if we can see the file itself
file_path = os.path.join(project_root, 'src', 'preprocessing.py')
print(f"Checking for file at: {file_path}")
print(f"File exists: {os.path.exists(file_path)}")

try:
    from src.preprocessing import merge_fraud_with_countries
    print("✅ ✅ IMPORT SUCCESSFUL!")
except Exception as e:
    print(f"❌ Still failing. Error: {e}")
    print("Logic: We will define the function locally to proceed with Task 1.")

Checking for file at: C:\Users\HP\PycharmProjects\fraud-detection\src\preprocessing.py
File exists: False
❌ Still failing. Error: No module named 'src'
Logic: We will define the function locally to proceed with Task 1.


In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import socket
import struct

# Load datasets
fraud_data = pd.read_csv('../data/Fraud_Data.csv')
ip_to_country = pd.read_csv('../data/IpAddress_to_Country.csv')

# Handle missing values and duplicates
fraud_data = fraud_data.drop_duplicates()
print(f"Initial Fraud Data Shape: {fraud_data.shape}")
print(f"Missing Values:\n{fraud_data.isnull().sum()}")

# Correct data types for timestamps
fraud_data['signup_time'] = pd.to_datetime(fraud_data['signup_time'])
fraud_data['purchase_time'] = pd.to_datetime(fraud_data['purchase_time'])

Initial Fraud Data Shape: (151112, 11)
Missing Values:
user_id           0
signup_time       0
purchase_time     0
purchase_value    0
device_id         0
source            0
browser           0
sex               0
age               0
ip_address        0
class             0
dtype: int64


In [13]:
import pandas as pd
import numpy as np
import socket
import struct

# 1. Load Data
fraud_data = pd.read_csv('../data/Fraud_Data.csv')
ip_to_country = pd.read_csv('../data/IpAddress_to_Country.csv')

# --- INTELLIGENT PREPROCESSING ---

def ip_to_int(ip):
    """Converts IPv4 string to integer. Returns None if it fails."""
    try:
        return struct.unpack("!I", socket.inet_aton(ip))[0]
    except:
        return None

# CHECK: Is the IP address already a number?
# Many datasets already come with IPs converted to floats.
if np.issubdtype(fraud_data['ip_address'].dtype, np.number):
    print("ℹ️ IP addresses are already numeric. Skipping socket conversion.")
    fraud_data['ip_address_int'] = fraud_data['ip_address']
else:
    print("ℹ️ IP addresses are strings (e.g., '192.168.1.1'). Converting...")
    fraud_data['ip_address_int'] = fraud_data['ip_address'].apply(ip_to_int)

# 2. Handle Missing Values
# Drop rows only if the IP is truly missing/invalid
fraud_data = fraud_data.dropna(subset=['ip_address_int'])
ip_to_country = ip_to_country.dropna(subset=['lower_bound_ip_address'])

# 3. Force Integer Types
fraud_data['ip_address_int'] = fraud_data['ip_address_int'].astype('int64')
ip_to_country['lower_bound_ip_address'] = ip_to_country['lower_bound_ip_address'].astype('int64')
ip_to_country['upper_bound_ip_address'] = ip_to_country['upper_bound_ip_address'].astype('int64')

# 4. Sort and Merge
fraud_data = fraud_data.sort_values('ip_address_int')
ip_to_country = ip_to_country.sort_values('lower_bound_ip_address')

print(f"Merging {len(fraud_data)} fraud records...")
merged_data = pd.merge_asof(
    fraud_data,
    ip_to_country,
    left_on='ip_address_int',
    right_on='lower_bound_ip_address'
)

# 5. Correct the 'Country' Logic (Keep strictly valid matches)
# merge_asof gives the closest match; we must ensure it's within the upper bound
merged_data['country'] = np.where(
    merged_data['ip_address_int'] <= merged_data['upper_bound_ip_address'],
    merged_data['country'],
    'Unknown'
)

print(f"✅ Merge Successful! Final Shape: {merged_data.shape}")
print(f"Top Countries:\n{merged_data['country'].value_counts().head()}")

ℹ️ IP addresses are already numeric. Skipping socket conversion.
Merging 151112 fraud records...
✅ Merge Successful! Final Shape: (151112, 15)
Top Countries:
country
United States     58049
Unknown           21966
China             12038
Japan              7306
United Kingdom     4490
Name: count, dtype: int64


Step 3: Feature Engineering & Data Transformation

In [15]:
# 1. Force Convert Columns to Datetime (Fixes the TypeError)
print("Converting timestamps to datetime objects...")
merged_data['purchase_time'] = pd.to_datetime(merged_data['purchase_time'])
merged_data['signup_time'] = pd.to_datetime(merged_data['signup_time'])

# 2. Re-run Feature Engineering
print("calculating time features...")
# Now the subtraction will work because they are dates, not strings
merged_data['time_since_signup'] = (merged_data['purchase_time'] - merged_data['signup_time']).dt.total_seconds()
merged_data['hour_of_day'] = merged_data['purchase_time'].dt.hour
merged_data['day_of_week'] = merged_data['purchase_time'].dt.dayofweek

# 3. Velocity Feature
merged_data['device_usage_count'] = merged_data.groupby('device_id')['device_id'].transform('count')

# 4. Final Check
print("✅ Feature Engineering Complete!")
print(merged_data[['time_since_signup', 'hour_of_day']].head())

Converting timestamps to datetime objects...
calculating time features...
✅ Feature Engineering Complete!
   time_since_signup  hour_of_day
0          1763014.0           10
1          1084823.0           17
2           749320.0            8
3          7434634.0           21
4          1407619.0            7


In [16]:
# --- 1. HANDLE HIGH CARDINALITY (Country) ---
# We keep the top 10 countries and label the rest as 'Other'
# This prevents creating 180+ columns which would slow down the model
top_countries = merged_data['country'].value_counts().index[:10]
merged_data['country_processed'] = merged_data['country'].apply(lambda x: x if x in top_countries else 'Other')

# --- 2. ONE-HOT ENCODING ---
# Convert text categories (Source, Browser, Sex, Country) into binary columns (0 or 1)
columns_to_encode = ['source', 'browser', 'sex', 'country_processed']
# We use drop_first=True to reduce redundancy (e.g., if not Male, it's Female)
df_encoded = pd.get_dummies(merged_data, columns=columns_to_encode, drop_first=True)

# --- 3. DROP UNNECESSARY COLUMNS ---
# Remove ID columns and original timestamps (we already extracted the useful info)
cols_to_drop = ['user_id', 'device_id', 'ip_address', 'ip_address_int',
                'signup_time', 'purchase_time', 'country',
                'lower_bound_ip_address', 'upper_bound_ip_address']
df_model = df_encoded.drop(columns=cols_to_drop)

# --- 4. SEPARATE FEATURES (X) AND TARGET (y) ---
X = df_model.drop(columns=['class'])
y = df_model['class']

print(f"✅ Data Transformation Complete!")
print(f"Final Features shape: {X.shape}")
print(f"Features list: {list(X.columns[:10])}...") # Preview first 10 columns

✅ Data Transformation Complete!
Final Features shape: (151112, 23)
Features list: ['purchase_value', 'age', 'time_since_signup', 'hour_of_day', 'day_of_week', 'device_usage_count', 'source_Direct', 'source_SEO', 'browser_FireFox', 'browser_IE']...


In [20]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# --- 1. TRAIN-TEST SPLIT ---
# We use 'stratify=y' to ensure both train and test sets have the same % of fraud
print("Splitting data...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# --- 2. SCALING (StandardScaler) ---
# Fit ONLY on training data, then transform both
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- 3. APPLY SMOTE (Training Data Only) ---
print("Applying SMOTE to handle imbalance...")
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

# --- 4. VERIFY RESULTS ---
print("-" * 30)
print(f"Original Training Shape: {X_train.shape}")
print(f"Original Fraud Count: {sum(y_train == 1)}")
print("-" * 30)
print(f"Resampled Training Shape: {X_train_resampled.shape}")
print(f"Resampled Fraud Count: {sum(y_train_resampled == 1)}")
print("-" * 30)
print("✅ Data is split, scaled, and balanced. Ready for Model Building!")
# Save the processed data for Task 2
print("Saving processed data...")
# combining X and y just for saving
processed_data = pd.DataFrame(X, columns=X.columns)
processed_data['class'] = y

# Save to the 'processed' folder
processed_data.to_csv('../data/processed/fraud_data_processed.csv', index=False)
print("✅ File saved to 'data/processed/fraud_data_processed.csv'")

Splitting data...
Applying SMOTE to handle imbalance...
------------------------------
Original Training Shape: (120889, 23)
Original Fraud Count: 11321
------------------------------
Resampled Training Shape: (219136, 23)
Resampled Fraud Count: 109568
------------------------------
✅ Data is split, scaled, and balanced. Ready for Model Building!
Saving processed data...


OSError: Cannot save file into a non-existent directory: '..\data\processed'

In [21]:
import os

# 1. Define the file path
save_path = '../data/processed/fraud_data_processed.csv'

# 2. Check if the directory exists; if not, create it!
# os.path.dirname(save_path) gets "../data/processed"
directory = os.path.dirname(save_path)
if not os.path.exists(directory):
    print(f"Directory '{directory}' not found. Creating it now...")
    os.makedirs(directory)

# 3. Save the file
print("Saving processed data...")
processed_data.to_csv(save_path, index=False)
print(f"✅ File successfully saved to '{save_path}'")

Directory '../data/processed' not found. Creating it now...
Saving processed data...
✅ File successfully saved to '../data/processed/fraud_data_processed.csv'


In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, auc

# 1. Load the processed data
print("Loading data...")
df = pd.read_csv('../data/processed/fraud_data_processed.csv')

# 2. Separate Features (X) and Target (y)
X = df.drop(columns=['class'])
y = df['class']

# 3. Stratified Train-Test Split (80/20)
# Stratify ensures we have the same proportion of fraud in both sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 4. Scale the Data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 5. Apply SMOTE (Only to Training Data!)
print("Applying SMOTE...")
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)

print(f"✅ Data Ready. Training Shape: {X_train_res.shape}")

Loading data...
Applying SMOTE...
✅ Data Ready. Training Shape: (219136, 23)


In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_curve, auc

# 1. Load the processed data
print("Loading data...")
df = pd.read_csv('../data/processed/fraud_data_processed.csv')

# 2. Separate Features (X) and Target (y)
X = df.drop(columns=['class'])
y = df['class']

# 3. Stratified Train-Test Split (80/20)
# Stratify ensures we have the same proportion of fraud in both sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 4. Scale the Data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 5. Apply SMOTE (Only to Training Data!)
print("Applying SMOTE...")
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)

print(f"✅ Data Ready. Training Shape: {X_train_res.shape}")

Loading data...
Applying SMOTE...
✅ Data Ready. Training Shape: (219136, 23)
