In [10]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings("ignore") 

# --- Haversine Function (Must be identical to preprocessing) ---
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    Uses the haversine formula with an Earth radius of 6371 km.

    Parameters
    ----------
    lat1, lon1 : latitude and longitude of the first point, in degrees.
    lat2, lon2 : latitude and longitude of the second point, in degrees.
        Each argument is passed through ``np.radians``, so scalars and
        array-likes are both accepted; the arithmetic is element-wise.

    Returns
    -------
    Distance(s) in kilometres, with the shape produced by NumPy's
    element-wise arithmetic on the inputs.
    """
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Floating-point rounding can leave `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would make the square roots below return
    # NaN. Clamping does not alter any value that is already inside the range.
    a = np.minimum(np.maximum(a, 0.0), 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    r = 6371  # Radius of earth in kilometers.
    return c * r

print("Libraries and helper function loaded.")

Libraries and helper function loaded.


In [11]:
# --- 1. Load Saved Tools ---
# Three pickled artifacts are read from the working directory: the classifier,
# the fitted scaler, and the feature/column list used later to align new data.
MODEL_FILE = 'rf_model.pkl'
SCALER_FILE = 'scaler.pkl'
FEATURES_FILE = 'model_features.pkl'

# SECURITY NOTE: pickle.load can execute arbitrary code contained in the file.
# Only load artifacts that come from a trusted source (presumably the training
# notebook named in the error message below -- confirm).
try:
    with open(MODEL_FILE, 'rb') as f:
        model = pickle.load(f)
    with open(SCALER_FILE, 'rb') as f:
        scaler = pickle.load(f)
    with open(FEATURES_FILE, 'rb') as f:
        model_features = pickle.load(f)

    print("Model, Scaler, and Feature List loaded successfully.")
except FileNotFoundError:
    print("Error: Model/Scaler/Features files not found. Run '2_Model_Training.ipynb' first.")
    # Re-raise so execution stops here. Continuing would leave `model`,
    # `scaler` or `model_features` undefined and surface later as a confusing
    # NameError in an unrelated cell.
    raise

Model, Scaler, and Feature List loaded successfully.


In [12]:
# --- 2. Load and Process New Data ---
RAW_DATA_FILE = "fraudTrain.csv"
try:
    df_new = pd.read_csv(RAW_DATA_FILE)
    print(f"New data '{RAW_DATA_FILE}' loaded. Shape: {df_new.shape}")
except FileNotFoundError:
    print(f"Error: '{RAW_DATA_FILE}' not found.")
    # Re-raise: every statement below needs `df_new`, so continuing would only
    # replace this clear error with a NameError a few lines later.
    raise

# --- START FEATURE ENGINEERING (Must match notebook 1) ---
print("Applying preprocessing steps...")

# A: Time & Date Features
df_new['trans_date_trans_time'] = pd.to_datetime(df_new['trans_date_trans_time'])
df_new['hour'] = df_new['trans_date_trans_time'].dt.hour
df_new['day_of_week'] = df_new['trans_date_trans_time'].dt.dayofweek
df_new['month'] = df_new['trans_date_trans_time'].dt.month

# B: Age Feature (whole years between date of birth and transaction time)
df_new['dob'] = pd.to_datetime(df_new['dob'])
df_new['age'] = (df_new['trans_date_trans_time'] - df_new['dob']).dt.days / 365.25
df_new['age'] = df_new['age'].astype(int)

# C: Location Distance Feature (km between the lat/long and merch_lat/merch_long columns)
df_new['distance_km'] = haversine(df_new['lat'], df_new['long'], df_new['merch_lat'], df_new['merch_long'])

# D: Encode 'gender'
# NOTE(review): the encoder is re-fitted on the incoming data, so the integer
# codes depend on which gender values this particular file contains. If a file
# ever lacks a value that was present at training time, the codes will shift.
# Consider persisting the fitted encoder (or a fixed mapping) from training.
le = LabelEncoder()
df_new['gender'] = le.fit_transform(df_new['gender'])

# E: One-Hot Encode 'category' and 'state'
# Every dummy column is created here (no drop_first). With drop_first=True the
# dropped column is whichever category sorts first in *this* file; if that is
# not the baseline dropped at training time, a real feature is discarded and
# later zero-filled. The alignment step selects only the columns listed in
# `model_features`, so any extra baseline column produced here is ignored.
df_new = pd.get_dummies(df_new, columns=['category', 'state'], dtype=int)

print("Preprocessing complete.")

New data 'fraudTrain.csv' loaded. Shape: (1296675, 23)
Applying preprocessing steps...
Preprocessing complete.


In [13]:
# --- 3. Align Columns ---
# We now use the list we loaded from the file:
print(f"Aligning to {len(model_features)} features...")

# Surface any expected feature the new data does not provide; these columns are
# filled with 0 below, which is only appropriate for absent one-hot dummies.
missing_features = [col for col in model_features if col not in df_new.columns]
if missing_features:
    print(f"Note: {len(missing_features)} feature(s) absent from new data, filled with 0: {missing_features}")

# Select the model's columns in the model's order. Columns missing from df_new
# are created and filled with 0; extra columns in df_new are dropped. This
# avoids building an all-NaN object-dtype frame and copying column by column.
X_predict = df_new.reindex(columns=list(model_features), fill_value=0)

print(f"Columns aligned. Shape for prediction: {X_predict.shape}")

# --- 4. Scale Data ---
# Use the *loaded* scaler (do NOT .fit() again!)
X_predict_scaled = scaler.transform(X_predict)
print("New data scaled.")

Aligning to 73 features...
Columns aligned. Shape for prediction: (1296675, 73)
New data scaled.


In [14]:
# --- 5. Make Predictions ---
# Hard class labels from the model, plus column 1 of predict_proba as the score
# (assumed to be the fraud class -- TODO confirm against model.classes_).
predictions = model.predict(X_predict_scaled)
probabilities = model.predict_proba(X_predict_scaled)[:, 1]

# Attach both results to the working dataframe so they can be reviewed
# alongside the transaction fields.
df_new['is_fraud_PREDICTED'] = predictions
df_new['fraud_probability'] = probabilities

print("Predictions generated.")

# --- 6. Show Results ---
# Keep only the rows whose predicted label is 1.
is_flagged = df_new['is_fraud_PREDICTED'].eq(1)
fraud_alerts = df_new.loc[is_flagged]

print(f"\nTotal transactions analyzed: {df_new.shape[0]}")
print(f"Transactions flagged as potential fraud: {fraud_alerts.shape[0]}")

# Rank the flagged rows by score, highest first, and print the first 20.
print("\n--- Top 20 Most Likely Fraudulent Transactions ---")
display_cols = ['cc_num', 'merchant', 'amt', 'fraud_probability', 'is_fraud_PREDICTED']
ranked_alerts = fraud_alerts.sort_values(by='fraud_probability', ascending=False)
top_alerts = ranked_alerts[display_cols].head(20)
print(top_alerts.to_markdown(index=False, floatfmt=".4f"))

Predictions generated.

Total transactions analyzed: 1296675
Transactions flagged as potential fraud: 7373

--- Top 20 Most Likely Fraudulent Transactions ---
|           cc_num | merchant                                |       amt |   fraud_probability |   is_fraud_PREDICTED |
|-----------------:|:----------------------------------------|----------:|--------------------:|---------------------:|
| 3524574586339330 | fraud_Schumm PLC                        | 1210.9100 |              1.0000 |                    1 |
| 3524574586339330 | fraud_Kassulke PLC                      |  977.0100 |              1.0000 |                    1 |
| 6564459919350820 | fraud_Johnson, Runolfsdottir and Mayer  |  725.6000 |              1.0000 |                    1 |
| 3573030041201292 | fraud_Crist, Jakubowski and Littel      |  222.6900 |              1.0000 |                    1 |
| 3560725013359375 | fraud_Koepp-Witting                     |  352.6700 |              1.0000 |                    1 |
|