In [7]:

import pandas as pd
from sklearn.preprocessing import StandardScaler
import mlflow
import uuid
from datetime import datetime
import os
import io
import kagglehub

# Fetch the latest version of the Kaggle dataset; the call returns the
# directory that holds the downloaded files.
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
print("Path to dataset files:", path)

# The CSV sits inside the download directory, so its location is derived from
# `path` instead of being typed in by hand.
csv_path = os.path.join(path, "creditcard.csv")

# Read the file and keep a reproducible 10,000-row random sample
# (fixed random_state), renumbering the rows from 0.
df = (
    pd.read_csv(csv_path)
    .sample(n=10000, random_state=42)
    .reset_index(drop=True)
)

# Per-column count of missing values; every count is expected to be 0.
print(df.isnull().sum())

# Discard rows that are exact duplicates of an earlier row.
df = df.drop_duplicates()

# Standardize Amount into a new column. The fitted scaler is kept in `scaler`
# so the same transformation can be re-applied later.
scaler = StandardScaler()
df['scaled_amount'] = scaler.fit_transform(df[['Amount']])

# The raw Amount and Time columns are not carried forward.
df = df.drop(columns=['Amount', 'Time'])
# === Step 2: Add Required Columns ===

# One freshly generated random UUID (as a string) per row, used as the record identifier.
df["TransactionID"] = [str(uuid.uuid4()) for _ in range(len(df))]
# A single ISO-8601 timestamp (naive, UTC clock) taken once and broadcast to
# every row, so all records share the same EventTime.
# NOTE(review): datetime.utcnow() is deprecated since Python 3.12; it is kept
# here so the stored string format (no UTC offset suffix) stays unchanged.
df["EventTime"] = datetime.utcnow().isoformat()

# === Step 3: Save Locally as Parquet ===
parquet_path = "data/fraud_data.parquet"
# to_parquet does not create missing parent directories, so make sure the
# target folder exists; otherwise the write fails on a fresh checkout.
os.makedirs(os.path.dirname(parquet_path), exist_ok=True)
df.to_parquet(parquet_path, index=False)
print("✅ Parquet file saved:", parquet_path)

# Read the file back into `df` to confirm that what was written is loadable.
df = pd.read_parquet(parquet_path)

print("✅ Parquet file loaded successfully.")

Path to dataset files: C:\Users\aman1\.cache\kagglehub\datasets\mlg-ulb\creditcardfraud\versions\3
Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64
✅ Parquet file saved: data/fraud_data.parquet
✅ Parquet file loaded successfully.
