Step 1: select small dataset from largedataset

In [None]:
import pandas as pd

# Load your original dataset
df = pd.read_csv("Credit Card Fraud Risk Analysis.csv")

# Take only first 1000 rows
df_small = df.head(1000)

# Save it as a new file
df_small.to_csv("small_dataset.csv", index=False)


Step 2: show overview of small dataset 

In [None]:
import pandas as pd

# Load your dataset
df = pd.read_csv("small_dataset.csv")

# Basic shape of the data
print("üßæ Dataset Shape:", df.shape)

# First 5 rows
print("\nüîç First 5 Rows:")
display(df.head())

# Info about columns and data types
print("\n‚ÑπÔ∏è Dataset Info:")
df.info()

# Summary statistics for numeric columns
print("\nüìä Summary Statistics:")
display(df.describe())

# Check for missing values
print("\n‚ùó Missing Values in Each Column:")
print(df.isnull().sum())

# Check for duplicate rows
print("\nüìã Number of Duplicate Rows:", df.duplicated().sum())

# Display column names
print("\nü™∂ Column Names:")
print(df.columns.tolist())


Step 3: Data Cleaning & Preprocessing

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Step 1: Load dataset
df = pd.read_csv("small_dataset.csv")

# Step 2: Drop unnecessary columns
# These are mostly identifiers or text not useful for ML
df = df.drop(columns=[
    "Transaction ID", 
    "Customer Name", 
    "Merchant Name", 
    "Transaction Date", 
    "Merchant Location"
])

# Step 3: Encode categorical columns
label_encoder = LabelEncoder()
categorical_cols = df.select_dtypes(include=["object"]).columns

for col in categorical_cols:
    df[col] = label_encoder.fit_transform(df[col])

# Step 4: Separate features and target
X = df.drop(columns=["IsFraud"])
y = df["IsFraud"]

# Step 5: Scale numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 6: Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

# Step 7: Print shapes to verify
print("Training shape:", X_train.shape)
print("Testing shape:", X_test.shape)

# Step 8: Show processed sample
print("\n‚úÖ Preprocessing Complete. Sample:")
print(pd.DataFrame(X_train, columns=X.columns).head())


Training shape: (800, 8)
Testing shape: (200, 8)

‚úÖ Preprocessing Complete. Sample:
   Transaction Amount (INR)  Fraud Risk  Fraud Type     State  Card Type  \
0                 -0.749992   -0.930734    1.499900  1.553043   0.429265   
1                 -0.744501   -0.930734    0.042271  0.495352  -1.337258   
2                  0.543772    0.118571   -0.686543  0.495352  -0.453996   
3                  1.606914    1.167877    1.499900 -1.620030  -0.453996   
4                  1.560593    1.167877   -0.686543 -1.267467   0.429265   

       Bank  Fraud Score  Transaction Category  
0  0.752030     1.471007             -1.453254  
1  1.135328    -0.084750             -0.293897  
2 -0.781161    -1.329356              0.285781  
3 -1.547756    -0.123644             -0.873575  
4 -1.547756     0.148614             -0.293897  


Step 4: Model Training

In [None]:
print(df.head())
print(df.shape)
print(df['is_Fraud'].value_counts())


In [None]:
# =========================
# STEP 4: MODEL TRAINING
# =========================
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Initialize and train model
model = LogisticRegression(max_iter=1000, random_state=42)
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("‚úÖ Model Training Complete\n")
print(f"Accuracy: {accuracy:.3f} ({accuracy*100:.2f}%)\n")
print("üìä Confusion Matrix:")
print(cm)
print("\nüìã Classification Report:")
print(report)


‚úÖ Model Training Complete

Accuracy: 0.715 (71.50%)

üìä Confusion Matrix:
[[143   0]
 [ 57   0]]

üìã Classification Report:
              precision    recall  f1-score   support

           0       0.71      1.00      0.83       143
           1       0.00      0.00      0.00        57

    accuracy                           0.71       200
   macro avg       0.36      0.50      0.42       200
weighted avg       0.51      0.71      0.60       200



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
