In [None]:
# Task 2 - Credit Card Fraud Detection
# ---------------------------------------------------
# Step 1: Import Libraries
# ---------------------------------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Step 2: Load Dataset
# ---------------------------------------------------
import os

# Working directory that contains the "Fraud Detection Dataset" folder.
# The original hard-coded path stays as the default, so behaviour is unchanged
# where that directory exists; set the TASK2_DIR environment variable to run
# the script on another machine without editing the source.
project_dir = os.environ.get("TASK2_DIR", "/Users/syedmadni/Desktop/Internship/Task2")
os.chdir(project_dir)

# The CSV path is relative to the working directory set above.
df = pd.read_csv("Fraud Detection Dataset/fraudTrain.csv")
print("Dataset shape:", df.shape)

# Step 3: Quick Data Info
# ---------------------------------------------------
# DataFrame.info() writes its summary to stdout itself and returns None, so
# it must not be wrapped in print() (that would emit a stray "None" line).
df.info()

# Number of rows for each value of the 'is_fraud' column.
print(df['is_fraud'].value_counts())

# Step 4: Clean and Preprocess Data
# ---------------------------------------------------
# Drop columns that should not be fed to the model:
#   - 'Unnamed: 0' appears to be the row index that was saved into the CSV
#     (int64, one value per row); it carries no information about a
#     transaction and would otherwise be treated as a numeric feature.
#   - 'cc_num' and 'trans_num' are identifiers, like the name/address columns
#     below; keeping an identifier lets the model memorise individual cards
#     instead of learning transaction patterns.
#   - the remaining entries are the timestamp and free-text columns that the
#     original script already excluded.
drop_cols = ['Unnamed: 0', 'cc_num',
             'trans_date_trans_time', 'unix_time', 'merchant', 'first', 'last',
             'street', 'city', 'state', 'job', 'dob', 'trans_num']

# errors='ignore' skips any listed column that is absent from this file.
df = df.drop(columns=drop_cols, errors='ignore')

# Convert the remaining categorical (object) columns, e.g. 'category' and
# 'gender', to numeric indicator columns. drop_first=True omits the first
# level of each column so the indicators are not redundant.
df = pd.get_dummies(df, drop_first=True)

# Step 5: Split features and target
# ---------------------------------------------------
# Label to predict.
y = df['is_fraud']

# Every other column is used as a predictor.
X = df.drop(columns=['is_fraud'])

# Step 6: Split train and test data
# ---------------------------------------------------
# Hold out 30% of the rows for testing. stratify=y asks for the same
# is_fraud class proportions in both parts; random_state fixes the shuffle.
split_parts = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

# Step 7: Scale features for better model performance
# ---------------------------------------------------
# Learn the scaling statistics from the training rows only, then apply the
# same fitted transformation to both the training and the test rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Step 8: Train Model - Random Forest
# ---------------------------------------------------
# 100 trees with a fixed seed for reproducibility. n_jobs=-1 builds the trees
# on all available CPU cores; the trees are independent of each other, so
# this only shortens training time on this large dataset and leaves the
# seeded fit reproducible.
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)

# Step 9: Predictions and Evaluation
# ---------------------------------------------------
# Predict labels for the held-out rows.
y_pred = rf.predict(X_test)

# Compute every metric first, then print them in one place.
acc_rounded = round(accuracy_score(y_test, y_pred), 4)
cls_report = classification_report(y_test, y_pred)
conf_mat = confusion_matrix(y_test, y_pred)

print("\n Model Evaluation Results:")
print("Accuracy:", acc_rounded)
print("\nClassification Report:\n", cls_report)
print("\nConfusion Matrix:\n", conf_mat)

# Step 10: Sample Prediction
# ---------------------------------------------------
# Take the first test row and shape it as a single-row 2-D array, which is
# the form passed to predict().
first_row = np.array(X_test[0])
sample = first_row.reshape(1, -1)
pred = rf.predict(sample)

if pred[0] == 1:
    verdict = "Fraud"
else:
    verdict = "Legit Transaction"
print("\nSample Prediction →", verdict)


Dataset shape: (1296675, 23)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 23 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1296675 non-null  int64  
 1   trans_date_trans_time  1296675 non-null  object 
 2   cc_num                 1296675 non-null  int64  
 3   merchant               1296675 non-null  object 
 4   category               1296675 non-null  object 
 5   amt                    1296675 non-null  float64
 6   first                  1296675 non-null  object 
 7   last                   1296675 non-null  object 
 8   gender                 1296675 non-null  object 
 9   street                 1296675 non-null  object 
 10  city                   1296675 non-null  object 
 11  state                  1296675 non-null  object 
 12  zip                    1296675 non-null  int64  
 13  lat                    1296675 non-null  fl