2. Classifying Credit Card Fraud Using Decision Trees
   Dataset: Credit Card Fraud Detection Dataset
   Preprocessing Steps:
     - Handle missing values if any.
     - Standardize features.
   Task: Implement a decision tree classifier to classify credit card transactions as fraud or not and evaluate the model using ROC-AUC and confusion matrix.

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Credit-card fraud classification with a decision tree.
#
# Pipeline: load CSV -> drop unlabeled rows -> stratified train/test split ->
# impute and standardize features using TRAINING statistics only -> fit a
# DecisionTreeClassifier -> report ROC-AUC and the confusion matrix.

# Load the dataset
df = pd.read_csv('/content/creditcard.csv')

# Handle missing values (target): a row without a label cannot be used for
# supervised learning. Filling 'Class' with the column mean and casting to int
# would silently relabel such rows as class 0, so they are dropped instead.
df = df.dropna(subset=['Class'])

# Separate features and target variable
X = df.drop('Class', axis=1)
y = df['Class'].astype(int)  # Ensure target variable is integer type

# Check the distribution of classes
print("Class distribution before split:")
print(y.value_counts())

# Split BEFORE any statistics are estimated, so that nothing computed from the
# test rows (means, standard deviations) can influence preprocessing.
# stratify=y keeps the class ratio the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Handle missing values (features): impute with the training-set column means
# and reuse those same means for the test set.
feature_means = X_train.mean()
X_train = X_train.fillna(feature_means)
X_test = X_test.fillna(feature_means)

# Standardize features: fit the scaler on the training data only, then apply
# the learned transformation to the test data.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Full feature matrix transformed with the train-fitted imputation/scaler.
# Not used below; kept so code that refers to `X_scaled` keeps working.
X_scaled = scaler.transform(X.fillna(feature_means))

# Check the distribution of classes in training and test sets
print("Class distribution in training set:")
print(y_train.value_counts())
print("Class distribution in test set:")
print(y_test.value_counts())

# Ensure that both classes are present in the training and test sets;
# ROC-AUC is undefined when the evaluated labels contain a single class.
if y_train.nunique() < 2 or y_test.nunique() < 2:
    raise ValueError("Training or test set does not contain both classes. Adjust the split or dataset.")

# Train the Decision Tree Classifier
classifier = DecisionTreeClassifier(random_state=42)
classifier.fit(X_train, y_train)

# Make predictions: hard labels for the confusion matrix, and the predicted
# probability of the positive class (column 1) for ROC-AUC.
y_pred = classifier.predict(X_test)
y_pred_prob = classifier.predict_proba(X_test)[:, 1]

# Evaluate the model
roc_auc = roc_auc_score(y_test, y_pred_prob)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f'ROC-AUC Score: {roc_auc}')
print('Confusion Matrix:')
print(conf_matrix)

Class distribution before split:
Class
0    1984
1       2
Name: count, dtype: int64
Class distribution in training set:
Class
0    1389
1       1
Name: count, dtype: int64
Class distribution in test set:
Class
0    595
1      1
Name: count, dtype: int64
ROC-AUC Score: 0.49831932773109244
Confusion Matrix:
[[593   2]
 [  1   0]]
