In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

# Build a synthetic credit-card transaction table for the fraud-detection demo.
# The global NumPy seed is fixed so every run draws exactly the same values.
np.random.seed(42)

N_ROWS = 1000  # number of simulated transactions

# NOTE: keep the draw order (Time, V1..V5, Amount, Class). Every column pulls
# from the same seeded global random stream, so reordering changes the data.
data = {'Time': np.random.rand(N_ROWS) * 1000}  # uniform draws scaled by 1000
# Five standard-normal feature columns, V1 through V5
data.update({f'V{k}': np.random.randn(N_ROWS) for k in range(1, 6)})
data['Amount'] = np.random.rand(N_ROWS) * 5000  # uniform draws scaled by 5000
# Fraud label (0 - no fraud, 1 - fraud), sampled with probabilities 0.95 / 0.05
data['Class'] = np.random.choice([0, 1], size=N_ROWS, p=[0.95, 0.05])

df = pd.DataFrame(data)

# Preview the first five rows
print(df.head())


         Time        V1        V2        V3        V4        V5       Amount  \
0  374.540119  0.177701 -1.406317  0.874517 -0.150320  0.279084  1761.544421   
1  950.714306 -1.335344 -0.083106 -0.649765 -0.326696  0.291564  1170.288875   
2  731.993942  0.380198 -1.504720 -1.203201 -1.042578 -1.690672   248.905008   
3  598.658484  0.610586  0.760056 -1.042044 -1.172234 -0.978500  1342.500381   
4  156.018640  0.559790  0.082440 -0.487203  0.464370  2.755218  3271.178900   

   Class  
0      0  
1      0  
2      0  
3      0  
4      0  


In [19]:
# Split the data into features (X) and target (y)
X = df.drop('Class', axis=1)
y = df['Class']

# Split the dataset into training and testing sets (80% - 20% split) BEFORE
# scaling, so no statistic of the held-out rows can influence preprocessing.
# The row partition depends only on the number of rows and random_state, so it
# is the same partition the pre-scaled split produced.
# NOTE(review): the split is not stratified; with a rare positive class the
# test set may hold very few fraud rows -- consider stratify=y.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize the features using StandardScaler. The mean/std are learned from
# the training rows only and then re-used unchanged for the test rows; fitting
# on the full matrix would leak test-set information into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so any later cell that reads X_scaled still works: the full feature
# matrix scaled with the training-only statistics. Do not fit a model on it.
X_scaled = scaler.transform(X)

# Handle class imbalance using SMOTE (Synthetic Minority Oversampling Technique).
# Only the training split is resampled; the test split keeps its real class mix.
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Show the class distribution before and after resampling
print(f"Original class distribution in y_train: {y_train.value_counts()}")
print(f"Resampled class distribution in y_train: {y_train_resampled.value_counts()}")


Original class distribution in y_train: Class
0    765
1     35
Name: count, dtype: int64
Resampled class distribution in y_train: Class
0    765
1    765
Name: count, dtype: int64


In [20]:
# Build a random forest with a fixed seed and train it on the resampled
# training data; fit() returns the estimator itself, so the call is chained.
rf_model = RandomForestClassifier(random_state=42).fit(
    X_train_resampled, y_train_resampled
)

# Predict labels for the held-out test split
y_pred = rf_model.predict(X_test)

# Print the heading followed by the per-class precision / recall / F1 table
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")


Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.91      0.94       194
           1       0.11      0.33      0.16         6

    accuracy                           0.90       200
   macro avg       0.54      0.62      0.55       200
weighted avg       0.95      0.90      0.92       200



In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE

# End-to-end run in a single cell: simulate data, split, scale, rebalance,
# train a random forest, and print the classification report.

# Simulating data (fixed global seed so every run draws the same values)
np.random.seed(42)
data = {
    'Time': np.random.rand(1000) * 1000,
    'V1': np.random.randn(1000),
    'V2': np.random.randn(1000),
    'V3': np.random.randn(1000),
    'V4': np.random.randn(1000),
    'V5': np.random.randn(1000),
    'Amount': np.random.rand(1000) * 5000,
    'Class': np.random.choice([0, 1], size=1000, p=[0.95, 0.05])  # Imbalanced classes
}
df = pd.DataFrame(data)

# Preprocessing: separate features from the target
X = df.drop('Class', axis=1)
y = df['Class']

# Split BEFORE scaling so the held-out rows cannot influence preprocessing.
# The row partition depends only on the number of rows and random_state.
# NOTE(review): the split is not stratified; with ~5% positives the test set
# may hold very few fraud rows -- consider stratify=y.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same transform to the test rows; fitting on the full matrix would leak
# test-set information into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so any later cell that reads X_scaled still works: the full feature
# matrix scaled with the training-only statistics. Do not fit a model on it.
X_scaled = scaler.transform(X)

# Handle class imbalance using SMOTE, applied to the training split only
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Train the RandomForest model on the resampled training data
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_resampled, y_train_resampled)

# Predictions and Evaluation on the held-out test split
y_pred = rf_model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.91      0.94       194
           1       0.11      0.33      0.16         6

    accuracy                           0.90       200
   macro avg       0.54      0.62      0.55       200
weighted avg       0.95      0.90      0.92       200

