In [ ]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import resample
import joblib

def load_data(file_path):
    """Read a CSV file into a DataFrame.

    Args:
        file_path: Anything ``pandas.read_csv`` accepts as its first
            argument (the script passes a filesystem path).

    Returns:
        The parsed ``pandas.DataFrame``, using ``read_csv`` defaults.
    """
    frame = pd.read_csv(file_path)
    return frame

def explore_data(data):
    """Print a quick overview of the dataset to stdout.

    Shows the first rows, the column/dtype summary and the class counts of
    the ``isFraud`` column.

    Args:
        data: DataFrame that contains an ``isFraud`` column.

    Returns:
        None. The function only prints.
    """
    print(data.head())
    # DataFrame.info() writes its report itself and returns None, so wrapping
    # it in print() would add a stray "None" line to the output.
    data.info()
    print(data['isFraud'].value_counts())

def visualize_transaction_amounts(data):
    """Plot the distribution of ``amount`` for fraud vs. non-fraud rows.

    Two histograms (with KDE curves) are overlaid on one figure: red for rows
    where ``isFraud == 1`` and blue for rows where ``isFraud == 0``.

    Args:
        data: DataFrame with ``amount`` and ``isFraud`` columns.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    fraud_amounts = data[data['isFraud'] == 1]['amount']
    legit_amounts = data[data['isFraud'] == 0]['amount']

    plt.figure(figsize=(10, 5))
    # stat='density' normalises each histogram on its own, so the y-axis
    # really is a density (matching the axis label) and the two classes can
    # be compared even when one has far fewer rows than the other.
    sns.histplot(fraud_amounts, color='red', label='Fraudulent',
                 kde=True, stat='density')
    sns.histplot(legit_amounts, color='blue', label='Non-fraudulent',
                 kde=True, stat='density')
    plt.title('Distribution of Transaction Amounts')
    plt.xlabel('Transaction Amount')
    plt.ylabel('Density')
    plt.legend()
    plt.show()

def balance_data(data):
    """Oversample fraudulent rows so they make up about 10% of the result.

    All non-fraud rows (``isFraud == 0``) are kept. Fraud rows
    (``isFraud == 1``) are drawn with replacement until the combined frame
    has ``len(non_fraud) * 10 // 9`` rows, i.e. roughly a 90/10 split. The
    result is shuffled and re-indexed. Sampling and shuffling both use a
    fixed seed (42), so the output is reproducible.

    Args:
        data: DataFrame with an ``isFraud`` column.

    Returns:
        A new DataFrame with a fresh ``RangeIndex``.

    Raises:
        ValueError: If fraud rows are required but ``data`` contains none.
    """
    fraud_rows = data[data['isFraud'] == 1]
    legit_rows = data[data['isFraud'] == 0]

    # Target size such that the non-fraud rows are 9/10 of the total.
    target_total = len(legit_rows) * 10 // 9
    fraud_needed = target_total - len(legit_rows)

    # Without this guard, resample() fails deep inside the random sampler
    # with a message that does not mention the real cause.
    if fraud_needed > 0 and fraud_rows.empty:
        raise ValueError(
            "balance_data cannot oversample: no rows with isFraud == 1"
        )

    fraud_oversampled = resample(fraud_rows,
                                 replace=True,
                                 n_samples=fraud_needed,
                                 random_state=42)

    combined = pd.concat([legit_rows, fraud_oversampled])
    # Shuffle so the oversampled rows are not all grouped at the end.
    return combined.sample(frac=1, random_state=42).reset_index(drop=True)

def check_missing_values(data):
    """Count missing entries per column.

    Args:
        data: DataFrame to inspect.

    Returns:
        A Series indexed by column name holding the number of null values
        found in each column.
    """
    null_mask = data.isna()
    return null_mask.sum()

def visualize_data_distribution_before_after(data, balanced_data):
    """Show ``isFraud`` class counts before and after balancing, side by side.

    Args:
        data: Frame plotted in the left panel ("before").
        balanced_data: Frame plotted in the right panel ("after").

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    panels = (
        (data, 'Data Distribution (Before Balancing)'),
        (balanced_data, 'Data Distribution (After Balancing)'),
    )

    plt.figure(figsize=(10, 5))
    for position, (frame, heading) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        sns.countplot(x='isFraud', data=frame)
        plt.title(heading)
    plt.tight_layout()
    plt.show()

def correlation_matrix(data):
    """Draw an annotated heatmap of correlations between numeric columns.

    Non-numeric columns are dropped before the correlations are computed.

    Args:
        data: DataFrame to analyse.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    corr = data.select_dtypes(include=['number']).corr()

    plt.figure(figsize=(10, 8))
    sns.heatmap(corr, annot=True, cmap='coolwarm')
    plt.title('Correlation Matrix')
    plt.show()

def select_features(data):
    """Keep only the four balance columns plus the ``isFraud`` target.

    Args:
        data: DataFrame containing at least the columns listed below.

    Returns:
        A DataFrame with the columns ``oldbalanceOrg``, ``newbalanceOrig``,
        ``oldbalanceDest``, ``newbalanceDest`` and ``isFraud``, in that order.
    """
    kept_columns = [
        'oldbalanceOrg',
        'newbalanceOrig',
        'oldbalanceDest',
        'newbalanceDest',
        'isFraud',
    ]
    return data[kept_columns]

def train_logistic_regression(X_train, y_train):
    """Fit a logistic-regression classifier on the given training data.

    The estimator uses the ``lbfgs`` solver with at most 500 iterations.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        The fitted ``LogisticRegression`` instance.
    """
    classifier = LogisticRegression(max_iter=500, solver='lbfgs')
    # fit() returns the estimator itself, so the fitted model is handed back.
    return classifier.fit(X_train, y_train)

def train_random_forest(X_train, y_train):
    """Fit a random-forest classifier with default hyperparameters.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.

    Returns:
        The fitted ``RandomForestClassifier`` instance.
    """
    forest = RandomForestClassifier()
    # fit() returns the estimator itself, so the fitted model is handed back.
    return forest.fit(X_train, y_train)

def evaluate_model(predictions, y_test, average='weighted'):
    """Compute accuracy, precision, recall and F1 for a set of predictions.

    Note the argument order: predictions first, ground truth second. They are
    passed to scikit-learn as ``(y_test, predictions)``.

    Args:
        predictions: Predicted labels.
        y_test: True labels.
        average: Averaging mode forwarded to ``precision_score``,
            ``recall_score`` and ``f1_score``. Defaults to ``'weighted'``,
            which is what this function always used. Pass ``'binary'`` to
            score the positive class only; with ``'weighted'`` the scores are
            weighted by class frequency.

    Returns:
        dict with the keys ``'Accuracy'``, ``'Precision'``, ``'Recall'`` and
        ``'F1 Score'``.
    """
    metrics = {'Accuracy': accuracy_score(y_test, predictions)}

    averaged_scorers = (
        ('Precision', precision_score),
        ('Recall', recall_score),
        ('F1 Score', f1_score),
    )
    for label, scorer in averaged_scorers:
        metrics[label] = scorer(y_test, predictions, average=average)
    return metrics

def train_and_predict(model, X_train, y_train, X_test):
    """Fit ``model`` on the training data, then predict on ``X_test``.

    The model is (re)fitted on every call, even if it was already trained.

    Args:
        model: Object exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Features to predict on.

    Returns:
        Whatever ``model.predict(X_test)`` returns.
    """
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)
    return predicted

def random_search(X_train, y_train, param_distributions=None, n_iter=30):
    """Run a randomized hyperparameter search for a random forest.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        param_distributions: Mapping of ``RandomForestClassifier`` parameter
            names to candidate lists (or distributions). When omitted, the
            single fixed configuration this function has always used is
            searched.
        n_iter: Maximum number of parameter settings to sample. It is capped
            at the number of distinct combinations when every entry of
            ``param_distributions`` is a list or tuple.

    Returns:
        dict: ``best_params_`` of the fitted ``RandomizedSearchCV``.
    """
    if param_distributions is None:
        param_distributions = {
            'n_estimators': [100],
            'max_features': ['sqrt'],
            'max_depth': [6],
            'criterion': ['gini'],
            'max_leaf_nodes': [6],
            'min_samples_leaf': [2],
            'min_samples_split': [5],
        }

    # With a finite grid, asking for more iterations than there are
    # combinations makes scikit-learn emit a warning and silently run fewer
    # iterations. Cap n_iter up front instead.
    candidates = list(param_distributions.values())
    if all(isinstance(options, (list, tuple)) for options in candidates):
        grid_size = 1
        for options in candidates:
            grid_size *= len(options)
        n_iter = min(n_iter, grid_size)

    random_search_cv = RandomizedSearchCV(estimator=RandomForestClassifier(),
                                          param_distributions=param_distributions,
                                          n_iter=n_iter, cv=3, verbose=4,
                                          random_state=42, n_jobs=-1)
    random_search_cv.fit(X_train, y_train)
    return random_search_cv.best_params_

def evaluate_model_hyper(predictions, y_test):
    """Compute metrics for the hyperparameter-tuned model.

    This used to be a line-for-line copy of ``evaluate_model``. It now
    delegates to it so the two can never drift apart.

    Args:
        predictions: Predicted labels.
        y_test: True labels.

    Returns:
        dict with the keys ``'Accuracy'``, ``'Precision'``, ``'Recall'`` and
        ``'F1 Score'``.
    """
    return evaluate_model(predictions, y_test)

# Load data
data = load_data("C:\\Users\\LENOVO\\Downloads\\Online_Payments_Fraud_Detection.csv")

# Explore data
explore_data(data)

# Visualize transaction amounts
visualize_transaction_amounts(data)

# Hold out the test set BEFORE oversampling. Oversampling duplicates fraud
# rows; if it ran first, copies of the same row would land in both the train
# and the test partition and inflate every metric.
train_data, test_data = train_test_split(
    data, test_size=0.25, random_state=1, stratify=data['isFraud']
)

# Balance the training partition only
balanced_data = balance_data(train_data)

# Check missing values
missing_values = check_missing_values(balanced_data)

# Visualize class distribution of the training data before and after balancing
visualize_data_distribution_before_after(train_data, balanced_data)

# Correlation matrix
correlation_matrix(balanced_data)

# Select features. The raw frame also carries string columns (type, nameOrig,
# nameDest) that the estimators cannot consume, so the models must be trained
# on the selected numeric columns, not on the full frame.
selected_features = select_features(balanced_data)

# Split the balanced training data into features and target variable
X_balanced = selected_features.drop('isFraud', axis=1)
y_balanced = selected_features['isFraud']
X_train, y_train = X_balanced, y_balanced

# The test partition keeps its original (unbalanced) class distribution
test_features = select_features(test_data)
X_test = test_features.drop('isFraud', axis=1)
y_test = test_features['isFraud']

# Train logistic regression and random forest models
log_reg_model = train_logistic_regression(X_train, y_train)
rf_model = train_random_forest(X_train, y_train)

# Make predictions (the models are already fitted; do not fit them again)
log_reg_predictions = log_reg_model.predict(X_test)
rf_predictions = rf_model.predict(X_test)

# Evaluate models
log_reg_metrics = evaluate_model(log_reg_predictions, y_test)
rf_metrics = evaluate_model(rf_predictions, y_test)

# Randomized search for hyperparameters
best_params = random_search(X_train, y_train)

# Train Random Forest with best hyperparameters
rf_hyper_model = RandomForestClassifier(**best_params)
rf_hyper_model.fit(X_train, y_train)
rf_hyper_predictions = rf_hyper_model.predict(X_test)
rf_hyper_metrics = evaluate_model_hyper(rf_hyper_predictions, y_test)

# Print metrics
print("Logistic Regression Metrics:", log_reg_metrics)
print("Random Forest Metrics:", rf_metrics)
print("Random Forest with Hyperparameters Metrics:", rf_hyper_metrics)


   step      type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1   PAYMENT   9839.64  C1231006815       170136.0       160296.36   
1     1   PAYMENT   1864.28  C1666544295        21249.0        19384.72   
2     1  TRANSFER    181.00  C1305486145          181.0            0.00   
3     1  CASH_OUT    181.00   C840083671          181.0            0.00   
4     1   PAYMENT  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  
0  M1979787155             0.0             0.0        0               0  
1  M2044282225             0.0             0.0        0               0  
2   C553264065             0.0             0.0        1               0  
3    C38997010         21182.0             0.0        1               0  
4  M1230701703             0.0             0.0        0               0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 co

In [ ]:
data = pd.read_csv("C:\\Users\\LENOVO\\Downloads\\Online_Payments_Fraud_Detection.csv")
balanced_data = balance_data(data)

# Train on the selected numeric columns only. The full frame still contains
# string columns (type, nameOrig, nameDest) that the classifier cannot fit on.
selected_features = select_features(balanced_data)
X_balanced = selected_features.drop('isFraud', axis=1)
y_balanced = selected_features['isFraud']

# Train the model
rf_model = RandomForestClassifier()
rf_model.fit(X_balanced, y_balanced)

# Save the trained model
joblib.dump(rf_model, 'rf_model.pkl')