# American companies Bankruptcy prediction

| Variable Name          | Description                                               |
|------------------------|-----------------------------------------------------------|
| X1                     | Current assets: All assets expected to be sold or used   |
|                        | in standard business operations over the next year       |
| X2                     | Cost of goods sold: Total cost directly related to the   |
|                        | sale of products                                          |
| X3                     | Depreciation and amortization: Loss of value of         |
|                        | tangible and intangible assets over time                 |
| X4                     | EBITDA: Earnings before interest, taxes, depreciation,   |
|                        | and amortization; alternative measure of financial      |
|                        | performance compared to net income                        |
| X5                     | Inventory: Accounting of items and raw materials used   |
|                        | in production or for sale                                 |
| X6                     | Net Income: Overall profitability after deducting       |
|                        | expenses and costs from total revenue                     |
| X7                     | Total Receivables: Balance of money due for delivered   |
|                        | goods or services not yet paid by customers              |
| X8                     | Market value: Asset price in the marketplace, in this    |
|                        | case, market capitalization since companies are publicly|
|                        | traded in the stock market                                |
| X9                     | Net sales: Gross sales minus returns, allowances, and   |
|                        | discounts                                                  |
| X10                    | Total assets: All items of value owned by a business     |
| X11                    | Total Long term debt: Loans and liabilities not due     |
|                        | within one year of the balance sheet date                |
| X12                    | EBIT: Earnings before interest and taxes                 |
| X13                    | Gross Profit: Profit after subtracting costs related     |
|                        | to manufacturing and selling products or services        |
| X14                    | Total Current Liabilities: Sum of accounts payable,      |
|                        | accrued liabilities, taxes, and bonds payable at year end|
| X15                    | Retained Earnings: Profit left after paying costs,       |
|                        | taxes, and dividends to shareholders                      |
| X16                    | Total Revenue: Total income from sales before expenses   |
| X17                    | Total Liabilities: Combined debts and obligations owed   |
|                        | to external parties                                       |
| X18                    | Total Operating Expenses: Business operation expenses    |
| year                   | Year                                                     |
| status_label           | Company status: failed or alive (target column)          |

# Importing necessary libraries

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc, precision_recall_curve, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.exceptions import ConvergenceWarning
import warnings
warnings.filterwarnings('ignore', category=ConvergenceWarning)

# Loading Data

In [None]:
# Read the bankruptcy dataset CSV from the Kaggle input directory into `df`.
df= pd.read_csv('/kaggle/input/american-companies-bankruptcy-prediction-dataset/american_bankruptcy.csv')

# Exploratory data analysis (EDA)

In [None]:
# Preview the first rows of the loaded DataFrame.
df.head()

## Summary statistics

In [None]:
def summary(df):
    """Print the shape of ``df`` and return a per-column summary table.

    The returned DataFrame is indexed by the column names of ``df`` and has
    the columns ``data type``, ``#missing``, ``%missing``, ``#unique``,
    ``min``, ``max``, ``first value``, ``second value`` and ``third value``.

    Sample rows are taken by position, so the function works for any index
    (not only the default ``0..n-1``). When ``df`` has fewer than three rows
    the missing sample columns are filled with NaN, and ``min``/``max`` are
    NaN when ``describe`` produces no such statistic (no numeric columns).
    """
    print(f'data shape: {df.shape}')

    # Count nulls once and reuse the result for both missing-value columns.
    missing = df.isnull().sum()

    summ = pd.DataFrame(df.dtypes, columns=['data type'])
    summ['#missing'] = missing.values
    summ['%missing'] = missing.values / len(df) * 100
    summ['#unique'] = df.nunique().values

    # describe(include='all') has one column per input column; transpose it
    # so each input column becomes a row aligned with `summ`.
    desc = df.describe(include='all').transpose()
    for stat in ('min', 'max'):
        summ[stat] = desc[stat].values if stat in desc.columns else np.nan

    # Positional access: label-based df.loc[0] fails on non-default indexes.
    samples = (('first value', 0), ('second value', 1), ('third value', 2))
    for label, pos in samples:
        summ[label] = df.iloc[pos].values if pos < len(df) else np.nan

    return summ

# Display the per-column summary table for the loaded dataset.
summary(df)

In [None]:
def save_summary_as_image(df, filename):
    """Render a per-column summary table of ``df`` and save it as an image.

    The table has one row per column of ``df`` (labelled with the column
    name) and the columns ``data type``, ``#missing``, ``%missing``,
    ``#unique``, ``min``, ``max`` and the first three sample values.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to summarise.
    filename : str
        Output path passed to ``Figure.savefig``; saved at 300 dpi.
    """
    # Build the summary table (named `table_df` so the sibling `summary`
    # function is not shadowed inside this scope).
    missing = df.isnull().sum()
    table_df = pd.DataFrame(df.dtypes, columns=['data type'])
    table_df['#missing'] = missing.values
    table_df['%missing'] = missing.values / len(df) * 100
    table_df['#unique'] = df.nunique().values

    desc = df.describe(include='all').transpose()
    for stat in ('min', 'max'):
        table_df[stat] = desc[stat].values if stat in desc.columns else np.nan

    # Positional access so a non-default index or a short frame does not fail.
    samples = (('first value', 0), ('second value', 1), ('third value', 2))
    for label, pos in samples:
        table_df[label] = df.iloc[pos].values if pos < len(df) else np.nan

    # Draw the table on an axis with the frame and ticks hidden.
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.axis('off')

    # Row labels carry the feature names; without them the rendered rows
    # cannot be matched to columns of `df`.
    table = ax.table(
        cellText=table_df.values,
        rowLabels=[str(name) for name in table_df.index],
        colLabels=list(table_df.columns),
        cellLoc='center',
        loc='center',
    )

    # Fixed font size and a slight upscale for legibility.
    table.auto_set_font_size(False)
    table.set_fontsize(10)
    table.scale(1.2, 1.2)

    # Save and close this specific figure rather than the implicit current one.
    fig.savefig(filename, bbox_inches='tight', dpi=300)
    plt.close(fig)

# Render the summary table of `df` to 'summary_table.png' in the working directory.
save_summary_as_image(df, 'summary_table.png')

According to the summary, the US Company Bankruptcy DataFrame has 78,682 rows and 21 columns. Here's how the data was analysed:

1. **Data Types:** The DataFrame contains columns with various data types, such as `object`, `int64`, and `float64`.

2. **Missing Values:** There are no missing values in any of the columns as indicated by the `%missing` column showing 0.0% missing values for all columns.

3. **Unique Values:** The number of unique values for each column varies. For instance, `company_name` has 8,971 unique values, `status_label` has 2 unique values, and other columns have different numbers of unique values.

4. **Summary Statistics:** The `describe` function provides statistical summary for numerical columns. It includes count, mean, standard deviation, minimum, 25th percentile (Q1), median (50th percentile or Q2), 75th percentile (Q3), and maximum values.

5. **Example Values:** The DataFrame displays the first three values for each column under the columns `first value`, `second value`, and `third value`. For instance, `company_name` has the values 'C_1', 'C_1', and 'C_1' for the first three rows. (Companies' names are coded for security reasons.)


# Target Column Distribution

In [None]:
# Tally each status label once and derive both class shares from that tally.
status_counts = df['status_label'].value_counts()
alive_count = status_counts['alive']
failed_count = status_counts['failed']
total_count = alive_count + failed_count
alive_ratio, failed_ratio = alive_count / total_count, failed_count / total_count

print("Alive Ratio:", alive_ratio)
print("Failed Ratio:", failed_ratio)

In [None]:
import matplotlib.pyplot as plt

# Recompute the class counts and shares so this cell can run on its own.
status_counts = df['status_label'].value_counts()
alive_count = status_counts['alive']
failed_count = status_counts['failed']
total_count = alive_count + failed_count
alive_ratio = alive_count / total_count
failed_ratio = failed_count / total_count

# Slice definitions: label, share and colour per class; only the first
# slice ("Alive") is offset from the centre.
labels = ['Alive', 'Failed']
sizes = [alive_ratio, failed_ratio]
colors = ['green', 'red']
explode = (0.1, 0)

# Draw the pie on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(
    sizes,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=140,
)
ax.axis('equal')  # equal aspect ratio keeps the pie circular

# Write the PNG before showing the figure.
fig.savefig('pie_chart.png', bbox_inches='tight', dpi=300)
plt.show()

- The "Alive Ratio" is approximately **0.934**, which means that around **93.4%** of the samples in the dataset belong to the "alive" class (e.g., companies that have not failed or gone bankrupt).

- The "Failed Ratio" is approximately **0.066**, which means that only about **6.6%** of the samples in the dataset belong to the "failed" class (e.g., companies that have experienced bankruptcy or failure).

This significant class imbalance can have implications for building predictive models. When the dataset is **highly imbalanced**, models may be biased towards the majority class (in this case, "alive") and may not perform well in identifying the minority class (in this case, "failed").

To address this issue, you may consider employing techniques such as:

- **Class Balancing Techniques:** Using methods like oversampling the minority class (e.g., Synthetic Minority Over-sampling Technique - SMOTE) or undersampling the majority class to balance the class distribution.

- **Different Evaluation Metrics:** Instead of accuracy, consider using evaluation metrics like precision, recall, F1-score, or area under the ROC curve (AUC), which are more suitable for imbalanced datasets.

- **Algorithm Selection:** Choose algorithms that are less sensitive to class imbalance, such as ensemble methods (e.g., Random Forest, Gradient Boosting) or anomaly detection methods.

- **Cost-sensitive Learning:** Assigning different misclassification costs for each class during model training to reflect the importance of correctly predicting the minority class.

## Outliers

In [None]:
# The 18 financial indicator columns X1..X18.
feature_cols = [f'X{i}' for i in range(1, 19)]
features = df[feature_cols]

# Tukey fences: a value is flagged when it lies more than 1.5 * IQR
# below the first quartile or above the third quartile of its column.
Q1 = features.quantile(0.25)
Q3 = features.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
# Boolean mask with the same shape as `features`; not used further in this cell.
outliers = (features < lower_fence) | (features > upper_fence)

# One box per feature on a shared value axis.
plt.figure(figsize=(12, 8))
sns.boxplot(data=features)
plt.xticks(rotation=90)
plt.title("Box Plot of Numerical Features with Outliers")
plt.xlabel("Features")
plt.ylabel("Values")
plt.show()


Well, that's a lot! Dealing with outliers is crucial to ensure that they do not adversely affect the performance of your predictive models. Outliers can skew the distribution of data and impact the model's ability to generalize to unseen data. Here are some strategies to handle outliers in your dataset:

- Remove Outliers: One straightforward approach is to remove the outliers from the dataset. However, this should be done with caution, as outliers may contain valuable information or represent rare but significant events. Removing too many outliers can lead to loss of important data.
- Outlier Detection Models: Use outlier detection algorithms (e.g., Isolation Forest, One-Class SVM) to identify and mark outliers. You can then choose whether to remove them or treat them separately during analysis.
- Transform Data: Instead of removing outliers, you can apply data transformations to reduce their impact. Common transformations include log-transform, square-root transform, or Box-Cox transform. These transformations can make the data more normally distributed and reduce the effect of extreme values.

## Correlation Matrix

In [None]:
# Pairwise correlations between the 18 financial indicators X1..X18.
variables = [f'X{i}' for i in range(1, 19)]
data = df[variables]
correlation_matrix = data.corr()

# Hide the upper triangle (including the diagonal), which only mirrors
# the lower triangle of the symmetric matrix.
mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool))

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    linewidths=0.5,
    mask=mask,
    ax=ax,
)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Preview the first rows of `df` again.
df.head()

In [None]:
# Numeric view of the correlation matrix for the indicators X1..X18.
variables = [f'X{i}' for i in range(1, 19)]
data = df.loc[:, variables]
correlation_matrix = data.corr()
correlation_matrix

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Correlations between the 18 financial indicators X1..X18.
variables = [f'X{i}' for i in range(1, 19)]
data = df[variables]
correlation_matrix = data.corr()

# Mask the upper triangle (and diagonal) so each pair is shown only once.
mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool))

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    fmt=".2f",
    linewidths=0.5,
    mask=mask,
    ax=ax,
)
ax.set_title('Correlation Matrix')

# Write the heatmap to disk before displaying it.
fig.savefig('correlation_heatmap.png', bbox_inches='tight', dpi=300)
plt.show()


# Pre-Processing

In [None]:
# Numeric encoding of the target: 1 = 'alive', 0 = 'failed'.
status_encoding = {'alive': 1, 'failed': 0}

# Encode only while raw string labels are still present, so re-running this
# cell does not map the already-encoded 0/1 values to NaN.
if df['status_label'].isin(list(status_encoding)).any():
    df['status_label'] = df['status_label'].map(status_encoding)

# Feature matrix: every column except the company identifier and the target.
X = df.drop(columns=['company_name', 'status_label'])
y = df['status_label']

- Convert the categorical target variable 'status_label' to numerical labels (1 for 'alive' and 0 for 'failed').
- Create the feature matrix X, excluding the 'company_name' and 'status_label' columns, to be used for training the machine learning model.
- Set the target variable y as the numerical 'status_label', representing the labels for the training dataset.

After these preprocessing steps, you can proceed with model training using X as the feature matrix and y as the target variable.

# Features Selection

# Using LogisticRegression and RandomForestClassifier

We are performing feature selection using two different algorithms: Logistic Regression and Random Forests. The goal of feature selection is to identify a subset of relevant features from the original dataset that will be used for model training. By selecting only the most important features, we aim to reduce the complexity of the model, improve its performance, and potentially avoid overfitting.

In [None]:
# --- Logistic regression ranking -------------------------------------------
# Coefficient magnitudes are only comparable across features when the
# features share a scale, so standardize X before ranking by |coefficient|.
# NOTE: `logit_model` is therefore fitted on standardized inputs; transform
# new data with `logit_scaler` before calling it.
logit_scaler = StandardScaler()
X_scaled = logit_scaler.fit_transform(X)

logit_model = LogisticRegression()
logit_model.fit(X_scaled, y)
logit_feature_importances = pd.Series(logit_model.coef_[0], index=X.columns).abs()
selected_features_logit = logit_feature_importances.nlargest(10).index.tolist()

# --- Random forest ranking --------------------------------------------------
# Seeded so the importance ranking (and hence the selection) is repeatable.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X, y)
rf_feature_importances = pd.Series(rf_model.feature_importances_, index=X.columns)
selected_features_rf = rf_feature_importances.nlargest(10).index.tolist()

# Reduced datasets: identifier, target and the ten top-ranked features.
df_logit_selected = df[['company_name', 'status_label'] + selected_features_logit]
df_rf_selected = df[['company_name', 'status_label'] + selected_features_rf]
print("Selected features using logistic regression:", selected_features_logit)
print("Selected features using random forests:", selected_features_rf)

In [None]:
# Pinned feature lists; assigning them here replaces any values the names
# held from earlier cells.
selected_features_logit = ['year', 'X5', 'X1', 'X14', 'X12', 'X3', 'X8', 'X7', 'X17', 'X11']
selected_features_rf = ['X8', 'X15', 'X3', 'X1', 'X17', 'X7', 'X13', 'X10', 'X14', 'X6']

# Each reduced dataset keeps the target column followed by its ten features.
df_logit_selected = df.loc[:, ['status_label', *selected_features_logit]]
df_rf_selected = df.loc[:, ['status_label', *selected_features_rf]]

print("New dataset with selected features from logistic regression:")
print(df_logit_selected.head())

print("\nNew dataset with selected features from random forests:")
print(df_rf_selected.head())

# Persist both reduced datasets without the row index.
exports = (
    (df_logit_selected, 'logit_selected_features_dataset.csv'),
    (df_rf_selected, 'rf_selected_features_dataset.csv'),
)
for frame, csv_name in exports:
    frame.to_csv(csv_name, index=False)

print("Datasets saved to CSV files.")

In [None]:
# Show the (rows, columns) shape of both reduced datasets.
df_rf_selected.shape, df_logit_selected.shape

# Model Building

# RandomForestClassifier

In [None]:
# Separate the RF-selected features from the encoded target.
X_rf = df_rf_selected.drop(columns='status_label')
y_rf = df_rf_selected['status_label']

# Hold out 20% of the rows for evaluation; the seed fixes the split.
X_rf_train, X_rf_test, y_rf_train, y_rf_test = train_test_split(
    X_rf, y_rf, test_size=0.2, random_state=42
)

# Fit the forest on the training rows and score it on the held-out rows.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_rf_train, y_rf_train)
y_rf_pred = rf_model.predict(X_rf_test)
rf_accuracy = accuracy_score(y_rf_test, y_rf_pred)
print("Random Forest Accuracy: {:.2f}%".format(rf_accuracy * 100))

# Confusion matrix: rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_rf_test, y_rf_pred)
print("Confusion Matrix:")
print(cm)

# Same matrix rendered as an annotated heatmap.
fig_cm, ax_cm = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", linewidths=0.5, square=True, ax=ax_cm)
ax_cm.set_xlabel('Predicted Label')
ax_cm.set_ylabel('True Label')
ax_cm.set_title('Confusion Matrix')
plt.show()

# Per-class precision / recall / F1.
cr = classification_report(y_rf_test, y_rf_pred)
print("Classification Report:")
print(cr)

# ROC curve and its area, from the predicted probability of class 1.
y_rf_scores = rf_model.predict_proba(X_rf_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_rf_test, y_rf_scores)
roc_auc = auc(fpr, tpr)

fig_roc, ax_roc = plt.subplots(figsize=(8, 6))
ax_roc.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
ax_roc.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
ax_roc.set_xlim([0.0, 1.0])
ax_roc.set_ylim([0.0, 1.05])
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('ROC Curve')
ax_roc.legend(loc="lower right")
ax_roc.grid(True)
plt.show()

# Precision-recall curve from the same scores.
precision, recall, thresholds_pr = precision_recall_curve(y_rf_test, y_rf_scores)

fig_pr, ax_pr = plt.subplots(figsize=(8, 6))
ax_pr.plot(recall, precision, color='blue', lw=2)
ax_pr.set_xlim([0.0, 1.0])
ax_pr.set_ylim([0.0, 1.05])
ax_pr.set_xlabel('Recall')
ax_pr.set_ylabel('Precision')
ax_pr.set_title('Precision-Recall Curve')
ax_pr.grid(True)
plt.show()

# 5-fold cross-validation over the full RF-selected dataset.
cv_scores = cross_val_score(rf_model, X_rf, y_rf, cv=5)
print("\nCross-Validation Scores:")
for fold, fold_score in enumerate(cv_scores, start=1):
    print("Fold {}: {:.2f}%".format(fold, fold_score * 100))

# Mean of the per-fold scores.
average_cv_score = cv_scores.mean()
print("Average Cross-Validation Score: {:.2f}%".format(average_cv_score * 100))

In [None]:
# Re-draw the confusion matrix held in `cm` and save it to disk.
fig_cm, ax_cm = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", linewidths=0.5, square=True, ax=ax_cm)
ax_cm.set_xlabel('Predicted Label')
ax_cm.set_ylabel('True Label')
ax_cm.set_title('Confusion Matrix')
fig_cm.savefig('confusion_matrix.png', bbox_inches='tight', dpi=300)
plt.show()

# Re-draw the ROC curve from `fpr` / `tpr` / `roc_auc` and save it.
fig_roc, ax_roc = plt.subplots(figsize=(8, 6))
ax_roc.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
ax_roc.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
ax_roc.set_xlim([0.0, 1.0])
ax_roc.set_ylim([0.0, 1.05])
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('ROC Curve')
ax_roc.legend(loc="lower right")
ax_roc.grid(True)
fig_roc.savefig('roc_curve.png', bbox_inches='tight', dpi=300)
plt.show()


# Re-draw the precision-recall curve from `recall` / `precision` and save it.
fig_pr, ax_pr = plt.subplots(figsize=(8, 6))
ax_pr.plot(recall, precision, color='blue', lw=2)
ax_pr.set_xlim([0.0, 1.0])
ax_pr.set_ylim([0.0, 1.05])
ax_pr.set_xlabel('Recall')
ax_pr.set_ylabel('Precision')
ax_pr.set_title('Precision-Recall Curve')
ax_pr.grid(True)
fig_pr.savefig('precision_recall_curve.png', bbox_inches='tight', dpi=300)
plt.show()


## Support Vector Machine (SVM)

In [None]:
from sklearn.pipeline import make_pipeline

# Features and target from the RF-selected dataset.
X_rf = df_rf_selected.drop('status_label', axis=1)
y_rf = df_rf_selected['status_label']

# Same 80/20 split (same seed) as the other models trained on X_rf.
X_rf_train, X_rf_test, y_rf_train, y_rf_test = train_test_split(X_rf, y_rf, test_size=0.2, random_state=42)

# SVMs are not scale invariant, so standardize the features first. The
# scaler lives inside the pipeline: it is fitted on the training rows only,
# and `svm_model.predict` / `svm_model.decision_function` accept raw features.
svm_model = make_pipeline(StandardScaler(), SVC(random_state=42))
svm_model.fit(X_rf_train, y_rf_train)
# NOTE: this reuses the name `y_rf_pred`; from here on it holds SVM predictions.
y_rf_pred = svm_model.predict(X_rf_test)

svm_accuracy = accuracy_score(y_rf_test, y_rf_pred)
print("SVM Accuracy: {:.2f}%".format(svm_accuracy * 100))

# Confusion matrix: rows are true labels, columns are predicted labels.
cm_svm = confusion_matrix(y_rf_test, y_rf_pred)
print("Confusion Matrix:")
print(cm_svm)

# Per-class precision / recall / F1.
cr_svm = classification_report(y_rf_test, y_rf_pred)
print("Classification Report:")
print(cr_svm)

## DecisionTreeClassifier

In [None]:
# Features and target from the logistic-regression-selected dataset.
X_logit = df_logit_selected.drop(columns='status_label')
y_logit = df_logit_selected['status_label']

# Hold out 20% of the rows for evaluation; the seed fixes the split.
X_logit_train, X_logit_test, y_logit_train, y_logit_test = train_test_split(
    X_logit, y_logit, test_size=0.2, random_state=42
)

# Fit an untuned decision tree and predict the held-out rows.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_logit_train, y_logit_train)
y_logit_pred = dt_model.predict(X_logit_test)

dt_accuracy = accuracy_score(y_logit_test, y_logit_pred)
print("Decision Tree Accuracy: {:.2f}%".format(dt_accuracy * 100))

# Confusion matrix: rows are true labels, columns are predicted labels.
cm_dt = confusion_matrix(y_logit_test, y_logit_pred)
print("Confusion Matrix:")
print(cm_dt)

# Per-class precision / recall / F1.
cr_dt = classification_report(y_logit_test, y_logit_pred)
print("Classification Report:")
print(cr_dt)

## Hyperparameters tuning using Grid Search

In [None]:
# Hyperparameter grid explored for the decision tree.
param_grid = dict(
    max_depth=[None, 5, 10, 15],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    criterion=['gini', 'entropy'],
)

# Exhaustive 5-fold search over the grid on the training rows.
dt_tuned = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(dt_tuned, param_grid, cv=5)
grid_search.fit(X_logit_train, y_logit_train)

# Evaluate the best estimator found by the search on the held-out rows.
best_dt_model = grid_search.best_estimator_
best_dt_pred = best_dt_model.predict(X_logit_test)
best_dt_accuracy = accuracy_score(y_logit_test, best_dt_pred)
print("Best Decision Tree Accuracy: {:.2f}%".format(best_dt_accuracy * 100))

# Confusion matrix of the tuned tree (reassigns `cm_dt`).
cm_dt = confusion_matrix(y_logit_test, best_dt_pred)
print("Confusion Matrix:")
print(cm_dt)

# Classification report of the tuned tree (reassigns `cr_dt`).
cr_dt = classification_report(y_logit_test, best_dt_pred)
print("Classification Report:")
print(cr_dt)

## xgboost

In [None]:
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc, precision_recall_curve
import matplotlib.pyplot as plt
import seaborn as sns

# Features and target from the RF-selected dataset, split 80/20 with a fixed seed.
X_xgb = df_rf_selected.drop('status_label', axis=1)
y_xgb = df_rf_selected['status_label']
X_xgb_train, X_xgb_test, y_xgb_train, y_xgb_test = train_test_split(X_xgb, y_xgb, test_size=0.2, random_state=42)


# Re-encode the labels as consecutive integers; the encoder is fitted on the
# training labels and then applied to the test labels and the full target.
label_encoder = LabelEncoder()
y_xgb_train = label_encoder.fit_transform(y_xgb_train)
y_xgb_test = label_encoder.transform(y_xgb_test)
y_xgb_encoded = label_encoder.transform(y_xgb)

# Fit the XGBoost classifier on the training rows.
xgb_model = xgb.XGBClassifier(random_state=42)
xgb_model.fit(X_xgb_train, y_xgb_train)

# Predict the held-out rows.
y_xgb_pred = xgb_model.predict(X_xgb_test)

# Accuracy on the held-out rows.
xgb_accuracy = accuracy_score(y_xgb_test, y_xgb_pred)
print("XGBoost Accuracy: {:.2f}%".format(xgb_accuracy * 100))

# Confusion matrix: rows are true labels, columns are predicted labels.
cm_xgb = confusion_matrix(y_xgb_test, y_xgb_pred)
print("Confusion Matrix:")
print(cm_xgb)

# Per-class precision / recall / F1.
cr_xgb = classification_report(y_xgb_test, y_xgb_pred)
print("Classification Report:")
print(cr_xgb)

# ROC curve and its area, from the predicted probability of class 1.
y_xgb_scores = xgb_model.predict_proba(X_xgb_test)[:, 1]
fpr_xgb, tpr_xgb, thresholds_xgb = roc_curve(y_xgb_test, y_xgb_scores)
roc_auc_xgb = auc(fpr_xgb, tpr_xgb)

plt.figure(figsize=(8, 6))
plt.plot(fpr_xgb, tpr_xgb, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc_xgb))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.grid(True)
# Save BEFORE show: once the figure has been shown, savefig would write a
# new, empty figure instead of this plot.
plt.savefig('xgb_roc_curve.png', bbox_inches='tight', dpi=300)
plt.show()


# Precision-recall curve from the same scores.
precision_xgb, recall_xgb, thresholds_pr_xgb = precision_recall_curve(y_xgb_test, y_xgb_scores)

plt.figure(figsize=(8, 6))
plt.plot(recall_xgb, precision_xgb, color='blue', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-Recall Curve')
plt.grid(True)
# Save before show, for the same reason as above.
plt.savefig('xgb_precision_recall_curve.png', bbox_inches='tight', dpi=300)
plt.show()

# 5-fold cross-validation over the full dataset with the encoded target.
cv_scores_xgb = cross_val_score(xgb_model, X_xgb, y_xgb_encoded, cv=5)
print("\nCross-Validation Scores:")
for i, score in enumerate(cv_scores_xgb):
    print("Fold {}: {:.2f}%".format(i + 1, score * 100))

# Mean of the per-fold scores.
average_cv_score_xgb = cv_scores_xgb.mean()
print("Average Cross-Validation Score: {:.2f}%".format(average_cv_score_xgb * 100))

In [None]:
# Class-1 probabilities for the held-out rows, and the ROC curve built from them.
y_xgb_scores = xgb_model.predict_proba(X_xgb_test)[:, 1]
fpr_xgb, tpr_xgb, thresholds_xgb = roc_curve(y_xgb_test, y_xgb_scores)
roc_auc_xgb = auc(fpr_xgb, tpr_xgb)

fig_roc, ax_roc = plt.subplots(figsize=(8, 6))
ax_roc.plot(fpr_xgb, tpr_xgb, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc_xgb))
ax_roc.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')  # chance diagonal
ax_roc.set_xlim([0.0, 1.0])
ax_roc.set_ylim([0.0, 1.05])
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('ROC Curve')
ax_roc.legend(loc="lower right")
ax_roc.grid(True)
fig_roc.savefig('xgb_roc_curve.png', bbox_inches='tight', dpi=300)  # written before the figure is shown
plt.show()

# Precision-recall curve from the same scores.
precision_xgb, recall_xgb, thresholds_pr_xgb = precision_recall_curve(y_xgb_test, y_xgb_scores)

fig_pr, ax_pr = plt.subplots(figsize=(8, 6))
ax_pr.plot(recall_xgb, precision_xgb, color='blue', lw=2)
ax_pr.set_xlim([0.0, 1.0])
ax_pr.set_ylim([0.0, 1.05])
ax_pr.set_xlabel('Recall')
ax_pr.set_ylabel('Precision')
ax_pr.set_title('Precision-Recall Curve')
ax_pr.grid(True)
fig_pr.savefig('xgb_precision_recall_curve.png', bbox_inches='tight', dpi=300)  # written before the figure is shown
plt.show()

## GradientBoostingClassifier

In [None]:
# Re-create the 80/20 split of the RF-selected data (same seed as before).
X_rf_train, X_rf_test, y_rf_train, y_rf_test = train_test_split(
    X_rf, y_rf, test_size=0.2, random_state=42
)

# Fit gradient boosting on the training rows and predict the held-out rows.
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_rf_train, y_rf_train)
y_gb_pred = gb_model.predict(X_rf_test)

# Accuracy on the held-out rows.
gb_accuracy = accuracy_score(y_rf_test, y_gb_pred)
print("Gradient Boosting Accuracy: {:.2f}%".format(gb_accuracy * 100))

# Confusion matrix: rows are true labels, columns are predicted labels.
cm_gb = confusion_matrix(y_rf_test, y_gb_pred)
print("Confusion Matrix:")
print(cm_gb)

# Per-class precision / recall / F1.
cr_gb = classification_report(y_rf_test, y_gb_pred)
print("Classification Report:")
print(cr_gb)

In [None]:
from sklearn.metrics import roc_auc_score, f1_score

In [None]:
# Hard SVM predictions and continuous decision-function scores for the
# held-out rows. NOTE: `y_rf_pred` holds SVM predictions after this cell.
y_rf_pred = svm_model.predict(X_rf_test)
y_svm_scores = svm_model.decision_function(X_rf_test)

# ROC AUC is computed from the continuous scores, F1 from the hard labels.
roc_auc_svm = roc_auc_score(y_rf_test, y_svm_scores)
f1_score_svm = f1_score(y_rf_test, y_rf_pred)

In [None]:
# 5-fold cross-validation of the SVM (`svm_model`) on the RF-selected features.
cv_scores_svm = cross_val_score(svm_model, X_rf, y_rf, cv=5)
average_cv_score_svm = cv_scores_svm.mean()

In [None]:
# ROC AUC for the tuned decision tree, computed from the predicted
# probability of class 1 (as for the random forest and XGBoost) rather than
# from hard 0/1 predictions, so the AUC values are comparable across models.
best_dt_scores = best_dt_model.predict_proba(X_logit_test)[:, 1]
roc_auc_dt = roc_auc_score(y_logit_test, best_dt_scores)

# F1 score for the tuned decision tree, from its hard predictions.
f1_score_dt = f1_score(y_logit_test, best_dt_pred)

# 5-fold cross-validation on the logit-selected features, i.e. the feature
# set the tree was tuned and evaluated on.
cv_scores_dt = cross_val_score(best_dt_model, X_logit, y_logit, cv=5)
average_cv_score_dt = cv_scores_dt.mean()

In [None]:
# ROC AUC for gradient boosting, computed from the predicted probability of
# class 1 rather than from hard 0/1 predictions.
y_gb_scores = gb_model.predict_proba(X_rf_test)[:, 1]
roc_auc_gb = roc_auc_score(y_rf_test, y_gb_scores)

# F1 score for gradient boosting, from its hard predictions.
f1_score_gb = f1_score(y_rf_test, y_gb_pred)

# 5-fold cross-validation on the RF-selected features.
cv_scores_gb = cross_val_score(gb_model, X_rf, y_rf, cv=5)
average_cv_score_gb = cv_scores_gb.mean()


In [None]:
# Refresh the random-forest predictions for the held-out rows; earlier cells
# reuse the name `y_rf_pred` for other models' predictions.
y_rf_true = y_rf_test
y_rf_pred = rf_model.predict(X_rf_test)

# F1 score of the random forest on the held-out rows.
f1_score_rf = f1_score(y_rf_true, y_rf_pred)

In [None]:
# F1 per model, each computed from that model's own predictions on its own
# held-out split (recomputed here so a stale `y_rf_pred` cannot leak in).
f1_score_rf = f1_score(y_rf_test, rf_model.predict(X_rf_test))
f1_score_xgb = f1_score(y_xgb_test, xgb_model.predict(X_xgb_test))

In [None]:
import pandas as pd

# One entry per model, in the same order in every metric list.
# The DecisionTree row uses `best_dt_accuracy` because its ROC AUC, F1 and
# cross-validation scores are all computed from the tuned tree (`best_dt_model`).
model_metrics = {
    'Model': ['RandomForest', 'SVM', 'DecisionTree', 'XGBoost', 'GradientBoosting'],
    'Accuracy': [rf_accuracy, svm_accuracy, best_dt_accuracy, xgb_accuracy, gb_accuracy],
    'ROC AUC': [roc_auc, roc_auc_svm, roc_auc_dt, roc_auc_xgb, roc_auc_gb],
    'F1 Score': [f1_score_rf, f1_score_svm, f1_score_dt, f1_score_xgb, f1_score_gb],
    'Cross-Validation Score': [average_cv_score, average_cv_score_svm, average_cv_score_dt, average_cv_score_xgb, average_cv_score_gb]
}

# Tabulate the metrics: one row per model, one column per metric.
summary_df = pd.DataFrame(model_metrics)

# Display the summary table.
print(summary_df)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Metrics to plot, one entry per model in the same order in every list.
# The DecisionTree row uses `best_dt_accuracy` because its ROC AUC and F1
# are computed from the tuned tree.
model_metrics = {
    'Model': ['RandomForest', 'SVM', 'DecisionTree', 'XGBoost', 'GradientBoosting'],
    'Accuracy': [rf_accuracy, svm_accuracy, best_dt_accuracy, xgb_accuracy, gb_accuracy],
    'ROC AUC': [roc_auc, roc_auc_svm, roc_auc_dt, roc_auc_xgb, roc_auc_gb],
    'F1 Score': [f1_score_rf, f1_score_svm, f1_score_dt, f1_score_xgb, f1_score_gb],
}

summary_df = pd.DataFrame(model_metrics)

# White background with no grid lines for all following plots.
sns.set(style="white", rc={"axes.grid": False})

# (metric column, plot title, output file) for each comparison chart.
comparison_plots = [
    ('Accuracy', 'Accuracy Comparison', 'accuracy_comparison.png'),
    ('ROC AUC', 'ROC AUC Comparison', 'roc_auc_comparison.png'),
    ('F1 Score', 'F1 Score Comparison', 'f1_score_comparison.png'),
]

for metric, title, filename in comparison_plots:
    plt.figure(figsize=(8, 5))
    sns.barplot(x='Model', y=metric, data=summary_df, palette="Blues_d")
    plt.title(title)

    # Save the plot before showing it.
    plt.savefig(filename, bbox_inches='tight', dpi=300)
    plt.show()

In [None]:
# import matplotlib.pyplot as plt
# import seaborn as sns

# # Create a DataFrame with your model metrics
# model_metrics = {
#     'Model': ['RandomForest', 'SVM', 'DecisionTree', 'XGBoost', 'GradientBoosting'],
#     'Accuracy': [rf_accuracy, svm_accuracy, dt_accuracy, xgb_accuracy, gb_accuracy],
#     'ROC AUC': [roc_auc, roc_auc_svm, roc_auc_dt, roc_auc_xgb, roc_auc_gb],
#     'F1 Score': [f1_score_rf, f1_score_svm, f1_score_dt, f1_score_xgb, f1_score_gb],
# }

# summary_df = pd.DataFrame(model_metrics)

# # Set the style of the plot
# sns.set(style="whitegrid")

# # Create a bar plot for Accuracy
# plt.figure(figsize=(8, 5))
# sns.barplot(x='Model', y='Accuracy', data=summary_df)
# plt.title('Accuracy Comparison')

# # Save the plot as an image
# plt.savefig('accuracy_comparison.png', bbox_inches='tight')

# # Show the plot
# plt.show()

# # Create a bar plot for ROC AUC
# plt.figure(figsize=(8, 5))
# sns.barplot(x='Model', y='ROC AUC', data=summary_df)
# plt.title('ROC AUC Comparison')

# # Save the plot as an image
# plt.savefig('roc_auc_comparison.png', bbox_inches='tight')

# # Show the plot
# plt.show()

# # Create a bar plot for F1 Score
# plt.figure(figsize=(8, 5))
# sns.barplot(x='Model', y='F1 Score', data=summary_df)
# plt.title('F1 Score Comparison')

# # Save the plot as an image
# plt.savefig('f1_score_comparison.png', bbox_inches='tight')

# # Show the plot
# plt.show()