In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestRegressor

## Explore the data

# Load the raw dataset; the path is relative to the working directory.
df=pd.read_csv("dataSetFinal.csv")
df 

# Keep the column labels in a variable and display them.
columns=df.columns
columns

## Checking data type


# Per-column dtypes, followed by the (rows, columns) shape of the frame.
df.dtypes

df.shape
# df.info 

# Preprocessing

## Data reduction

# Filing-metadata columns that are removed before any analysis.
columns_to_drop = [
    'cik', 'ticker', 'accessionNo', 'companyName',
    'fy', 'fp', 'form', 'filed',
]

# errors='ignore' skips any label that is not present in the frame.
df.drop(columns=columns_to_drop, errors='ignore', inplace=True)

df.shape

## Duplicate values

# Number of fully duplicated rows before clean-up.
df.duplicated().sum()

# drop_duplicates returns a new frame; rebind df to it and re-count the duplicates.
df= df.drop_duplicates()
df.duplicated().sum()

## Checking if there is null values


# Count of missing cells per column.
print("Missing values:")
print(df.isnull().sum())

import seaborn as sns
import matplotlib.pyplot as plt

# Long format: one row per (column, cell) with a boolean 'missing' flag.
df_melt = df.isnull().melt(var_name='variable', value_name='missing')


# Stacked horizontal bars: present vs. missing cell counts for each column.
plt.figure(figsize=(8, 6))
sns.histplot(
    data=df_melt,
    y='variable',
    hue='missing',
    multiple='stack',
    palette={True: 'lightcoral', False: 'lightblue'}
)

plt.title('Missing Data Visualization')
plt.xlabel('Count')
plt.ylabel('Variable')
plt.show()

# ## dendogramme
# # Create a boolean DataFrame showing where values are missing
# missing_data = df.isnull()

# # Perform hierarchical clustering using linkage
# Z = linkage(missing_data.T, method='ward')

# # Plotting the dendrogram
# plt.figure(figsize=(10, 8))
# dendrogram(Z, labels=missing_data.columns, leaf_rotation=90)

# plt.title('Missing Data Visualization 2')
# plt.xlabel('Variables')
# plt.ylabel('Distance')
# plt.show()

### Columns with more than 55% missing values are dropped


# The 55% cut-off is a project choice rather than a universal rule; always validate it against the problem, domain, and dataset

# Calculate the percentage of missing values for each column
# Share of missing cells per column, expressed in percent.
missing_percentages = df.isnull().mean() * 100

# Boolean mask of the columns above the 55% cut-off, computed once and reused.
over_threshold = missing_percentages > 55
columns_to_drop = missing_percentages.loc[over_threshold].sort_values(ascending=False)
columns_to_drop1 = missing_percentages.loc[over_threshold].index.tolist()

# Report the affected columns, worst first.
print("Columns with more than 55% missing values (in descending order):")
for column, percentage in columns_to_drop.items():
    print(f"{column}: {percentage:.2f}%")


#columns_to_drop = ['Noncurrent_Liabilities','ShortTerm_Debt','Nonoperating_Income','GrossProfit','Intangible_Assets','Current_Other_Assets','Noncurrent_Assets']

# errors='ignore' skips any label that is no longer present in the frame.
df.drop(columns=columns_to_drop1, errors='ignore', inplace=True)
df.shape

## Distribution between bankrupt and normal company 

# Feature matrix (every column except the label) and the label vector.
X = df.drop(['is_bankrupt'], axis=1)
y = df['is_bankrupt']
y.value_counts()

# Bar chart of the number of rows per value of is_bankrupt.
sns.countplot(x='is_bankrupt', data=df , palette=['skyblue', 'lightcoral'])
plt.title('Distribution of companies (successful vs is_bankrupt')
plt.show()

## Splitting data

# Split data into training and testing sets.
# stratify=y keeps the bankrupt/healthy proportion identical in both partitions;
# without it a random 80/20 split of an imbalanced target can leave the test set
# with very few (or no) bankrupt rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

### **Separate the majority and minority classes**

# Separating training data
# minority (rows labelled 1)
X_train_bankrupted = X_train[y_train == 1]
y_train_bankrupted = y_train[y_train == 1]
# majority (rows labelled 0)
X_train_successful = X_train[y_train == 0]
y_train_successful = y_train[y_train == 0]

# Separating testing data
# NOTE(review): the test partition is split by its true label here, so the
# label-specific preprocessing that follows uses information that would not be
# available at prediction time — confirm this is intended.
# minority (rows labelled 1)
X_test_bankrupted = X_test[y_test == 1]
y_test_bankrupted = y_test[y_test == 1]
# majority (rows labelled 0)
X_test_successful = X_test[y_test == 0]
y_test_successful = y_test[y_test == 0]


### **For the majority class, rows containing null values are dropped**

# Majority class, training partition: discard every row that has at least one
# missing value, then keep only the labels whose index survived.
X_train_successful = X_train_successful.dropna(axis=0, how='any')
y_train_successful = y_train_successful.loc[X_train_successful.index]

# Majority class, testing partition: same treatment.
X_test_successful = X_test_successful.dropna(axis=0, how='any')
y_test_successful = y_test_successful.loc[X_test_successful.index]


### **For the minority class, missing values are imputed**

### Multiple Imputation with Chained Equations (MICE)

# !pip install fancyimpute

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from fancyimpute import IterativeImputer
from sklearn.linear_model import LinearRegression

# Step 3: Visualize Missing Data for Bankrupted Cases (Initial State)
plt.figure(figsize=(8, 6))
sns.heatmap(X_train_bankrupted.isnull(), cbar=False, cmap='Reds')
plt.title('Initial Missing Data in Bankrupted Cases (Red Indicates Missing)')
plt.show()

# Step 4/5: Apply MICE for Bankrupted Cases.
# The imputer is fitted directly on the raw training rows. Pre-filling every
# column with one randomly drawn value (as was done before) leaves no NaN for
# the imputer to estimate, so the chained-equation models were fitted on
# fabricated values and the training data never received a MICE estimate.
# The random draw was also unseeded, which made the result irreproducible.
imputer_bankrupted = IterativeImputer(estimator=LinearRegression(), max_iter=10, random_state=0)
imputed_train_bankrupted = imputer_bankrupted.fit_transform(X_train_bankrupted)

# Convert the result back to a DataFrame (default RangeIndex; the partitions are
# re-indexed after concatenation anyway).
df_train_imputed_bankrupted = pd.DataFrame(imputed_train_bankrupted, columns=X_train_bankrupted.columns)

# Perform imputation on the test set for bankrupted cases, reusing the imputer
# fitted on the training rows (no re-fitting on test data).
imputed_test_bankrupted = imputer_bankrupted.transform(X_test_bankrupted)
df_test_imputed_bankrupted = pd.DataFrame(imputed_test_bankrupted, columns=X_test_bankrupted.columns)

# Step 6: Visualize Imputed Data for Bankrupted Cases (Final State)
plt.figure(figsize=(8, 6))
sns.heatmap(df_train_imputed_bankrupted.isnull(), cbar=False, cmap='Reds')
plt.title('Imputed Training Data for Bankrupted Cases (Red Indicates Remaining Missing)')
plt.show()


## Concat the data again


# Training set: imputed minority rows first, then the NaN-free majority rows.
# Features and labels are concatenated in the same order so they stay aligned
# once the index is reset below.
X_train = pd.concat([df_train_imputed_bankrupted, X_train_successful], axis=0)  
y_train = pd.concat([y_train_bankrupted, y_train_successful], axis=0)  

# Testing set: same order (minority, then majority).
X_test = pd.concat([df_test_imputed_bankrupted, X_test_successful], axis=0)  # Combine features
y_test = pd.concat([y_test_bankrupted, y_test_successful], axis=0)  # Combine targets

# Replace the mixed indices with a fresh 0..n-1 range on all four objects.
X_train.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)

X_test.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

# Remaining missing cells per training column.
print(X_train.isnull().sum())

# One box plot per column of df, laid out on a 4-column grid.
columns=df.columns

# Derive the number of rows from the number of columns instead of hard-coding
# a 5x4 grid, which raises as soon as df has more than 20 columns.
n_grid_cols = 4
n_grid_rows = max(1, -(-len(columns) // n_grid_cols))  # ceiling division

# Two inches of height per row (20x10 for the original five rows).
plt.figure(figsize=(20, 2 * n_grid_rows))

# Loop through the columns and create subplots (subplot indices are 1-based)
for i, column in enumerate(columns, 1):
    plt.subplot(n_grid_rows, n_grid_cols, i)
    sns.boxplot(y=df[column])
    plt.title(column)

# Adjust layout
plt.tight_layout()
plt.show()

# Histograms (with KDE overlay) of five selected columns of df.
plt.figure(figsize=(20, 5))

columns = ['Stockholder_Equity', 'Retained_Earnings', 'Working_capital', 'Liabilities', 'LongTerm_Debt']
rows, cols = 2, 3  # 2 rows x 3 columns; the sixth cell stays empty

# Common upper limit for the count axis of every panel
y_max = 1500

# Create subplots (subplot indices are 1-based, hence i + 1)
for i, column in enumerate(columns):
    plt.subplot(rows, cols, i + 1)
    sns.histplot(df[column], kde=True)
    plt.ylim(0, y_max)  # Set the y-axis (ordinate axis) limit
    plt.title(f'Distribution of {column}')

# Adjust layout
plt.tight_layout()

# Show the plot
plt.show()


# Same five histograms as a single row of panels.
plt.figure(figsize=(20, 5))  # Wide and short so that one row of five panels fits

columns = ['Stockholder_Equity', 'Retained_Earnings', 'Working_capital', 'Liabilities', 'LongTerm_Debt']
rows, cols = 1, 5  # 1 row, 5 columns

# Common upper limit for the count axis of every panel
y_max = 1500

# Create subplots (subplot indices are 1-based, hence i + 1)
for i, column in enumerate(columns):
    plt.subplot(rows, cols, i + 1)
    sns.histplot(df[column], kde=True)
    plt.ylim(0, y_max)  # Set the y-axis (ordinate axis) limit
    plt.title(f'Distribution of {column}')

# Adjust layout
plt.tight_layout()

# Show the plot
plt.show()


### Normalization

# RobustScaler was used without ever being imported; import it next to its use.
from sklearn.preprocessing import RobustScaler

# Initialize the RobustScaler
scaler = RobustScaler()

# Fit on the training data only, then transform it
X_train_scaled = scaler.fit_transform(X_train)

# Transform the test data with the statistics learned from the training data
# (no re-fitting, so no information flows from test to train)
X_test_scaled = scaler.transform(X_test)

# The scaler returns numpy arrays; wrap them back into DataFrames with the
# original column names
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

# Continue the pipeline with the scaled data. The second assignment previously
# targeted X_train again, which replaced the training features with the scaled
# TEST rows and left X_test unscaled.
X_train = X_train_scaled
X_test = X_test_scaled

# Display the first few rows of the scaled training data
print(X_train.head())


## features engineering **************************

# Define a function to calculate the 16 financial ratios used as extra features
def calculate_ratios(df):
    """Return a copy of *df* extended with 16 financial-ratio columns.

    The input frame is left untouched; previously the ratio columns were
    written into the caller's frame as a side effect. Callers that use the
    return value (``X = calculate_ratios(X)``) are unaffected.

    Ratio columns are appended in this order: R1, R3, R4, R5, R6, R7, R8, R9,
    R10, R12, R13, R14, R16, R17, R18, R19 (R2, R11 and R15 are not defined).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns read below (``Current_Assets``,
        ``Current_liabilities``, ``Liabilities``, ``Stockholder_Equity``,
        ``Working_capital``, ``Assets``, ``NetIncome``, ``Revenues``, ``Cash``,
        ``NetCash_OperatingActivities``, ``Earning_Before_Interest_And_Taxes``,
        ``InterestExpense``, ``Retained_Earnings``, ``LongTerm_Debt``,
        ``NetCash_InvestingActivities``, ``NetCash_FinancingActivities``).

    Returns
    -------
    pandas.DataFrame
        Copy of *df* with the ratio columns added.

    Raises
    ------
    KeyError
        If one of the required columns is missing.

    Notes
    -----
    Divisions are plain element-wise pandas divisions: a zero denominator
    yields ``inf``/``-inf`` (or ``NaN`` for 0/0); nothing is clipped here.
    """
    out = df.copy()

    # R1: Current Ratio = Current Assets / Current Liabilities
    out['R1'] = out['Current_Assets'] / out['Current_liabilities']

    # R3: Debt to Equity Ratio = Liabilities / Stockholder Equity
    out['R3'] = out['Liabilities'] / out['Stockholder_Equity']

    # R4: Working Capital Ratio = Working Capital / Total Assets
    out['R4'] = out['Working_capital'] / out['Assets']

    # R5: Net Income Margin = Net Income / Revenues
    out['R5'] = out['NetIncome'] / out['Revenues']

    # R6: Return on Assets (ROA) = Net Income / Total Assets
    out['R6'] = out['NetIncome'] / out['Assets']

    # R7: Return on Equity (ROE) = Net Income / Stockholder Equity
    out['R7'] = out['NetIncome'] / out['Stockholder_Equity']

    # R8: Cash Ratio = Cash / Current Liabilities
    out['R8'] = out['Cash'] / out['Current_liabilities']

    # R9: Operating Cash Flow to Total Debt Ratio = Net Cash Operating Activities / Total Liabilities
    out['R9'] = out['NetCash_OperatingActivities'] / out['Liabilities']

    # R10: Interest Coverage Ratio = Earnings Before Interest and Taxes (EBIT) / Interest Expense
    out['R10'] = out['Earning_Before_Interest_And_Taxes'] / out['InterestExpense']

    # R12: Debt to Assets Ratio = Liabilities / Total Assets
    out['R12'] = out['Liabilities'] / out['Assets']

    # R13: Net Working Capital to Revenues = Working Capital / Revenues
    out['R13'] = out['Working_capital'] / out['Revenues']

    # R14: Retained Earnings to Assets Ratio = Retained Earnings / Total Assets
    out['R14'] = out['Retained_Earnings'] / out['Assets']

    # R16: Long-Term Debt to Total Capitalization = Long-Term Debt / (Long-Term Debt + Stockholder Equity)
    out['R16'] = out['LongTerm_Debt'] / (out['LongTerm_Debt'] + out['Stockholder_Equity'])

    # R17: Cash Flow to Sales Ratio = Net Cash Operating Activities / Revenues
    out['R17'] = out['NetCash_OperatingActivities'] / out['Revenues']

    # R18: Investing Cash Flow to Assets Ratio = Net Cash Investing Activities / Total Assets
    out['R18'] = out['NetCash_InvestingActivities'] / out['Assets']

    # R19: Financing Cash Flow to Total Debt Ratio = Net Cash Financing Activities / Total Liabilities
    out['R19'] = out['NetCash_FinancingActivities'] / out['Liabilities']

    return out

# Apply the ratio calculations to both X_train and X_test
X_train = calculate_ratios(X_train)
X_test = calculate_ratios(X_test)


# Ratios are plain divisions, so count the NaN cells they may have produced
nan_values = X_train.isna().sum()
print("NaN values in each column:\n", nan_values)

# ... and the +/-inf cells (e.g. from a zero denominator)
inf_values = X_train.isin([np.inf, -np.inf]).sum()
print("Infinite values in each column:\n", inf_values)

# **Feature Selection**

### **Filter based Methods** 
Basic Statistical Filter Methods
VarianceThreshold (Remove the Constant and Quasi-Constant Features)
Remove Duplicate Features
Correlation & Ranking based statistical Filter Methods
Pearson’s correlation coefficient
Spearman’s rank coefficient
Kendall’s rank coefficient
Statistical Test based Methods
Anova or F-Test
Mutual Information
Chi Square


### ** Basic Statistical Filter Methods **

### 1) VarianceThreshold (Remove the Constant and Quasi-Constant Features)


from sklearn.feature_selection import VarianceThreshold
# Selector configured with a variance threshold of 0.01, fitted on the training features.
var_thres=VarianceThreshold(threshold=0.01)
var_thres.fit(X_train)

# Boolean mask: True for every feature the selector keeps.
var_thres.get_support()

# Number of features whose variance is greater than the set threshold value = 0.01
sum(var_thres.get_support())

There are no constant or quasi-constant features.

### 2) Duplicate columns

# Transpose so that every feature becomes a row; duplicated() then flags
# features whose values are identical to an earlier feature on all training rows.
X_train_t = X_train.T


# Print the duplicate features
print("Duplicate Features") 
print(X_train_t.duplicated(keep = 'first')) # keep : {'first', 'last', False}, default 'first'
duplicate_feat = X_train_t.duplicated().sum()
print("Count of duplicate_feature :",duplicate_feat)


# Identify rows where the two balance-sheet totals differ
difference_indices = X_train.index[X_train['Liabilities_And_StockholderEquity'] != X_train['Assets']].tolist()
print(f'Differences found at indices: {difference_indices}')
different_rows = X_train.loc[difference_indices]
print('Rows with different values:')
print(different_rows)

# Print the duplicate features Names only
duplicate_features = X_train_t[X_train_t.duplicated()].index.values
print("duplicate_features: ",duplicate_features)

# Drop the duplicate features from BOTH partitions. Previously only X_train was
# reduced, which left X_test with a different column set than the data the
# models are fitted on. errors='ignore' tolerates a label already absent
# from X_test.
X_train = X_train.drop(columns=duplicate_features)
X_test = X_test.drop(columns=duplicate_features, errors='ignore')
X_train.head()

# plt.figure(figsize=(19,20))
# sns.scatterplot(data=X_train)

### **  Correlation & Ranking based statistical Filter Methods **


### 1) Feature Selection with Pearson’s correlation coefficient


# cols=X_train.columns
# fig, axs = plt.subplots(5, 4, figsize=(20, 15)) 
# [sns.regplot(y = "y_train",x=i, data=X_train,scatter_kws={"color": "blue"}, 
#              line_kws={"color": "red"},ax=axs.flatten()[j]) for j,i in enumerate(cols[0:])]
# [axs.flatten()[j].set_title(i) for j,i in enumerate(cols[0:])]
# fig.tight_layout()
# plt.plot()
# cols

import matplotlib.pyplot as plt
import seaborn as sns

# Correlation matrix of X (pandas default method).
# NOTE(review): X was last assigned from df before the train/test split, so this
# heatmap presumably describes the unscaled, un-imputed features — confirm.
matrice_correlation = X.corr()

# Set the figure size (width, height) in inches to a larger size
plt.figure(figsize=(24, 16))  # Increased size for a bigger heatmap

# Plot the heatmap with the coefficient printed in every cell (two decimals)
sns.heatmap(matrice_correlation, annot=True, cmap=plt.cm.CMRmap_r, fmt=".2f")

# Set the title
plt.title('Matrice de corrélation')

# Show the plot
plt.show()

# Feature-to-feature correlation matrix on the training data (pandas default method)
corr_matrix = X_train.corr()

# Pearson correlation of every feature with the target: np.corrcoef returns a
# 2x2 matrix, [0, 1] is the feature/target coefficient
target_corr = X_train.apply(lambda x: np.corrcoef(x, y_train)[0, 1])

# Absolute correlation above which two features are treated as redundant
threshold = 0.8

# Group features whose pairwise |correlation| exceeds a threshold
def find_highly_correlated_groups(corr_matrix, threshold):
    """Return lists of mutually redundant feature names.

    Features are scanned in column order. Each feature that has not already
    been absorbed into an earlier group becomes an anchor; every *later*
    feature whose absolute correlation with the anchor is strictly greater
    than *threshold* joins the anchor's group. Only groups with at least two
    members are returned, anchor first.

    A feature can appear in more than one group: the "already grouped" check
    is applied to anchors only, not to the features being attached.
    """
    names = list(corr_matrix.columns)
    total = len(names)
    already_grouped = set()
    groups = []

    for row, anchor in enumerate(names):
        if anchor in already_grouped:
            continue

        # Only the upper triangle is inspected (columns to the right of `row`).
        partners = [
            names[col]
            for col in range(row + 1, total)
            if abs(corr_matrix.iloc[row, col]) > threshold
        ]
        already_grouped.update(partners)

        if partners:
            groups.append([anchor, *partners])

    return groups

# Groups of features whose pairwise |correlation| in corr_matrix exceeds the threshold
highly_correlated_groups = find_highly_correlated_groups(corr_matrix, threshold)

# Within each group keep the feature most correlated with the target, drop the rest
def find_features_to_drop(groups, target_corr):
    """Return the features to discard from each correlated group.

    For every group the member with the largest absolute value in
    *target_corr* is kept (the first such member on ties); all other members
    are appended to the result in group order. *target_corr* is anything
    indexable by feature name (dict, pandas Series, ...).
    """
    drop_list = []

    for members in groups:
        keeper = max(members, key=lambda name: abs(target_corr[name]))
        drop_list.extend(name for name in members if name != keeper)

    return drop_list

# From each correlated group keep the member most correlated with the target
features_to_drop = find_features_to_drop(highly_correlated_groups, target_corr)

# Print the features to drop
print(f"Features to drop: {features_to_drop}")

# Reduced copies of both partitions; X_train and X_test themselves are not modified
X_train_reduced = X_train.drop(columns=features_to_drop)
X_test_reduced = X_test.drop(columns=features_to_drop)

# Proceed with training your model on the reduced dataset


### 2) Feature Selection with Spearman’s correlation coefficient


# Feature-to-feature Spearman rank correlation on the training data
corr_matrix = X_train.corr(method='spearman')
plt.figure(figsize=(24,16))
sns.heatmap(corr_matrix,cmap = "RdYlGn",annot=True)
plt.show()


# Spearman correlation of every feature with the target.
# Spearman's coefficient is Pearson's coefficient computed on ranks, so both
# sides are ranked before np.corrcoef is applied. Previously the raw values
# were passed, which silently produced Pearson coefficients in this section.
target_ranks = y_train.rank()
target_corr = X_train.rank().apply(lambda x: np.corrcoef(x, target_ranks)[0, 1])

# Set the threshold for high correlation
threshold = 0.8

# Group features whose pairwise |correlation| exceeds a threshold
def find_highly_correlated_groups(corr_matrix, threshold):
    """Return lists of mutually redundant feature names.

    Features are scanned in column order. Each feature that has not already
    been absorbed into an earlier group becomes an anchor; every *later*
    feature whose absolute correlation with the anchor is strictly greater
    than *threshold* joins the anchor's group. Only groups with at least two
    members are returned, anchor first.

    A feature can appear in more than one group: the "already grouped" check
    is applied to anchors only, not to the features being attached.
    """
    names = list(corr_matrix.columns)
    total = len(names)
    already_grouped = set()
    groups = []

    for row, anchor in enumerate(names):
        if anchor in already_grouped:
            continue

        # Only the upper triangle is inspected (columns to the right of `row`).
        partners = [
            names[col]
            for col in range(row + 1, total)
            if abs(corr_matrix.iloc[row, col]) > threshold
        ]
        already_grouped.update(partners)

        if partners:
            groups.append([anchor, *partners])

    return groups

# Groups of features whose pairwise |correlation| in corr_matrix exceeds the threshold
highly_correlated_groups = find_highly_correlated_groups(corr_matrix, threshold)

# Within each group keep the feature most correlated with the target, drop the rest
def find_features_to_drop(groups, target_corr):
    """Return the features to discard from each correlated group.

    For every group the member with the largest absolute value in
    *target_corr* is kept (the first such member on ties); all other members
    are appended to the result in group order. *target_corr* is anything
    indexable by feature name (dict, pandas Series, ...).
    """
    drop_list = []

    for members in groups:
        keeper = max(members, key=lambda name: abs(target_corr[name]))
        drop_list.extend(name for name in members if name != keeper)

    return drop_list

# From each correlated group keep the member most correlated with the target
features_to_drop = find_features_to_drop(highly_correlated_groups, target_corr)

# Print the features to drop
print(f"Features to drop: {features_to_drop}")

# Reduced copies of both partitions; these rebind the X_train_reduced /
# X_test_reduced names assigned in the Pearson section
X_train_reduced = X_train.drop(columns=features_to_drop)
X_test_reduced = X_test.drop(columns=features_to_drop)

# Proceed with training your model on the reduced dataset


# Drop candidates reported by the Pearson analysis (recorded by hand)
features_to_drop_pearson = [
    'Assets',
    'Current_Assets',
    'Current_liabilities',
    'Liabilities_And_StockholderEquity',
    'Liabilities',
    'Earning_Before_Interest_And_Taxes',
    'NetCash_OperatingActivities',
]

# Drop candidates reported by the Spearman analysis (recorded by hand)
features_to_drop_spearman = [
    'Assets',
    'Current_Assets',
    'Current_liabilities',
    'Liabilities_And_StockholderEquity',
    'Revenues',
    'Liabilities',
    'Cash',
    'AccountsReceivable',
    'AccountsPayable',
    'InterestExpense',
    'LongTerm_Debt',
    'Earning_Before_Interest_And_Taxes',
]

# Set views of both lists for comparison
set_pearson = set(features_to_drop_pearson)
set_spearman = set(features_to_drop_spearman)

# Features flagged by exactly one method, and features flagged by both
unique_to_pearson = set_pearson.difference(set_spearman)
unique_to_spearman = set_spearman.difference(set_pearson)
overlapping_features = set_pearson.intersection(set_spearman)

# Everything flagged by at least one method
all_features_to_drop = unique_to_pearson | unique_to_spearman | overlapping_features

# Print results
print(f"Unique features to Pearson: {unique_to_pearson}")
print(f"Unique features to Spearman: {unique_to_spearman}")
print(f"Overlapping features: {overlapping_features}")
print(f"All features to drop: {all_features_to_drop}")

## **Decision**

# # Drop features from the training DataFrame (assuming X_train is your training DataFrame)
# X_train = X_train.drop(columns=all_features_to_drop, errors='ignore')

# # Drop features from the test DataFrame (assuming X_test is your test DataFrame)
# X_test = X_test.drop(columns=all_features_to_drop, errors='ignore')

# # Check the resulting DataFrames
# print("Training DataFrame after dropping features:")
# print(X_train.head())

# print("Test DataFrame after dropping features:")
# print(X_test.head())

### **Statistical Methods**

### 1) Feature Selection for Classification Problem using Mutual Information(MI)




from functools import partial

from sklearn.feature_selection import SelectKBest,mutual_info_classif
# https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_classif.html#id4

from sklearn.feature_selection import mutual_info_classif
# Mutual information between every feature and the target.
# The estimator is randomised for continuous features, so the seed is pinned
# (as is done for every other randomised step in this notebook); otherwise the
# ranking and the selected top-5 can change from run to run.
mutual_info = mutual_info_classif(X_train, y_train, random_state=42)
mutual_info

# Label the scores with the feature names and rank them
mutual_info = pd.Series(mutual_info)
mutual_info.index = X_train.columns
mutual_info.sort_values(ascending=False)

#let's plot the ordered mutual_info values per feature
mutual_info.sort_values(ascending=False).plot.bar(figsize=(20, 8))

from sklearn.feature_selection import SelectKBest
# Now select the top 5 features, scoring with the same seeded estimator
sel_five_cols = SelectKBest(partial(mutual_info_classif, random_state=42), k=5)
sel_five_cols.fit(X_train, y_train)
X_train.columns[sel_five_cols.get_support()]

# Index(['Stockholder_Equity', 'Retained_Earnings', 'Working_capital',
#        'Liabilities', 'LongTerm_Debt'],
#       dtype='object')

### **Wrapper Methods** 


### **Forward selection**



## **Balancing the data**

## undersampling

import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA

# Majority-class rows (label 0) of the current training set and their labels
X_train_successful = X_train[y_train == 0]
y_train_successful = y_train[y_train == 0]

# NOTE: this rebinds the module-level name X (previously the full feature matrix)
X = X_train_successful  # All features

# Elbow Method and Silhouette Score to find the best k
wcss = []  # Store WCSS values for Elbow method
silhouette_scores = []  # Store silhouette scores
K_range = range(2, 11)  # Testing k from 2 to 10

for k in K_range:
    # One K-Means fit per candidate k, with a fixed seed
    kmeans = KMeans(n_clusters=k, random_state=0)
    kmeans.fit(X)
    
    # Append WCSS (within-cluster sum of squares) for the Elbow method
    wcss.append(kmeans.inertia_)
    
    # Append silhouette score (quality of clustering)
    silhouette_avg = silhouette_score(X, kmeans.labels_)
    silhouette_scores.append(silhouette_avg)

# Side-by-side diagnostic plots: WCSS on the left, silhouette on the right
plt.figure(figsize=(12, 5))

# Elbow Method plot
plt.subplot(1, 2, 1)
plt.plot(K_range, wcss, marker='o')
plt.title('Elbow Method')
plt.xlabel('Number of clusters (k)')
plt.ylabel('WCSS (Within-cluster sum of squares)')
plt.grid(True)

# Plot Silhouette Scores
plt.subplot(1, 2, 2)
plt.plot(K_range, silhouette_scores, marker='o', color='orange')
plt.title('Silhouette Scores')
plt.xlabel('Number of clusters (k)')
plt.ylabel('Silhouette Score')
plt.grid(True)
plt.show()

# Differences between consecutive WCSS values (used by the automatic elbow
# detection that is currently disabled below)
wcss_differences = np.diff(wcss)

# # Find the point where the decrease slows down the most (the "elbow")
# elbow_k = K_range[np.argmin(np.abs(np.diff(wcss_differences))) + 1]
# print(f"The best number of clusters based on the Elbow Method is: {elbow_k}")

# k is fixed by hand instead
elbow_k=4

# X is a boolean-mask selection of X_train; take an explicit copy before adding
# a column so the assignment neither warns (SettingWithCopyWarning) nor depends
# on pandas' view/copy rules.
X = X.copy()

# Cluster on the feature columns only, so that re-running this cell does not
# feed a previous 'Cluster' label back into K-Means.
cluster_features = X.columns.drop('Cluster', errors='ignore')
kmeans = KMeans(n_clusters=elbow_k, random_state=0)
X['Cluster'] = kmeans.fit_predict(X[cluster_features])

# 2D visualization: project the feature columns onto two principal components
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X.drop('Cluster', axis=1))

plt.figure(figsize=(8, 6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=X['Cluster'], cmap='viridis')
plt.title(f'Clustering with k={elbow_k} (2D PCA)')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.colorbar()
plt.show()

# Display the first few rows of the clustered data
X.head()



# Number of clusters chosen from the elbow plot
elbow_k = 4
print(f"The best number of clusters based on the Elbow Method is: {elbow_k}")

# Work on an explicit copy and cluster on the feature columns only: X may
# already carry a 'Cluster' column from an earlier fit, and feeding that label
# back into K-Means would bias the new clustering.
X = X.copy()
cluster_features = X.columns.drop('Cluster', errors='ignore')
kmeans = KMeans(n_clusters=elbow_k, random_state=0)
X['Cluster'] = kmeans.fit_predict(X[cluster_features])

# One DataFrame per cluster label 0 .. elbow_k - 1 (keys are 1-based)
clustered_data = {}
for cluster in range(elbow_k):
    clustered_data[f'Cluster_{cluster + 1}'] = X[X['Cluster'] == cluster]

# Determine the number of samples to take from each cluster
total_samples = 200
samples_per_cluster = total_samples // elbow_k  # Equal samples from each cluster

# Undersample each cluster. The loop variable is deliberately not called `df`,
# which would overwrite the module-level dataset of that name.
undersampled_dfs = []
for key, cluster_df in clustered_data.items():
    if cluster_df.shape[0] >= samples_per_cluster:  # Ensure there are enough samples
        undersampled_dfs.append(cluster_df.sample(n=samples_per_cluster, random_state=0))
    else:
        # Clusters that are too small are reported and skipped
        print(f"Cluster {key} has only {cluster_df.shape[0]} samples, not enough to undersample to {samples_per_cluster}.")

# Combine the undersampled DataFrames (this name was printed below but never
# assigned, which raised NameError)
final_undersampled_df = pd.concat(undersampled_dfs, ignore_index=True)

# Display the final undersampled DataFrame
print("\nFinal Undersampled DataFrame:")
print(final_undersampled_df.head())
print(f"Total rows in final DataFrame: {final_undersampled_df.shape[0]}")

# Rebuild the training set as minority rows + undersampled majority rows and
# rebuild y_train to match. Previously X_train was replaced by the majority
# sample alone (including the helper 'Cluster' column) while y_train kept its
# old length, so features and labels no longer lined up.
minority_mask = y_train == 1
X_train_minority = X_train[minority_mask]
y_train_minority = y_train[minority_mask]

X_train_majority = final_undersampled_df.drop(columns='Cluster')
y_train_majority = pd.Series(
    0, index=X_train_majority.index, name=y_train.name, dtype=y_train.dtype
)

X_train = pd.concat([X_train_minority, X_train_majority], ignore_index=True)
y_train = pd.concat([y_train_minority, y_train_majority], ignore_index=True)








## Oversampling 

### Apply SMOTE


# #!pip install imbalanced-learn

# from imblearn.over_sampling import SMOTE
# from sklearn.datasets import make_classification

# smote = SMOTE(random_state=42)
# X_train, y_train = smote.fit_resample(X_train, y_train)

### Apply ADASYN

from imblearn.over_sampling import ADASYN

# Apply ADASYN to the training data only (the test set is left untouched)
adasyn = ADASYN(random_state=42)
X_train, y_train= adasyn.fit_resample(X_train, y_train)

# Class counts after resampling. value_counts orders by frequency, so the pie
# labels are looked up from the actual class values instead of being hard-coded
# in a fixed order (which could swap the two labels).
c=y_train.value_counts()
class_labels = {0: 'stay', 1: 'is_bankrupt'}
plt.pie(
    c,
    labels=[class_labels.get(value, str(value)) for value in c.index],
    autopct='%1.1f%%',
    startangle=90,
    colors=['skyblue', 'lightcoral'],
)
plt.title('Distribution des employés (is_bankrupt vs healthy)')
plt.show()
c

# Final shapes of all four partitions (the last two statements were garbled
# into a syntax error)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
# Modeling

**1- Random Forest**

A powerful ensemble learning method that builds multiple decision trees and combines their predictions to improve accuracy and robustness.

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, f1_score, roc_curve
from sklearn.model_selection import GridSearchCV, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# ================== Random Forest without Feature Selection ==================
# Baseline: 50 trees, fixed seed, fitted on every available feature
rf_w = RandomForestClassifier(random_state=100, n_estimators=50)
rf_w.fit(X_train, y_train)

# Predict on the test features and report plain accuracy
y_pred_rf_w = rf_w.predict(X_test)
accuracy_rf_w = accuracy_score(y_test, y_pred_rf_w)
print(f"Random Forest Accuracy (without feature selection): {accuracy_rf_w:.4f}")

# ================== Feature Importance (Without Feature Selection) ================
# Get feature importance from the trained Random Forest model
importances = rf_w.feature_importances_

# One row per feature, sorted by importance in ascending order for plotting.
# (A former `final_df.set_index('Importances')` call was removed: its result
# was discarded, so it had no effect.)
final_df = pd.DataFrame({"Features": X_train.columns, "Importances": importances})
final_df = final_df.sort_values('Importances')

# Plot the feature importances as a bar chart
plt.figure(figsize=(10, 3))
sns.barplot(x="Features", y="Importances", data=final_df)
# Rotate the tick labels after the bars exist, so the rotation is applied to
# the labels that are actually drawn
plt.xticks(rotation=45)
plt.title("Feature Importance (Random Forest without feature selection)")
plt.show()

# ================== Random Forest with RFE (Feature Selection) ==================
# Estimator whose feature importances drive the recursive elimination
model_tree = RandomForestClassifier(n_estimators=100, random_state=42)

# Keep 8 features, removing one feature per elimination round
sel_rfe_tree = RFE(estimator=model_tree, n_features_to_select=8, step=1)
sel_rfe_tree.fit(X_train, y_train)
X_train_rfe_tree = sel_rfe_tree.transform(X_train)
X_test_rfe_tree = sel_rfe_tree.transform(X_test)

# Boolean mask of the retained features and the rank assigned to every feature
print(f"Selected Features Support: {sel_rfe_tree.get_support()}")
print(f"Feature Rankings: {sel_rfe_tree.ranking_}")

# Fit a fresh forest on the reduced training matrix
rf_rfe = RandomForestClassifier(random_state=42)
rf_rfe.fit(X_train_rfe_tree, y_train)

# Accuracy on the reduced test matrix
y_pred_rf_rfe = rf_rfe.predict(X_test_rfe_tree)
accuracy_rf_rfe = accuracy_score(y_test, y_pred_rf_rfe)
print(f"Random Forest Accuracy (with RFE): {accuracy_rf_rfe:.4f}")

# Names of the retained features, in column order
selected_cols = list(X_train.columns[sel_rfe_tree.get_support()])
selected_cols

# ================== Hyperparameter Tuning on Selected Features ==================
# Search space: 3 x 3 x 2 x 2 = 36 parameter combinations
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Exhaustive search scored by ROC AUC with 5-fold stratified CV, using all cores
grid_search_rf_rfe = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid_rf,
    scoring='roc_auc',
    cv=StratifiedKFold(n_splits=5),
    verbose=1,
    n_jobs=-1
)

# Fit the Grid Search model on the RFE-reduced training matrix
# NOTE(review): y_train/X_train were oversampled earlier in the notebook, so the
# CV folds presumably contain synthetic rows — CV scores may be optimistic; confirm.
grid_search_rf_rfe.fit(X_train_rfe_tree, y_train)

# Get the best parameters and the tuned model
best_params_rf_rfe = grid_search_rf_rfe.best_params_
tuned_rf_model_rfe = grid_search_rf_rfe.best_estimator_

# Evaluate the tuned model on the RFE-reduced test matrix
y_pred_rf_rfe_tuned = tuned_rf_model_rfe.predict(X_test_rfe_tree)
accuracy_rf_rfe_tuned = accuracy_score(y_test, y_pred_rf_rfe_tuned)
print(f"Tuned Random Forest (with RFE) Accuracy: {accuracy_rf_rfe_tuned:.4f}")

**2- Decision Tree**

Decision Trees (DTs) are a non-parametric supervised learning method used for classification and regression. The goal is to create a model that predicts the value of a target variable by learning simple decision rules inferred from the data features.

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# --- Step 1: untuned reference tree ------------------------------------------
baseline_dt_model = DecisionTreeClassifier(random_state=42)
baseline_dt_model.fit(X_train, y_train)

# Hard labels for accuracy, positive-class probabilities for AUC
y_pred_baseline_dt_proba = baseline_dt_model.predict_proba(X_test)[:, 1]
y_pred_baseline_dt = baseline_dt_model.predict(X_test)

accuracy_baseline_dt = accuracy_score(y_test, y_pred_baseline_dt)
auc_baseline_dt = roc_auc_score(y_test, y_pred_baseline_dt_proba)
for _label, _value in (("Accuracy", accuracy_baseline_dt), ("AUC", auc_baseline_dt)):
    print(f"Baseline Decision Tree {_label}: {_value:.4f}")

# --- Step 2: grid search over tree shape and split criterion ------------------
param_grid_dt = dict(
    max_depth=[3, 5, 7, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    criterion=['gini', 'entropy'],
)

# Candidates are ranked by ROC AUC over 5 stratified folds
grid_search_dt = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid_dt,
    cv=StratifiedKFold(n_splits=5),
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1,
)
grid_search_dt.fit(X_train, y_train)

best_params_dt = grid_search_dt.best_params_
print(f"Best Decision Tree parameters found: {best_params_dt}")

# --- Step 3: cross-validate the winning configuration -------------------------
tuned_dt_model = grid_search_dt.best_estimator_

cv_results_dt = cross_val_score(
    tuned_dt_model,
    X_train,
    y_train,
    scoring='roc_auc',
    cv=StratifiedKFold(n_splits=5),
)
print(f"Cross-validation AUC scores (Decision Tree): {cv_results_dt}")
print(f"Mean Cross-validation AUC (Decision Tree): {cv_results_dt.mean():.4f}")

# --- Step 4: final fit on the training set and hold-out evaluation ------------
tuned_dt_model.fit(X_train, y_train)

y_pred_dt_proba = tuned_dt_model.predict_proba(X_test)[:, 1]
y_pred_dt = tuned_dt_model.predict(X_test)

# Label-based metrics use the hard predictions; AUC uses the probabilities
accuracy_dt = accuracy_score(y_test, y_pred_dt)
precision_dt = precision_score(y_test, y_pred_dt)
recall_dt = recall_score(y_test, y_pred_dt)
f1_dt = f1_score(y_test, y_pred_dt)
auc_dt = roc_auc_score(y_test, y_pred_dt_proba)

for _label, _value in (
    ("Accuracy", accuracy_dt),
    ("Precision", precision_dt),
    ("Recall", recall_dt),
    ("F1 Score", f1_dt),
    ("AUC", auc_dt),
):
    print(f"Tuned Decision Tree {_label}: {_value:.4f}")

# ROC curve of the tuned tree against the chance diagonal
fpr_dt, tpr_dt, thresholds_dt = roc_curve(y_test, y_pred_dt_proba)
plt.figure(figsize=(8, 6))
plt.plot(fpr_dt, tpr_dt, label=f'Decision Tree ROC (AUC = {auc_dt:.4f})', color='blue')
plt.plot([0, 1], [0, 1], linestyle='--', color='red')
plt.title('Decision Tree ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate (Recall)')
plt.legend()
plt.grid()
plt.show()


**1- SVM**

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# 1. Baseline SVM model (probability=True enables predict_proba for AUC)
baseline_svm_model = SVC(random_state=42, probability=True)
baseline_svm_model.fit(X_train, y_train)

# Baseline predictions
y_pred_baseline_svm = baseline_svm_model.predict(X_test)
y_pred_baseline_svm_proba = baseline_svm_model.predict_proba(X_test)[:, 1]

# Baseline model evaluation
accuracy_baseline_svm = accuracy_score(y_test, y_pred_baseline_svm)
auc_baseline_svm = roc_auc_score(y_test, y_pred_baseline_svm_proba)
print(f"Baseline SVM Accuracy: {accuracy_baseline_svm:.4f}")
print(f"Baseline SVM AUC: {auc_baseline_svm:.4f}")

# 2. Hyperparameter tuning with Grid Search for SVM
# param_grid_svm = {
#     'C': [0.1, 1, 10, 100],
#     'kernel': ['linear', 'rbf', 'poly'],
#     'gamma': ['scale', 'auto'],
#     'degree': [2, 3, 4]  # Only applicable for 'poly' kernel
# }
# One sub-grid per kernel: `degree` is only used by the 'poly' kernel, so
# crossing it with 'linear' would fit each linear candidate twice with
# identical results. This keeps the same distinct candidates (9 instead of 12).
param_grid_svm = [
    {'kernel': ['linear'], 'C': [0.1, 1, 10]},
    {'kernel': ['poly'], 'C': [0.1, 1, 10], 'degree': [2, 3]},
]

# GridSearchCV setup for SVM (ranked by ROC AUC over 5 stratified folds)
grid_search_svm = GridSearchCV(
    estimator=SVC(random_state=42, probability=True),
    param_grid=param_grid_svm,
    scoring='roc_auc',
    cv=StratifiedKFold(n_splits=5),
    verbose=1,
    n_jobs=-1
)

# Fit Grid Search for SVM
grid_search_svm.fit(X_train, y_train)

# Best parameters and model ('degree' is only reported when 'poly' wins)
best_params_svm = grid_search_svm.best_params_
print(f"Best SVM parameters found: {best_params_svm}")

# 3. Evaluate tuned model with cross-validation
tuned_svm_model = grid_search_svm.best_estimator_

# Cross-validation with the tuned SVM model (5 folds)
cv_results_svm = cross_val_score(tuned_svm_model, X_train, y_train, cv=StratifiedKFold(n_splits=5), scoring='roc_auc')
print(f"Cross-validation AUC scores (SVM): {cv_results_svm}")
print(f"Mean Cross-validation AUC (SVM): {cv_results_svm.mean():.4f}")

# 4. Final SVM model and evaluation
tuned_svm_model.fit(X_train, y_train)

# Make predictions (hard labels and positive-class probabilities)
y_pred_svm = tuned_svm_model.predict(X_test)
y_pred_svm_proba = tuned_svm_model.predict_proba(X_test)[:, 1]

# Evaluation metrics
accuracy_svm = accuracy_score(y_test, y_pred_svm)
precision_svm = precision_score(y_test, y_pred_svm)
recall_svm = recall_score(y_test, y_pred_svm)
f1_svm = f1_score(y_test, y_pred_svm)
auc_svm = roc_auc_score(y_test, y_pred_svm_proba)

print(f"Tuned SVM Accuracy: {accuracy_svm:.4f}")
print(f"Tuned SVM Precision: {precision_svm:.4f}")
print(f"Tuned SVM Recall: {recall_svm:.4f}")
print(f"Tuned SVM F1 Score: {f1_svm:.4f}")
print(f"Tuned SVM AUC: {auc_svm:.4f}")

# Plot ROC curve for SVM against the chance diagonal
fpr_svm, tpr_svm, thresholds_svm = roc_curve(y_test, y_pred_svm_proba)
plt.figure(figsize=(8, 6))
plt.plot(fpr_svm, tpr_svm, color='blue', label=f'SVM ROC (AUC = {auc_svm:.4f})')
plt.plot([0, 1], [0, 1], color='red', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate (Recall)')
plt.title('SVM ROC Curve')
plt.legend()
plt.grid()
plt.show()


**4- Fully connected neural network**

from keras.models import Sequential
from keras.layers import Dense, Dropout
# Small fully connected binary classifier: four ReLU hidden layers, one
# dropout layer, and a single sigmoid output unit.
model = Sequential([
    Dense(units=20, input_dim = X_train.shape[1], activation='relu'),
    Dense(units=24,activation='relu'),
    Dropout(0.5),
    Dense(units=20,activation='relu'),
    Dense(units=24,activation='relu'),
    Dense(1, activation='sigmoid')
])
model.summary()
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, batch_size=30, epochs=30)

# evaluate() returns [loss, accuracy] in the order given to compile()
score = model.evaluate(X_test, y_test)
print('Test Accuracy: {:.2f}%\nTest Loss: {}'.format(score[1]*100,score[0]))

# The single sigmoid unit yields one probability per row, shape (n_samples, 1).
# Round to hard 0/1 labels and flatten to 1-D for the confusion matrix.
# y_test is deliberately NOT rebound here: it is shared with the other model
# cells, so converting it in place would change the type they receive.
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred.round().ravel())
sns.heatmap(cm, annot=True, fmt='.0f')
plt.show()

**5- XGBoost**

An implementation of gradient boosting specifically designed to be efficient and effective. It often performs better than other boosting methods.

import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold

# Expects X_train, X_test, y_train, y_test to be defined already

# Constructor settings shared by the baseline and the tuned XGBoost classifiers
_xgb_settings = dict(objective='binary:logistic', eval_metric='auc', use_label_encoder=False)

# --- Step 1: baseline with library defaults -----------------------------------
baseline_model = xgb.XGBClassifier(**_xgb_settings)
baseline_model.fit(X_train, y_train)

# Hard labels for accuracy, positive-class probabilities for AUC
y_pred_baseline_proba = baseline_model.predict_proba(X_test)[:, 1]
y_pred_baseline = baseline_model.predict(X_test)

accuracy_baseline = accuracy_score(y_test, y_pred_baseline)
auc_baseline = roc_auc_score(y_test, y_pred_baseline_proba)
for _label, _value in (("Accuracy", accuracy_baseline), ("AUC", auc_baseline)):
    print(f"Baseline {_label}: {_value:.4f}")

# --- Step 2: grid search ------------------------------------------------------
param_grid = dict(
    max_depth=[4, 6, 8],
    learning_rate=[0.01, 0.1, 0.3],
    subsample=[0.7, 0.8, 0.9],
    colsample_bytree=[0.7, 0.8, 0.9],
    n_estimators=[100, 200],
)

# Candidates are ranked by ROC AUC over 5 stratified folds
grid_search = GridSearchCV(
    xgb.XGBClassifier(**_xgb_settings),
    param_grid,
    cv=StratifiedKFold(n_splits=5),
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print(f"Best parameters found: {best_params}")

# --- Step 3: cross-validate the winning configuration -------------------------
tuned_model = grid_search.best_estimator_

cv_results = cross_val_score(
    tuned_model,
    X_train,
    y_train,
    scoring='roc_auc',
    cv=StratifiedKFold(n_splits=5),
)
print(f"Cross-validation AUC scores: {cv_results}")
print(f"Mean Cross-validation AUC: {cv_results.mean():.4f}")

# --- Step 4: final fit on the full training data and hold-out evaluation ------
tuned_model.fit(X_train, y_train)

y_pred_proba = tuned_model.predict_proba(X_test)[:, 1]
y_pred = tuned_model.predict(X_test)

# Label-based metrics use the hard predictions; AUC uses the probabilities
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)

for _label, _value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
    ("AUC", auc),
):
    print(f"Tuned Model {_label}: {_value:.4f}")

# ROC curve of the tuned model against the chance diagonal
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {auc:.4f})', color='blue')
plt.plot([0, 1], [0, 1], linestyle='--', color='red')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate (Recall)')
plt.legend()
plt.grid()
plt.show()


**3- BaggingClassifier**

A Bagging classifier is an ensemble meta-estimator that fits base classifiers, each on a random subset of the original dataset, and then aggregates their individual predictions (either by voting or by averaging) to form a final prediction.


from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# --- Step 1: baseline bagging ensemble with default settings ------------------
baseline_bagging_model = BaggingClassifier(random_state=42)
baseline_bagging_model.fit(X_train, y_train)

# Hard labels for accuracy, positive-class probabilities for AUC
y_pred_baseline_bagging_proba = baseline_bagging_model.predict_proba(X_test)[:, 1]
y_pred_baseline_bagging = baseline_bagging_model.predict(X_test)

accuracy_baseline_bagging = accuracy_score(y_test, y_pred_baseline_bagging)
auc_baseline_bagging = roc_auc_score(y_test, y_pred_baseline_bagging_proba)
for _label, _value in (("Accuracy", accuracy_baseline_bagging), ("AUC", auc_baseline_bagging)):
    print(f"Baseline Bagging Classifier {_label}: {_value:.4f}")

# --- Step 2: grid search over ensemble size and sampling strategy -------------
param_grid_bagging = dict(
    n_estimators=[10, 50, 100],
    max_samples=[0.5, 0.7, 1.0],
    max_features=[0.5, 0.7, 1.0],
    bootstrap=[True, False],
    bootstrap_features=[True, False],
)

# Candidates are ranked by ROC AUC over 5 stratified folds
grid_search_bagging = GridSearchCV(
    BaggingClassifier(random_state=42),
    param_grid_bagging,
    cv=StratifiedKFold(n_splits=5),
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1,
)
grid_search_bagging.fit(X_train, y_train)

best_params_bagging = grid_search_bagging.best_params_
print(f"Best Bagging Classifier parameters found: {best_params_bagging}")

# --- Step 3: cross-validate the winning configuration -------------------------
tuned_bagging_model = grid_search_bagging.best_estimator_

cv_results_bagging = cross_val_score(
    tuned_bagging_model,
    X_train,
    y_train,
    scoring='roc_auc',
    cv=StratifiedKFold(n_splits=5),
)
print(f"Cross-validation AUC scores (Bagging Classifier): {cv_results_bagging}")
print(f"Mean Cross-validation AUC (Bagging Classifier): {cv_results_bagging.mean():.4f}")

# --- Step 4: final fit on the training set and hold-out evaluation ------------
tuned_bagging_model.fit(X_train, y_train)

y_pred_bagging_proba = tuned_bagging_model.predict_proba(X_test)[:, 1]
y_pred_bagging = tuned_bagging_model.predict(X_test)

# Label-based metrics use the hard predictions; AUC uses the probabilities
accuracy_bagging = accuracy_score(y_test, y_pred_bagging)
precision_bagging = precision_score(y_test, y_pred_bagging)
recall_bagging = recall_score(y_test, y_pred_bagging)
f1_bagging = f1_score(y_test, y_pred_bagging)
auc_bagging = roc_auc_score(y_test, y_pred_bagging_proba)

for _label, _value in (
    ("Accuracy", accuracy_bagging),
    ("Precision", precision_bagging),
    ("Recall", recall_bagging),
    ("F1 Score", f1_bagging),
    ("AUC", auc_bagging),
):
    print(f"Tuned Bagging Classifier {_label}: {_value:.4f}")

# ROC curve of the tuned ensemble against the chance diagonal
fpr_bagging, tpr_bagging, thresholds_bagging = roc_curve(y_test, y_pred_bagging_proba)
plt.figure(figsize=(8, 6))
plt.plot(fpr_bagging, tpr_bagging, label=f'Bagging Classifier ROC (AUC = {auc_bagging:.4f})', color='blue')
plt.plot([0, 1], [0, 1], linestyle='--', color='red')
plt.title('Bagging Classifier ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate (Recall)')
plt.legend()
plt.grid()
plt.show()


**4- Extra Trees Classifier (Extremely Randomized Trees)**

Similar to a Random Forest, but with a key difference in how trees are constructed. In Extra Trees, both the selection of the split points and the features to split on are randomized, which can lead to better performance in some cases.

from sklearn.ensemble import ExtraTreesClassifier

# --- Step 1: baseline Extra Trees ensemble with default settings --------------
baseline_extratree_model = ExtraTreesClassifier(random_state=42)
baseline_extratree_model.fit(X_train, y_train)

# Hard labels for accuracy, positive-class probabilities for AUC
y_pred_baseline_extratree_proba = baseline_extratree_model.predict_proba(X_test)[:, 1]
y_pred_baseline_extratree = baseline_extratree_model.predict(X_test)

accuracy_baseline_extratree = accuracy_score(y_test, y_pred_baseline_extratree)
auc_baseline_extratree = roc_auc_score(y_test, y_pred_baseline_extratree_proba)
for _label, _value in (("Accuracy", accuracy_baseline_extratree), ("AUC", auc_baseline_extratree)):
    print(f"Baseline ExtraTrees Classifier {_label}: {_value:.4f}")

# --- Step 2: grid search over forest size, tree shape and bootstrapping -------
param_grid_extratree = dict(
    n_estimators=[100, 200, 300],
    max_depth=[5, 10, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# Candidates are ranked by ROC AUC over 5 stratified folds
grid_search_extratree = GridSearchCV(
    ExtraTreesClassifier(random_state=42),
    param_grid_extratree,
    cv=StratifiedKFold(n_splits=5),
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1,
)
grid_search_extratree.fit(X_train, y_train)

best_params_extratree = grid_search_extratree.best_params_
print(f"Best ExtraTrees Classifier parameters found: {best_params_extratree}")

# --- Step 3: cross-validate the winning configuration -------------------------
tuned_extratree_model = grid_search_extratree.best_estimator_

cv_results_extratree = cross_val_score(
    tuned_extratree_model,
    X_train,
    y_train,
    scoring='roc_auc',
    cv=StratifiedKFold(n_splits=5),
)
print(f"Cross-validation AUC scores (ExtraTrees Classifier): {cv_results_extratree}")
print(f"Mean Cross-validation AUC (ExtraTrees Classifier): {cv_results_extratree.mean():.4f}")

# --- Step 4: final fit on the training set and hold-out evaluation ------------
tuned_extratree_model.fit(X_train, y_train)

y_pred_extratree_proba = tuned_extratree_model.predict_proba(X_test)[:, 1]
y_pred_extratree = tuned_extratree_model.predict(X_test)

# Label-based metrics use the hard predictions; AUC uses the probabilities
accuracy_extratree = accuracy_score(y_test, y_pred_extratree)
precision_extratree = precision_score(y_test, y_pred_extratree)
recall_extratree = recall_score(y_test, y_pred_extratree)
f1_extratree = f1_score(y_test, y_pred_extratree)
auc_extratree = roc_auc_score(y_test, y_pred_extratree_proba)

for _label, _value in (
    ("Accuracy", accuracy_extratree),
    ("Precision", precision_extratree),
    ("Recall", recall_extratree),
    ("F1 Score", f1_extratree),
    ("AUC", auc_extratree),
):
    print(f"Tuned ExtraTrees Classifier {_label}: {_value:.4f}")

# ROC curve of the tuned ensemble against the chance diagonal
fpr_extratree, tpr_extratree, thresholds_extratree = roc_curve(y_test, y_pred_extratree_proba)
plt.figure(figsize=(8, 6))
plt.plot(fpr_extratree, tpr_extratree, label=f'ExtraTrees Classifier ROC (AUC = {auc_extratree:.4f})', color='blue')
plt.plot([0, 1], [0, 1], linestyle='--', color='red')
plt.title('ExtraTrees Classifier ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate (Recall)')
plt.legend()
plt.grid()
plt.show()


**6- LightGBM Classifier**

A gradient boosting framework that uses decision trees and is designed to be distributed and efficient, especially on large datasets. It is known for its speed and performance.

import lightgbm as lgb
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
import matplotlib.pyplot as plt

# 1. Baseline LightGBM model (library defaults, fixed seed)
baseline_lgb_model = lgb.LGBMClassifier(random_state=42)
baseline_lgb_model.fit(X_train, y_train)

# Baseline predictions (hard labels and positive-class probabilities)
y_pred_baseline_lgb = baseline_lgb_model.predict(X_test)
y_pred_baseline_lgb_proba = baseline_lgb_model.predict_proba(X_test)[:, 1]

# Baseline model evaluation
accuracy_baseline_lgb = accuracy_score(y_test, y_pred_baseline_lgb)
auc_baseline_lgb = roc_auc_score(y_test, y_pred_baseline_lgb_proba)
print(f"Baseline LightGBM Accuracy: {accuracy_baseline_lgb:.4f}")
print(f"Baseline LightGBM AUC: {auc_baseline_lgb:.4f}")


# 2. Hyperparameter tuning with Grid Search for LightGBM
# NOTE: this grid has 5,184 candidates (25,920 fits with 5 folds), so the
# search is expensive.
param_grid_lgb = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 5, 7, -1],  # -1 means no limit
    'learning_rate': [0.01, 0.05, 0.1, 0.3],
    'num_leaves': [31, 63, 127],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'reg_alpha': [0, 0.01, 0.1],
    'reg_lambda': [0, 0.01, 0.1]
}

# GridSearchCV setup for LightGBM.
# subsample_freq=1 is required for the 'subsample' values above to matter:
# LightGBM only performs row bagging when the bagging frequency is non-zero,
# and the default is 0 (bagging disabled).
grid_search_lgb = GridSearchCV(
    estimator=lgb.LGBMClassifier(random_state=42, subsample_freq=1),
    param_grid=param_grid_lgb,
    scoring='roc_auc',
    cv=StratifiedKFold(n_splits=5),
    verbose=1,
    n_jobs=-1
)

# Fit Grid Search for LightGBM
grid_search_lgb.fit(X_train, y_train)

# Best parameters and model
best_params_lgb = grid_search_lgb.best_params_
print(f"Best LightGBM parameters found: {best_params_lgb}")


# 3. Evaluate tuned model with cross-validation
tuned_lgb_model = grid_search_lgb.best_estimator_

# Cross-validation with the tuned LightGBM model (5 folds)
cv_results_lgb = cross_val_score(tuned_lgb_model, X_train, y_train, cv=StratifiedKFold(n_splits=5), scoring='roc_auc')
print(f"Cross-validation AUC scores (LightGBM): {cv_results_lgb}")
print(f"Mean Cross-validation AUC (LightGBM): {cv_results_lgb.mean():.4f}")


# 4. Final LightGBM model and evaluation
tuned_lgb_model.fit(X_train, y_train)

# Make predictions (hard labels and positive-class probabilities)
y_pred_lgb = tuned_lgb_model.predict(X_test)
y_pred_lgb_proba = tuned_lgb_model.predict_proba(X_test)[:, 1]

# Evaluation metrics
accuracy_lgb = accuracy_score(y_test, y_pred_lgb)
precision_lgb = precision_score(y_test, y_pred_lgb)
recall_lgb = recall_score(y_test, y_pred_lgb)
f1_lgb = f1_score(y_test, y_pred_lgb)
auc_lgb = roc_auc_score(y_test, y_pred_lgb_proba)

print(f"Tuned LightGBM Accuracy: {accuracy_lgb:.4f}")
print(f"Tuned LightGBM Precision: {precision_lgb:.4f}")
print(f"Tuned LightGBM Recall: {recall_lgb:.4f}")
print(f"Tuned LightGBM F1 Score: {f1_lgb:.4f}")
print(f"Tuned LightGBM AUC: {auc_lgb:.4f}")

# Plot ROC curve for LightGBM against the chance diagonal
fpr_lgb, tpr_lgb, thresholds_lgb = roc_curve(y_test, y_pred_lgb_proba)
plt.figure(figsize=(8, 6))
plt.plot(fpr_lgb, tpr_lgb, color='blue', label=f'LightGBM ROC (AUC = {auc_lgb:.4f})')
plt.plot([0, 1], [0, 1], color='red', linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate (Recall)')
plt.title('LightGBM ROC Curve')
plt.legend()
plt.grid()
plt.show()


import pandas as pd
import matplotlib.pyplot as plt

# Collect the hold-out metrics of every tuned model into one table.
# Each *_dt / *_bagging / *_rf / *_extratree / unsuffixed (XGBoost) value must
# already exist from the corresponding model cell.
results = [
    {'Model': 'Decision Tree', 'Accuracy': accuracy_dt, 'Precision': precision_dt, 'Recall': recall_dt, 'F1 Score': f1_dt, 'AUC': auc_dt},
    {'Model': 'Bagging Classifier', 'Accuracy': accuracy_bagging, 'Precision': precision_bagging, 'Recall': recall_bagging, 'F1 Score': f1_bagging, 'AUC': auc_bagging},
    {'Model': 'Random Forest', 'Accuracy': accuracy_rf, 'Precision': precision_rf, 'Recall': recall_rf, 'F1 Score': f1_rf, 'AUC': auc_rf},
    {'Model': 'Extra Trees', 'Accuracy': accuracy_extratree, 'Precision': precision_extratree, 'Recall': recall_extratree, 'F1 Score': f1_extratree, 'AUC': auc_extratree},
    # {'Model': 'LightGBM', 'Accuracy': accuracy_lgb, 'Precision': precision_lgb, 'Recall': recall_lgb, 'F1 Score': f1_lgb, 'AUC': auc_lgb},
    {'Model': 'XGboost', 'Accuracy': accuracy, 'Precision': precision, 'Recall': recall, 'F1 Score': f1, 'AUC': auc},

]

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Print the results
print(results_df)

# Index by model name once so every bar chart shares the same x labels
metrics_by_model = results_df.set_index('Model')

# Figure 1: 2x2 grid with one bar chart per label-based metric
fig, ax = plt.subplots(2, 2, figsize=(16, 12))
for _metric, _axis, _colour in (
    ('Accuracy', ax[0, 0], 'skyblue'),
    ('Precision', ax[0, 1], 'salmon'),
    ('Recall', ax[1, 0], 'lightgreen'),
    ('F1 Score', ax[1, 1], 'orange'),
):
    metrics_by_model[_metric].plot(kind='bar', ax=_axis, color=_colour, legend=False)
    _axis.set_title(f'Model {_metric}')
    _axis.set_ylabel(_metric)
# Lay out this figure explicitly: plt.tight_layout() only affects the current
# figure, which will be the AUC figure once it is created below.
fig.tight_layout()

# Figure 2: AUC on its own axes
fig, ax2 = plt.subplots(figsize=(12, 6))
metrics_by_model['AUC'].plot(kind='bar', color='purple', legend=False, ax=ax2)
ax2.set_title('Model AUC')
ax2.set_ylabel('AUC')
fig.tight_layout()

plt.show()

In [None]:
# LSTM forecast of bankruptcy

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, roc_auc_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
import tensorflow as tf

# Load your dataset (placeholder path — replace with the actual file)
data = pd.read_csv('your_data.csv')

# Convert 'filed' to datetime and sort by date
data['filed'] = pd.to_datetime(data['filed'])
data = data.sort_values(by='filed')

# Create lag features (e.g., past values of financial metrics).
# NOTE(review): shift(1) lags across the whole date-sorted frame; if the file
# holds several companies, the "previous" row may belong to another company —
# confirm, or group by company before shifting.
data['Assets_lag1'] = data['Assets'].shift(1)
data['NetIncome_lag1'] = data['NetIncome'].shift(1)

# Features (lagged financial data) and target (is_bankrupt)
features = ['Assets_lag1', 'NetIncome_lag1']

# Drop only the rows the model cannot use: a missing lagged feature (caused by
# shifting, or by gaps in the source columns) or a missing target. A bare
# dropna() would also discard rows with a NaN in any unrelated column.
data.dropna(subset=features + ['is_bankrupt'], inplace=True)

X = data[features]
y = data['is_bankrupt']

# Split data into training and testing sets before scaling.
# shuffle=False keeps the date order, so the test set is the latest 20%.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Scale the features after the split to avoid data leakage
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Reshape input for LSTM [samples, timesteps, features] with a single timestep
X_train_scaled = X_train_scaled.reshape((X_train_scaled.shape[0], 1, X_train_scaled.shape[1]))
X_test_scaled = X_test_scaled.reshape((X_test_scaled.shape[0], 1, X_test_scaled.shape[1]))

# Define a function to build the LSTM model (required for KerasClassifier)
def create_model(units=50, dropout_rate=0.2):
    """Build and compile a one-layer LSTM binary classifier.

    The input shape (timesteps, features) is read from the module-level
    ``X_train_scaled`` array, which must exist when this is called.

    Args:
        units: Number of LSTM units.
        dropout_rate: Dropout fraction applied after the LSTM layer.

    Returns:
        A compiled Keras ``Sequential`` model with one sigmoid output.
    """
    timesteps, n_features = X_train_scaled.shape[1], X_train_scaled.shape[2]
    network = Sequential()
    for layer in (
        LSTM(units=units, return_sequences=False, input_shape=(timesteps, n_features)),
        Dropout(dropout_rate),
        Dense(1, activation='sigmoid'),  # Binary output (0 or 1)
    ):
        network.add(layer)
    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return network

# Wrap the model using KerasClassifier so scikit-learn can clone and tune it
model = KerasClassifier(build_fn=create_model, epochs=10, batch_size=32, verbose=0)

# Define the hyperparameter grid: 'units' and 'dropout_rate' are passed to
# create_model; 'batch_size' and 'epochs' control training.
param_grid = {
    'units': [50, 100],  # Number of LSTM units
    'dropout_rate': [0.2, 0.4],  # Dropout rates
    'batch_size': [32, 64],  # Batch sizes
    'epochs': [10, 20]  # Number of epochs
}

# Perform GridSearchCV for hyperparameter tuning (default scorer, 3 folds)
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=1)
grid_result = grid.fit(X_train_scaled, y_train)

# Get the best hyperparameters
print(f"Best: {grid_result.best_score_} using {grid_result.best_params_}")

# Use the best model to predict on the test set.
# predict() on the wrapper returns class labels, not probabilities, so the
# positive-class probability is taken from predict_proba() (column 1).
best_model = grid_result.best_estimator_
y_pred_prob = best_model.predict_proba(X_test_scaled)[:, 1]
y_pred = (y_pred_prob > 0.5).astype(int)  # Convert probabilities to binary predictions

# Evaluate the best model: label metrics on y_pred, ROC AUC on probabilities
print(classification_report(y_test, y_pred))
roc_auc = roc_auc_score(y_test, y_pred_prob)
print(f'ROC AUC: {roc_auc}')


# ARIMA (credit score)

import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

# Reuses the `data` frame that is already in memory
# data = pd.read_csv('your_data.csv')  # Replace with your actual data path

# Parse the filing date and put rows in chronological order
data['filed'] = pd.to_datetime(data['filed'])
data = data.sort_values(by='filed')

# CreditScore series indexed by filing date, missing scores removed
credit_score_data = data.set_index('filed')['CreditScore'].dropna()

# Chronological 80/20 split: earliest observations train, latest test
train_size = int(len(credit_score_data) * 0.8)
train_data = credit_score_data.iloc[:train_size]
test_data = credit_score_data.iloc[train_size:]

# ARIMA(5, 1, 0) fitted on the training portion only
model = ARIMA(train_data, order=(5, 1, 0))
model_fit = model.fit()

# One forecast step per held-out observation
forecast = model_fit.forecast(steps=len(test_data))

# Forecast error on the held-out portion
mse = mean_squared_error(test_data, forecast)
print(f'Mean Squared Error (MSE): {mse}')

# Training history, held-out actuals and forecast on one chart
plt.figure(figsize=(10, 6))
for _x_values, _y_values, _series_label in (
    (train_data.index, train_data, 'Train'),
    (test_data.index, test_data, 'Test'),
    (test_data.index, forecast, 'Forecast'),
):
    plt.plot(_x_values, _y_values, label=_series_label)
plt.title('CreditScore Forecast using ARIMA')
plt.xlabel('Date')
plt.ylabel('CreditScore')
plt.legend()
plt.show()

# transformers 


In [142]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

# Define Transformer model class
class TransformerModel(nn.Module):
    """Transformer encoder that regresses one value from an input sequence.

    Each time step is first projected from ``input_dim`` features to
    ``model_dim`` (the encoder only accepts inputs whose last dimension equals
    its ``d_model``), then passed through the encoder stack; the encoding of
    the last time step is mapped to a single output value.

    Args:
        input_dim: Number of features per time step in the input.
        model_dim: Encoder embedding size (``d_model``); must be divisible by
            ``num_heads``.
        num_heads: Number of attention heads per encoder layer.
        num_encoder_layers: Number of stacked encoder layers.
        dropout: Dropout probability used inside each encoder layer.
    """

    def __init__(self, input_dim, model_dim, num_heads, num_encoder_layers, dropout):
        super().__init__()
        # Lift raw features to the encoder width
        self.input_proj = nn.Linear(input_dim, model_dim)
        self.encoder_layer = nn.TransformerEncoderLayer(
            d_model=model_dim, nhead=num_heads, dropout=dropout
        )
        self.transformer_encoder = nn.TransformerEncoder(
            self.encoder_layer, num_layers=num_encoder_layers
        )
        self.fc = nn.Linear(model_dim, 1)

    def forward(self, src):
        """Return one prediction per sequence in the batch.

        Args:
            src: Tensor of shape (seq_len, batch, input_dim); the encoder
                layers are built with the default sequence-first layout.

        Returns:
            Tensor of shape (batch, 1), computed from the last time step.
        """
        embedded = self.input_proj(src)
        output = self.transformer_encoder(embedded)
        # output[-1] selects the final time step along the sequence dimension
        output = self.fc(output[-1])
        return output

# Convert 'filed' to datetime and sort by date
data['filed'] = pd.to_datetime(data['filed'])
data = data.sort_values(by='filed')

# CreditScore series indexed by filing date, missing scores removed
credit_score_data = data.set_index('filed')['CreditScore'].dropna()

# Chronological train-test split (first 80% train, last 20% test)
train_size = int(len(credit_score_data) * 0.8)
train_data, test_data = credit_score_data[:train_size], credit_score_data[train_size:]

# Normalize the data; the scaler is fitted on the training portion only
scaler = MinMaxScaler()
train_data_scaled = scaler.fit_transform(train_data.values.reshape(-1, 1))
test_data_scaled = scaler.transform(test_data.values.reshape(-1, 1))

# Convert to PyTorch tensors of shape (T, 1)
train_data_tensor = torch.tensor(train_data_scaled, dtype=torch.float32).view(-1, 1)
test_data_tensor = torch.tensor(test_data_scaled, dtype=torch.float32).view(-1, 1)

# Number of past observations the model sees to predict the next one
window_size = 10
# Number of windows processed per optimizer step / forward pass
batch_size = 256


def _make_windows(series, window):
    """Slice a (T, 1) series into one-step-ahead training pairs.

    Returns inputs of shape (window, T - window, 1), laid out as
    (seq_len, batch, features), and targets of shape (T - window, 1), where
    target i is the value immediately following window i.

    Raises:
        ValueError: If the series has no more than ``window`` observations.
    """
    n_windows = series.shape[0] - window
    if n_windows <= 0:
        raise ValueError(
            f"Need more than {window} observations to build windows, got {series.shape[0]}"
        )
    inputs = torch.stack([series[i:i + window] for i in range(n_windows)], dim=1)
    targets = series[window:]
    return inputs, targets


train_inputs, train_targets = _make_windows(train_data_tensor, window_size)

# Prepend the last training window so that every test point gets a forecast
# based only on the observations that precede it.
test_context = torch.cat([train_data_tensor[-window_size:], test_data_tensor], dim=0)
test_inputs, _ = _make_windows(test_context, window_size)

# Define Transformer model.
# NOTE: the model receives inputs of shape (window_size, batch, 1), so it must
# map the single input feature to model_dim internally.
model = TransformerModel(input_dim=1, model_dim=64, num_heads=4, num_encoder_layers=3, dropout=0.2)

# Define loss function and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop: mini-batches over the windows, one-step-ahead targets
model.train()
epochs = 100
n_train_windows = train_targets.shape[0]
for epoch in range(epochs):
    epoch_loss = 0.0
    for start in range(0, n_train_windows, batch_size):
        batch_x = train_inputs[:, start:start + batch_size]
        batch_y = train_targets[start:start + batch_size]
        optimizer.zero_grad()
        output = model(batch_x)
        loss = criterion(output, batch_y)
        loss.backward()
        optimizer.step()
        # Weight by batch size so the epoch figure is a per-sample mean
        epoch_loss += loss.item() * batch_y.shape[0]
    epoch_loss /= n_train_windows

    if (epoch+1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.4f}')

# Forecast on test data: one prediction per test observation
model.eval()
with torch.no_grad():
    predicted = torch.cat(
        [
            model(test_inputs[:, start:start + batch_size])
            for start in range(0, test_inputs.shape[1], batch_size)
        ],
        dim=0,
    )

# Inverse scale the predictions back to CreditScore units, shape (len(test), 1)
predicted_inverse = scaler.inverse_transform(predicted.detach().numpy())

# Plot actual vs predicted CreditScore
plt.figure(figsize=(10, 6))
plt.plot(test_data.index, test_data.values, label='Actual CreditScore')
plt.plot(test_data.index, predicted_inverse, label='Predicted CreditScore')
plt.title('CreditScore Forecast using Transformer')
plt.xlabel('Date')
plt.ylabel('CreditScore')
plt.legend()
plt.show()


ModuleNotFoundError: No module named 'torch'