<a href="https://colab.research.google.com/github/vldzio/Multimodal-Risk-Profiling/blob/main/US_bankrupt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive so files under /content/drive are reachable from this Colab runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc, precision_recall_curve, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from scipy.stats import norm


from sklearn.exceptions import ConvergenceWarning
import warnings
warnings.filterwarnings('ignore', category=ConvergenceWarning)

In [None]:
# Raw bankruptcy dataset, read straight from the mounted Drive folder.
raw_csv_path = "/content/drive/MyDrive/USBDT/american_bankruptcy.csv"
df = pd.read_csv(raw_csv_path)

In [None]:
# Preview the first five rows of the raw frame.
df.head(5)

In [None]:
def summary(df):
    """Print the shape of ``df`` and return a per-column overview table.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with its data type, missing-value count
        and percentage, number of unique values, the ``min``/``max`` reported
        by ``DataFrame.describe(include='all')``, and the values found in the
        first three rows. Entries that cannot be computed (no min/max for the
        frame, or fewer than three rows) are filled with NaN.
    """
    print(f'data shape: {df.shape}')
    summ = pd.DataFrame(df.dtypes, columns=['data type'])
    missing = df.isnull().sum().values
    summ['#missing'] = missing
    summ['%missing'] = missing / len(df) * 100
    summ['#unique'] = df.nunique().values
    desc = df.describe(include='all').transpose()
    # describe() only emits min/max when at least one column supports them.
    for stat in ('min', 'max'):
        summ[stat] = desc[stat].values if stat in desc.columns else np.nan
    # Positional access: the frame's index labels need not be 0, 1, 2
    # (e.g. after filtering or sorting), so .loc[0] could raise KeyError.
    for position, label in enumerate(('first value', 'second value', 'third value')):
        summ[label] = df.iloc[position].values if position < len(df) else np.nan

    return summ

# Column-level overview of the raw dataset.
summary(df=df)

In [None]:
# Count each status once, then derive the class shares from those counts.
status_counts = df['status_label'].value_counts()
alive_count = status_counts['alive']
failed_count = status_counts['failed']
total_count = alive_count + failed_count

alive_ratio = alive_count / total_count
failed_ratio = failed_count / total_count

print("Alive Ratio:", alive_ratio)
print("Failed Ratio:", failed_ratio)

In [None]:
# Absolute class counts, one per line.
print(alive_count, failed_count, sep="\n")

In [None]:
# Sanity check: the two class shares should add up to 1.
ratio_sum = alive_ratio + failed_ratio
print(ratio_sum)

In [None]:
import matplotlib.pyplot as plt

# Recompute the class shares so this cell can be run on its own.
status_counts = df['status_label'].value_counts()
alive_count = status_counts['alive']
failed_count = status_counts['failed']
total_count = alive_count + failed_count
alive_ratio = alive_count / total_count
failed_ratio = failed_count / total_count

# Pie configuration: the 'Alive' wedge is pulled out slightly.
labels = ['Alive', 'Failed']
sizes = [alive_ratio, failed_ratio]
colors = ['green', 'red']
explode = (0.1, 0)

plt.figure(figsize=(6, 6))
plt.pie(
    sizes,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=140,
)
plt.axis('equal')

# Save before show() so the written image is not an empty canvas.
plt.savefig('pie_chart.png', bbox_inches='tight', dpi=300)
plt.show()

In [None]:
import pandas as pd
from google.colab import drive

drive.mount('/content/drive')

# Input and output live in the same Drive folder.
folder_path = "/content/drive/MyDrive/USBDT/"
input_file = f"{folder_path}american_bankruptcy.csv"
output_file = f"{folder_path}preprocessed_df.csv"

df = pd.read_csv(input_file)

# Order each company's rows newest-year first, then keep that first row only,
# leaving one record (the most recent year) per company.
sorted_by_recency = df.sort_values(by=['company_name', 'year'], ascending=[True, False])
df_latest = sorted_by_recency.drop_duplicates(subset=['company_name'], keep='first')

df_latest.to_csv(output_file, index=False)

print(f"Filtered dataset saved as {output_file}")


In [None]:
# Read back the one-row-per-company file written by the previous cell.
preprocessed_df = pd.read_csv(output_file)
preprocessed_df.head(5)

In [None]:
# Map the dataset's coded column names (X1..X18) and lower-case identifiers
# to descriptive names used by every later cell.
rename_mapping = {
    "company_name": "Company_Name",
    "status_label": "Status_Label",
    "year": "Year",
    "X1": "Current_Assets",
    "X2": "Cost_of_Goods_Sold",
    "X3": "Depreciation_and_Amortization",
    "X4": "EBITDA",
    "X5": "Inventory",
    "X6": "Net_Income",
    "X7": "Total_Receivables",
    "X8": "Market_Value",
    "X9": "Net_Sales",
    "X10": "Total_Assets",
    "X11": "Total_Long_Term_Debt",
    "X12": "EBIT",
    "X13": "Gross_Profit",
    "X14": "Total_Current_Liabilities",
    "X15": "Retained_Earnings",
    "X16": "Total_Revenue",
    "X17": "Total_Liabilities",
    "X18": "Total_Operating_Expenses"
}

# Reassign instead of mutating in place so re-running the cell is harmless.
preprocessed_df = preprocessed_df.rename(columns=rename_mapping)

# This cell writes nothing to disk, so the message reports only the rename.
print("Preprocessed DataFrame columns have been renamed (not yet saved to Google Drive).")

# Calculate ratios for the Altman Z‑Score:


In [None]:
# Inputs of the Altman Z-Score; four of the five ratios share Total_Assets as denominator.
total_assets = preprocessed_df['Total_Assets']
working_capital = preprocessed_df['Current_Assets'] - preprocessed_df['Total_Current_Liabilities']

# 1. Working Capital / Total Assets
preprocessed_df['WC_to_TA'] = working_capital / total_assets

# 2. Retained Earnings / Total Assets
preprocessed_df['RE_to_TA'] = preprocessed_df['Retained_Earnings'] / total_assets

# 3. EBIT / Total Assets
preprocessed_df['EBIT_to_TA'] = preprocessed_df['EBIT'] / total_assets

# 4. Market Value of Equity / Total Liabilities
preprocessed_df['MVE_to_TL'] = preprocessed_df['Market_Value'] / preprocessed_df['Total_Liabilities']

# 5. Net Sales / Total Assets
preprocessed_df['Sales_to_TA'] = preprocessed_df['Net_Sales'] / total_assets


# Calculate ratios for the Zmijewski Model:


In [None]:
# Inputs of the Zmijewski model.
total_assets = preprocessed_df['Total_Assets']

# 6. Return on Assets (ROA) = Net Income / Total Assets
preprocessed_df['ROA'] = preprocessed_df['Net_Income'] / total_assets

# 7. Leverage = Total Liabilities / Total Assets
preprocessed_df['Leverage'] = preprocessed_df['Total_Liabilities'] / total_assets

# 8. Liquidity, measured as the current ratio (Current Assets / Current Liabilities)
current_liabilities = preprocessed_df['Total_Current_Liabilities']
preprocessed_df['Current_Ratio_Calc'] = preprocessed_df['Current_Assets'] / current_liabilities


# Calculate ratios for the Ohlson O‑Score:


In [None]:
# Inputs of the Ohlson O-Score used later in this notebook.
# Size: natural log of total assets.
preprocessed_df['ln_TA'] = np.log(preprocessed_df['Total_Assets'])
# Current liabilities relative to current assets (inverse of the current ratio).
preprocessed_df['CL_to_CA'] = (
    preprocessed_df['Total_Current_Liabilities'] / preprocessed_df['Current_Assets']
)
# Loss indicator: 1 when net income is negative, 0 otherwise.
made_a_loss = preprocessed_df['Net_Income'].lt(0)
preprocessed_df['Neg_Net_Income'] = made_a_loss.astype(int)

In [None]:
# Persist the frame, now including the computed ratio columns, back to Drive.
ratios_csv_path = '/content/drive/My Drive/USBDT/preprocessed_df.csv'
preprocessed_df.to_csv(ratios_csv_path, index=False)

print("Updated preprocessed_df.csv with computed ratios has been saved to Google Drive.")

verification

In [None]:
# Spot-check the engineered ratio columns on the first few companies.
ratio_columns = [
    'Company_Name', 'WC_to_TA', 'RE_to_TA', 'EBIT_to_TA',
    'MVE_to_TL', 'Sales_to_TA', 'ROA', 'Leverage', 'Current_Ratio_Calc',
    'ln_TA', 'Neg_Net_Income',
]
print(preprocessed_df[ratio_columns].head())

In [None]:
# Reload the saved file from the USBDT folder, where every other cell reads and
# writes preprocessed_df.csv; the previous path omitted that folder.
df = pd.read_csv('/content/drive/My Drive/USBDT/preprocessed_df.csv')

# Calculate the Altman Z‑Score
Formula: Z = 1.2*(WC_to_TA) + 1.4*(RE_to_TA) + 3.3*(EBIT_to_TA) + 0.6*(MVE_to_TL) + 1.0*(Sales_to_TA)

In [None]:
# Weighted ratio terms of the Altman Z-Score, accumulated left to right.
altman_terms = [
    1.2 * preprocessed_df['WC_to_TA'],
    1.4 * preprocessed_df['RE_to_TA'],
    3.3 * preprocessed_df['EBIT_to_TA'],
    0.6 * preprocessed_df['MVE_to_TL'],
    1.0 * preprocessed_df['Sales_to_TA'],
]
altman_z = altman_terms[0]
for term in altman_terms[1:]:
    altman_z = altman_z + term
preprocessed_df['Altman_Z'] = altman_z

 Assigning risk categories for the Altman Z‑Score (using common thresholds):
   Z > 2.99   => Safe (Low risk)
   1.81 < Z <= 2.99 => Gray Zone (Moderate risk)
   Z <= 1.81  => Distress (High risk)

In [None]:
def altman_risk(z):
    """Translate an Altman Z-Score into a risk zone.

    Returns 'Safe' when ``z`` is above 2.99, 'Gray' when it is above 1.81
    (and at most 2.99), and 'Distress' for everything else. A NaN score
    fails both comparisons and therefore also lands in 'Distress'.
    """
    if z > 2.99:
        return 'Safe'
    return 'Gray' if z > 1.81 else 'Distress'

# Label every company with its Altman zone.
preprocessed_df['Altman_Risk'] = preprocessed_df['Altman_Z'].map(altman_risk)

# Calculate the Zmijewski Score
Zmijewski_Score = -4.3 - 4.5 * ROA + 5.7 * Leverage - 0.004 * Current_Ratio_Calc

In [None]:
# Zmijewski score: intercept plus weighted profitability, leverage and liquidity.
roa = preprocessed_df['ROA']
leverage = preprocessed_df['Leverage']
current_ratio = preprocessed_df['Current_Ratio_Calc']

preprocessed_df['Zmijewski_Score'] = -4.3 - 4.5 * roa + 5.7 * leverage - 0.004 * current_ratio

Convert the Zmijewski Score into a bankruptcy probability using the standard normal CDF:
If Zmijewski_Prob > 0.5, classify as "High Risk"
Otherwise, classify as "Low Risk"

In [None]:
# Standard normal CDF turns the score into a probability in [0, 1].
preprocessed_df['Zmijewski_Prob'] = norm.cdf(preprocessed_df['Zmijewski_Score'])

# Probabilities strictly above 0.5 are flagged; anything else (NaN included,
# since NaN > 0.5 is False) is labelled 'Low Risk'.
preprocessed_df['Zmijewski_Risk'] = [
    'High Risk' if prob > 0.5 else 'Low Risk'
    for prob in preprocessed_df['Zmijewski_Prob']
]

# 3. Ohlson O‑Score
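O = -1.32 - 0.407*ln_TA + 6.03*(Total_Liabilities/Total_Assets)
 - 1.43*(WC_to_TA) + 0.0757*(CL_to_CA) - 2.37*(ROA) - 1.83*(FFOI) + 0.285*(Neg_Net_Income)

Note: the code cell below does not include the -1.83*(FFOI) term; it computes the score from the remaining terms only.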

In [None]:

# Ohlson-style score built from the terms available in this dataset.
total_assets = preprocessed_df['Total_Assets']
liabilities_to_assets = preprocessed_df['Total_Liabilities'] / total_assets
income_to_assets = preprocessed_df['Net_Income'] / total_assets

preprocessed_df['Ohlson_Score'] = (
    -1.32
    - 0.407 * preprocessed_df['ln_TA']
    + 6.03 * liabilities_to_assets
    - 1.43 * preprocessed_df['WC_to_TA']
    + 0.0757 * preprocessed_df['CL_to_CA']
    - 2.37 * income_to_assets
    + 0.285 * preprocessed_df['Neg_Net_Income']
)

Convert the Ohlson score to a bankruptcy probability using the logistic function:
Classify as 'High Risk' if probability > 0.5, else 'Low Risk'.

In [None]:
# Logistic function maps the score onto a probability in (0, 1).
ohlson_score = preprocessed_df['Ohlson_Score']
preprocessed_df['Ohlson_Prob'] = 1 / (1 + np.exp(-ohlson_score))

# Strictly above 0.5 is flagged as high risk; everything else is low risk.
preprocessed_df['Ohlson_Risk'] = preprocessed_df['Ohlson_Prob'].map(
    lambda prob: 'High Risk' if prob > 0.5 else 'Low Risk'
)

Combining risk profile metrics

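If Altman is in Distress, or either Zmijewski or Ohlson indicates High Risk, the company is labelled 'Risk'. A Gray-zone Altman score is also labelled 'Risk'. Everything else is labelled 'Low Risk'.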


In [None]:
def overall_risk(altman, zmijewski, ohlson):
    """Combine the three model labels into one overall label.

    Returns 'Risk' when the Altman label is 'Distress' or 'Gray', or when
    either the Zmijewski or the Ohlson label is 'High Risk'; otherwise
    returns 'Low Risk'.
    """
    altman_flag = altman in ('Distress', 'Gray')
    probability_flag = 'High Risk' in (zmijewski, ohlson)
    return 'Risk' if (altman_flag or probability_flag) else 'Low Risk'

# Combine the three per-model labels row by row.
preprocessed_df['Overall_Risk'] = [
    overall_risk(altman_label, zmijewski_label, ohlson_label)
    for altman_label, zmijewski_label, ohlson_label in zip(
        preprocessed_df['Altman_Risk'],
        preprocessed_df['Zmijewski_Risk'],
        preprocessed_df['Ohlson_Risk'],
    )
]

saving risk profiles

In [None]:
# Overwrite the Drive copy so it now carries the scores and risk labels.
risk_csv_path = '/content/drive/My Drive/USBDT/preprocessed_df.csv'
preprocessed_df.to_csv(risk_csv_path, index=False)
print("Risk profile values have been calculated and saved to preprocessed_df.csv.")

overall risk metric to bankruptcy accuracy check

Convert overall risk into binary predictions:

We'll assume that if Overall_Risk is "Risk" (the label the combining function assigns to at-risk companies), then we predict 'failed',

otherwise we predict 'alive'.

In [None]:
# Show the first rows of the scored frame together with the shape of `df`.
preview_rows = preprocessed_df.head()
preview_rows, df.shape

In [None]:
# Class counts after keeping one row per company; a missing class counts as 0.
label_counts = preprocessed_df['Status_Label'].value_counts()
label_counts.get('alive', 0), label_counts.get('failed', 0)

## Data Imbalance Handling
### Random undersampling

In [None]:
pip install imbalanced-learn

In [None]:
file_path = '/content/drive/My Drive/USBDT/preprocessed_df.csv'
df = pd.read_csv(file_path)

# Split the companies by outcome.
is_alive = df['Status_Label'] == 'alive'
is_failed = df['Status_Label'] == 'failed'
alive_df = df[is_alive]
failed_df = df[is_failed]

# Draw as many 'alive' rows as there are 'failed' rows, then shuffle the union.
n_failed = len(failed_df)
alive_sampled = alive_df.sample(n=n_failed, random_state=42)
undersampled_df = pd.concat([alive_sampled, failed_df])
undersampled_df = undersampled_df.sample(frac=1, random_state=42).reset_index(drop=True)

undersampled_file_path = '/content/drive/My Drive/USBDT/undersampled_df.csv'
undersampled_df.to_csv(undersampled_file_path, index=False)

print(undersampled_df['Status_Label'].value_counts())

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

file_path = "/content/drive/My Drive/USBDT/undersampled_df.csv"
df = pd.read_csv(file_path)

# Identifiers and the rule-based labels are not model inputs.
df = df.drop(columns=['Company_Name', 'Year', 'Altman_Risk', 'Zmijewski_Risk', 'Ohlson_Risk', 'Overall_Risk'])

X = df.drop(columns=['Status_Label'])  # Features
y = df['Status_Label']                 # Target

# --- Exported artefact (same file contents as before) -----------------------
# The whole undersampled set is standardised and written to Drive. It is
# scaled with statistics from ALL rows, so it is not used for the split below.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_normalized = pd.DataFrame(X_scaled, columns=X.columns)
X_normalized['Status_Label'] = y.reset_index(drop=True)

X_normalized.to_csv("/content/drive/My Drive/USBDT/undersampled_normalized_df.csv", index=False)

# --- Modelling split ---------------------------------------------------------
# Split the raw features first, then fit the scaler on the training rows only,
# so no test-set statistics leak into the features the models are trained on.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

train_scaler = StandardScaler().fit(X_train_raw)
X_train = pd.DataFrame(train_scaler.transform(X_train_raw), columns=X.columns, index=X_train_raw.index)
X_test = pd.DataFrame(train_scaler.transform(X_test_raw), columns=X.columns, index=X_test_raw.index)

print(f"Training Set: {X_train.shape}, Test Set: {X_test.shape}")
print("✅ Normalized undersampled dataset saved.")


### Logistic Regression on Undersampled dataset

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve

# Logistic-regression baseline on the undersampled split.
lr = LogisticRegression(max_iter=100)
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)
y_prob = lr.predict_proba(X_test)[:, 1]  # second probability column, scored against 'failed' = 1

# 0/1 view of the string labels for the ROC utilities.
y_test_binary = y_test.map({'alive': 0, 'failed': 1})

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, pos_label="failed")
    for scorer in (precision_score, recall_score, f1_score)
)
roc_auc = roc_auc_score(y_test_binary, y_prob)

for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
    ("AUC-ROC", roc_auc),
):
    print(f"{metric_name}: {metric_value:.4f}")

class_names = ['alive', 'failed']
plt.figure(figsize=(5, 4))
sns.heatmap(
    confusion_matrix(y_test, y_pred),
    annot=True,
    fmt='d',
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix - Logistic Regression")
plt.show()

fpr, tpr, _ = roc_curve(y_test_binary, y_prob)
plt.figure(figsize=(6, 5))
plt.plot(fpr, tpr, color='blue', label=f'Logistic Regression (AUC = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')  # chance diagonal
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - Logistic Regression")
plt.legend()
plt.show()


### Random forest on undersampled dataset

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve

# Random forest (100 trees, fixed seed) on the undersampled split.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
y_prob = rf.predict_proba(X_test)[:, 1]  # second probability column, used for the ROC curve

# 0/1 view of the string labels for the ROC utilities.
y_test_binary = y_test.map({'alive': 0, 'failed': 1})

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, pos_label="failed")
    for scorer in (precision_score, recall_score, f1_score)
)
roc_auc = roc_auc_score(y_test_binary, y_prob)

for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
    ("AUC-ROC", roc_auc),
):
    print(f"{metric_name}: {metric_value:.4f}")

class_names = ['alive', 'failed']
plt.figure(figsize=(5, 4))
sns.heatmap(
    confusion_matrix(y_test, y_pred),
    annot=True,
    fmt='d',
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix - Random Forest")
plt.show()

fpr, tpr, _ = roc_curve(y_test_binary, y_prob)
plt.figure(figsize=(6, 5))
plt.plot(fpr, tpr, color='blue', label=f'Random Forest (AUC = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')  # chance diagonal
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - Random Forest")
plt.legend()
plt.show()

### XGBoost on undersampled dataset

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve

# XGBoost is trained on 0/1 targets, so encode the string labels first.
label_to_id = {'alive': 0, 'failed': 1}
y_train_numeric = y_train.map(label_to_id)
y_test_numeric = y_test.map(label_to_id)

xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb.fit(X_train, y_train_numeric)

y_pred = xgb.predict(X_test)
y_prob = xgb.predict_proba(X_test)[:, 1]  # probability column for class 1 ('failed')

accuracy = accuracy_score(y_test_numeric, y_pred)
precision, recall, f1 = (
    scorer(y_test_numeric, y_pred)
    for scorer in (precision_score, recall_score, f1_score)
)
roc_auc = roc_auc_score(y_test_numeric, y_prob)

for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
    ("AUC-ROC", roc_auc),
):
    print(f"{metric_name}: {metric_value:.4f}")

class_names = ['alive', 'failed']
plt.figure(figsize=(5, 4))
sns.heatmap(
    confusion_matrix(y_test_numeric, y_pred),
    annot=True,
    fmt='d',
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix - XGBoost")
plt.show()

fpr, tpr, _ = roc_curve(y_test_numeric, y_prob)
plt.figure(figsize=(6, 5))
plt.plot(fpr, tpr, color='blue', label=f'XGBoost (AUC = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')  # chance diagonal
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - XGBoost")
plt.legend()
plt.show()

### SVM with undersampling

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve

# The SVM is trained on 0/1 targets, so encode the string labels first.
y_train_numeric = y_train.map({'alive': 0, 'failed': 1})
y_test_numeric = y_test.map({'alive': 0, 'failed': 1})

# probability=True makes SVC fit an internal cross-validated calibration that
# shuffles the data; fixing random_state keeps predict_proba (and therefore
# the AUC / ROC curve below) reproducible across runs, matching the seed used
# for the other models in this notebook.
svm = SVC(probability=True, random_state=42)
svm.fit(X_train, y_train_numeric)

y_pred = svm.predict(X_test)
y_prob = svm.predict_proba(X_test)[:, 1]  # probability column for class 1 ('failed')

accuracy = accuracy_score(y_test_numeric, y_pred)
precision = precision_score(y_test_numeric, y_pred)
recall = recall_score(y_test_numeric, y_pred)
f1 = f1_score(y_test_numeric, y_pred)
roc_auc = roc_auc_score(y_test_numeric, y_prob)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"AUC-ROC: {roc_auc:.4f}")

plt.figure(figsize=(5,4))
sns.heatmap(confusion_matrix(y_test_numeric, y_pred), annot=True, fmt='d', cmap="Blues", xticklabels=['alive', 'failed'], yticklabels=['alive', 'failed'])
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix - SVM")
plt.show()

fpr, tpr, _ = roc_curve(y_test_numeric, y_prob)
plt.figure(figsize=(6,5))
plt.plot(fpr, tpr, color='blue', label=f'SVM (AUC = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')  # chance diagonal
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - SVM")
plt.legend()
plt.show()

### KNN with undersampling

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve

# 5-nearest-neighbours classifier on the undersampled split.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)
y_prob = knn.predict_proba(X_test)[:, 1]  # second probability column, scored against 'failed' = 1

# 0/1 view of the string labels for the ROC utilities.
y_test_binary = y_test.map({'alive': 0, 'failed': 1})

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, pos_label="failed")
    for scorer in (precision_score, recall_score, f1_score)
)
roc_auc = roc_auc_score(y_test_binary, y_prob)

for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
    ("AUC-ROC", roc_auc),
):
    print(f"{metric_name}: {metric_value:.4f}")

class_names = ['alive', 'failed']
plt.figure(figsize=(5, 4))
sns.heatmap(
    confusion_matrix(y_test, y_pred),
    annot=True,
    fmt='d',
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix - KNN")
plt.show()

fpr, tpr, _ = roc_curve(y_test_binary, y_prob)
plt.figure(figsize=(6, 5))
plt.plot(fpr, tpr, color='blue', label=f'KNN (AUC = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')  # chance diagonal
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - KNN")
plt.legend()
plt.show()

### Using SMOTE on the same dataset

In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

file_path = "/content/drive/My Drive/USBDT/preprocessed_df.csv"
df = pd.read_csv(file_path)

# Identifiers and the rule-based labels are not model inputs.
df = df.drop(columns=['Company_Name', 'Year', 'Altman_Risk', 'Zmijewski_Risk', 'Ohlson_Risk', 'Overall_Risk'])

X = df.drop(columns=['Status_Label'])
y = df['Status_Label']

label_to_id = {'alive': 0, 'failed': 1}
id_to_label = {0: 'alive', 1: 'failed'}
y_numeric = y.map(label_to_id)

# --- Exported artefact (same file contents as before) -----------------------
# SMOTE + scaling over the WHOLE dataset, written to Drive. Because synthetic
# rows are interpolated from all companies, this frame must not be split into
# train/test for evaluation; it is kept only so the saved CSV is unchanged.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y_numeric)

scaler = StandardScaler()
X_resampled_scaled = scaler.fit_transform(X_resampled)

X_resampled_normalized = pd.DataFrame(X_resampled_scaled, columns=X.columns)

y_resampled = pd.Series(y_resampled).map(id_to_label)

oversampled_normalized_df = pd.concat([X_resampled_normalized, y_resampled.rename("Status_Label")], axis=1)

oversampled_path = "/content/drive/My Drive/USBDT/oversampled_normalized_df.csv"
oversampled_normalized_df.to_csv(oversampled_path, index=False)

print(f"✅ Normalized oversampled dataset saved at: {oversampled_path}")

# --- Modelling split ---------------------------------------------------------
# 1) Hold out real companies first, 2) fit the scaler on the training rows only,
# 3) oversample the training rows only. The test set therefore contains no
# synthetic points and keeps the original class balance, and nothing derived
# from test rows reaches the models during training.
X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

train_scaler = StandardScaler().fit(X_train_raw)
X_train_scaled = pd.DataFrame(train_scaler.transform(X_train_raw), columns=X.columns)
X_test = pd.DataFrame(train_scaler.transform(X_test_raw), columns=X.columns, index=X_test_raw.index)

X_train, y_train_ids = SMOTE(random_state=42).fit_resample(
    X_train_scaled,
    y_train_raw.map(label_to_id).reset_index(drop=True)
)
# Later cells expect string labels ('alive' / 'failed') in y_train and y_test.
y_train = pd.Series(y_train_ids).map(id_to_label)

print(f"Training Set (after SMOTE): {X_train.shape}, Test Set (original class balance): {X_test.shape}")


Logistic regression on SMOTE

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve

# Logistic regression (default settings) on the SMOTE split.
lr = LogisticRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)
y_prob = lr.predict_proba(X_test)[:, 1]  # second probability column, scored against 'failed' = 1

# 0/1 view of the string labels for the ROC utilities.
y_test_binary = y_test.map({'alive': 0, 'failed': 1})

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, pos_label="failed")
    for scorer in (precision_score, recall_score, f1_score)
)
roc_auc = roc_auc_score(y_test_binary, y_prob)

for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
    ("AUC-ROC", roc_auc),
):
    print(f"{metric_name}: {metric_value:.4f}")

class_names = ['alive', 'failed']
plt.figure(figsize=(5, 4))
sns.heatmap(
    confusion_matrix(y_test, y_pred),
    annot=True,
    fmt='d',
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix - Logistic Regression")
plt.show()

fpr, tpr, _ = roc_curve(y_test_binary, y_prob)
plt.figure(figsize=(6, 5))
plt.plot(fpr, tpr, color='blue', label=f'Logistic Regression (AUC = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')  # chance diagonal
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - Logistic Regression")
plt.legend()
plt.show()

Random Forest on SMOTE

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve

# Random forest (100 trees, fixed seed) on the SMOTE split.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
y_prob = rf.predict_proba(X_test)[:, 1]  # second probability column, scored against 'failed' = 1

# 0/1 view of the string labels for the ROC utilities.
y_test_binary = y_test.map({'alive': 0, 'failed': 1})

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, pos_label="failed")
    for scorer in (precision_score, recall_score, f1_score)
)
roc_auc = roc_auc_score(y_test_binary, y_prob)

for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
    ("AUC-ROC", roc_auc),
):
    print(f"{metric_name}: {metric_value:.4f}")

class_names = ['alive', 'failed']
plt.figure(figsize=(5, 4))
sns.heatmap(
    confusion_matrix(y_test, y_pred),
    annot=True,
    fmt='d',
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix - Random Forest")
plt.show()

fpr, tpr, _ = roc_curve(y_test_binary, y_prob)
plt.figure(figsize=(6, 5))
plt.plot(fpr, tpr, color='blue', label=f'Random Forest (AUC = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')  # chance diagonal
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - Random Forest")
plt.legend()
plt.show()

XGBoost on SMOTE

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve

# XGBoost is trained and scored on 0/1 targets; encode the string labels once.
label_to_id = {'alive': 0, 'failed': 1}
y_train_binary = y_train.map(label_to_id)
y_test_binary = y_test.map(label_to_id)

xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb.fit(X_train, y_train_binary)

y_pred = xgb.predict(X_test)
y_prob = xgb.predict_proba(X_test)[:, 1]  # probability column for class 1 ('failed')

accuracy = accuracy_score(y_test_binary, y_pred)
precision, recall, f1 = (
    scorer(y_test_binary, y_pred)
    for scorer in (precision_score, recall_score, f1_score)
)
roc_auc = roc_auc_score(y_test_binary, y_prob)

for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
    ("AUC-ROC", roc_auc),
):
    print(f"{metric_name}: {metric_value:.4f}")

class_names = ['alive', 'failed']
plt.figure(figsize=(5, 4))
sns.heatmap(
    confusion_matrix(y_test_binary, y_pred),
    annot=True,
    fmt='d',
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix - XGBoost")
plt.show()

fpr, tpr, _ = roc_curve(y_test_binary, y_prob)
plt.figure(figsize=(6, 5))
plt.plot(fpr, tpr, color='blue', label=f'XGBoost (AUC = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')  # chance diagonal
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - XGBoost")
plt.legend()
plt.show()

SVM on SMOTE

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve

# probability=True makes SVC fit an internal cross-validated calibration that
# shuffles the data; fixing random_state keeps predict_proba (and therefore
# the AUC / ROC curve below) reproducible across runs, matching the seed used
# for the other models in this notebook.
svm = SVC(probability=True, random_state=42)
svm.fit(X_train, y_train)

y_pred = svm.predict(X_test)
y_prob = svm.predict_proba(X_test)[:, 1]  # second probability column, scored against 'failed' = 1


accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label="failed")
recall = recall_score(y_test, y_pred, pos_label="failed")
f1 = f1_score(y_test, y_pred, pos_label="failed")
roc_auc = roc_auc_score(y_test.map({'alive': 0, 'failed': 1}), y_prob)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"AUC-ROC: {roc_auc:.4f}")

plt.figure(figsize=(5,4))
sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap="Blues", xticklabels=['alive', 'failed'], yticklabels=['alive', 'failed'])
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix - SVM")
plt.show()

fpr, tpr, _ = roc_curve(y_test.map({'alive': 0, 'failed': 1}), y_prob)
plt.figure(figsize=(6,5))
plt.plot(fpr, tpr, color='blue', label=f'SVM (AUC = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')  # chance diagonal
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - SVM")
plt.legend()
plt.show()

KNN on SMOTE

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve

# 5-nearest-neighbours classifier on the SMOTE split.
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)
y_prob = knn.predict_proba(X_test)[:, 1]  # second probability column, scored against 'failed' = 1

# 0/1 view of the string labels for the ROC utilities.
y_test_binary = y_test.map({'alive': 0, 'failed': 1})

accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, pos_label="failed")
    for scorer in (precision_score, recall_score, f1_score)
)
roc_auc = roc_auc_score(y_test_binary, y_prob)

for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
    ("AUC-ROC", roc_auc),
):
    print(f"{metric_name}: {metric_value:.4f}")

class_names = ['alive', 'failed']
plt.figure(figsize=(5, 4))
sns.heatmap(
    confusion_matrix(y_test, y_pred),
    annot=True,
    fmt='d',
    cmap="Blues",
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix - KNN")
plt.show()

fpr, tpr, _ = roc_curve(y_test_binary, y_prob)
plt.figure(figsize=(6, 5))
plt.plot(fpr, tpr, color='blue', label=f'KNN (AUC = {roc_auc:.4f})')
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')  # chance diagonal
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - KNN")
plt.legend()
plt.show()

# Sentiment analysis

In [None]:
!pip install transformers torch

In [None]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Pretrained FinBERT tone classifier and its matching tokenizer.
MODEL_NAME = "yiyanghkust/finbert-tone"
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification.from_pretrained(MODEL_NAME)
model = model.to(device)
# Switch to inference mode; as the last expression this also displays the model.
model.eval()

In [None]:
def predict_sentiment(text):
    """Classify the tone of a single piece of text with the loaded model.

    Parameters
    ----------
    text : str
        Text to classify; input beyond 512 tokens is truncated.

    Returns
    -------
    str
        The label name that the model's own configuration assigns to the
        highest-scoring class.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)

    with torch.no_grad():  # inference only, no gradients needed
        logits = model(**inputs).logits
    predicted_class = torch.argmax(logits, dim=1).item()

    # Read the id -> name mapping from the checkpoint's config instead of
    # hard-coding it. finbert-tone's model card lists 0 = neutral,
    # 1 = positive, 2 = negative; the previous literal map
    # {0: Negative, 1: Neutral, 2: Positive} mislabelled its predictions.
    return model.config.id2label[predicted_class]


In [None]:
def batch_predict(texts):
    """Classify the tone of several texts in one forward pass.

    Parameters
    ----------
    texts : list of str
        Texts to classify; each is padded to the longest one in the batch
        and truncated at 512 tokens.

    Returns
    -------
    list of str
        One label name per input text, in input order, taken from the
        model's own configuration. An empty input yields an empty list.
    """
    if not texts:
        # Nothing to tokenize; skip the model call.
        return []

    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=512).to(device)

    with torch.no_grad():  # inference only, no gradients needed
        predicted_classes = torch.argmax(model(**inputs).logits, dim=1).tolist()

    # Use the checkpoint's id -> name mapping rather than a hard-coded one.
    # finbert-tone's model card lists 0 = neutral, 1 = positive, 2 = negative;
    # the previous literal map {0: Negative, 1: Neutral, 2: Positive}
    # mislabelled its predictions.
    id2label = model.config.id2label
    return [id2label[class_id] for class_id in predicted_classes]


In [None]:
# Single-sentence check.
sample_text = "The company's earnings report exceeded expectations, driving stock prices up."
sample_sentiment = predict_sentiment(sample_text)
print("Sentiment:", sample_sentiment)

# Batch check on three headlines.
financial_texts = [
    "The market is expected to see a major downturn following weak economic data.",
    "Company profits surged 15% last quarter.",
    "Investors remain cautious as geopolitical risks increase."
]
batch_sentiments = batch_predict(financial_texts)
print("Batch Sentiments:", batch_sentiments)


In [None]:
from google.colab import drive
import pandas as pd

drive.mount('/content/drive')

# Labelled sentences used to fine-tune the sentiment model.
# Note: this rebinds `df`, replacing the bankruptcy frame used earlier.
data_path = '/content/drive/MyDrive/Phrasebank/phrasebank_data.csv'
df = pd.read_csv(data_path)

df.head(5)


In [None]:
from sklearn.model_selection import train_test_split

# Encode the labels with the ids of the pretrained finbert-tone classifier head
# (0 = neutral, 1 = positive, 2 = negative per its model card). The previous
# ordering (negative 0, neutral 1, positive 2) was permuted relative to that
# head, so fine-tuning first had to unlearn the pretrained output layer.
# NOTE(review): any id -> name lookup applied to this model's outputs must use
# the same ordering.
label_map = {'neutral': 0, 'positive': 1, 'negative': 2}
df['Sentiment'] = df['Sentiment'].map(label_map)

import re
def clean_text(text):
    """Remove cashtags, URLs and all non-letter characters from ``text``.

    The substitutions run in order: ``$``-prefixed words, then anything that
    starts with ``http`` up to the next whitespace, then every character that
    is neither an ASCII letter nor whitespace. Leading and trailing
    whitespace is stripped from the result.
    """
    removal_patterns = (r'\$\w+', r'http\S+', r'[^a-zA-Z\s]')
    for pattern in removal_patterns:
        text = re.sub(pattern, '', text)
    return text.strip()

# Normalise every sentence before tokenisation.
df['Sentence'] = df['Sentence'].map(clean_text)

# 80/20 train/validation split with a fixed seed.
sentences = df['Sentence'].tolist()
sentiment_ids = df['Sentiment'].tolist()
train_texts, val_texts, train_labels, val_labels = train_test_split(
    sentences,
    sentiment_ids,
    test_size=0.2,
    random_state=42,
)

In [None]:
!pip install datasets

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
# Tokenise both splits with identical settings (truncate at 512, pad per call).
encode_kwargs = dict(truncation=True, padding=True, max_length=512)
train_encodings = tokenizer(train_texts, **encode_kwargs)
val_encodings = tokenizer(val_texts, **encode_kwargs)


def _as_dataset(encodings, labels):
    """Bundle token ids, attention mask and labels into a `datasets.Dataset`."""
    return Dataset.from_dict({
        'input_ids': encodings['input_ids'],
        'attention_mask': encodings['attention_mask'],
        'labels': labels,
    })


train_dataset = _as_dataset(train_encodings, train_labels)
val_dataset = _as_dataset(val_encodings, val_labels)


In [None]:
# Run 1: four epochs at lr 2e-5, evaluating and checkpointing every epoch and
# restoring the best checkpoint when training ends.
run1_config = dict(
    output_dir="./finbert_finetuned",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    load_best_model_at_end=True,
)
training_args = TrainingArguments(**run1_config)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

trainer.train()


In [None]:
from transformers import EarlyStoppingCallback

# Run 2: three epochs at lr 1e-5 with early stopping (patience 1).
# NOTE: `model` is the same object handed to the Trainer in the earlier training
# cell, so if that cell was run this continues from its weights rather than
# from the pretrained checkpoint.
run2_config = dict(
    output_dir="./finbert_finetuned",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    load_best_model_at_end=True,
)
training_args = TrainingArguments(**run2_config)

early_stopping = EarlyStoppingCallback(early_stopping_patience=1)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    callbacks=[early_stopping],
)

trainer.train()


In [None]:
pip install evaluate


In [None]:
import numpy as np
import evaluate
from transformers import EarlyStoppingCallback

f1_metric = evaluate.load("f1")


def compute_metrics(pred):
    """Return the weighted F1 of a Trainer evaluation pass.

    ``pred`` exposes ``predictions`` (per-class scores, arg-maxed over axis 1
    here) and ``label_ids`` (reference class ids). The result is a dict with
    the single key ``"f1_score"``.
    """
    predicted_ids = np.argmax(pred.predictions, axis=1)
    result = f1_metric.compute(
        predictions=predicted_ids,
        references=pred.label_ids,
        average='weighted',
    )
    return {"f1_score": result['f1']}

# Run 3: up to ten epochs at lr 8e-6, batch size 32, weighted F1 reported each
# epoch, early stopping with patience 1.
# NOTE: `model` is the same object handed to the Trainer in the earlier training
# cells, so if those cells were run this continues from their weights rather
# than from the pretrained checkpoint.
run3_config = dict(
    output_dir="./finbert_finetuned",
    evaluation_strategy="epoch",
    learning_rate=8e-6,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",
    load_best_model_at_end=True,
)
training_args = TrainingArguments(**run3_config)

early_stopping = EarlyStoppingCallback(early_stopping_patience=1)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

trainer.train()