In [None]:
import os
# Print the kernel's current working directory.
print(os.getcwd())


In [None]:
import os
import pandas as pd

# Folder that holds one sub-folder per quarter (each containing a num.txt).
# The FIN_RISK_DATA_PATH environment variable overrides the location so the
# notebook can run on another machine; the original absolute path is kept as
# the default, so existing setups behave exactly as before.
DATA_PATH = os.environ.get(
    "FIN_RISK_DATA_PATH",
    r"D:\Resume\KPMG\Automated Financial Risk Scoring Platform (Machine Learning + LLM)",
)

# Tags kept from num.txt; everything else is filtered out at load time.
REQUIRED_TAGS = [
    # Income Statement
    "Revenues",
    "NetIncomeLoss",
    "OperatingIncomeLoss",
    "InterestExpense",

    # Balance Sheet
    "Assets",
    "Liabilities",
    "StockholdersEquity",
    "AssetsCurrent",
    "LiabilitiesCurrent",
    "CashAndCashEquivalentsAtCarryingValue",
    "LongTermDebtNoncurrent",

    # Cash Flow
    "NetCashProvidedByUsedInOperatingActivities",
    "PaymentsToAcquirePropertyPlantAndEquipment",
]

def load_all_quarters(data_path=None, required_tags=None):
    """Load and filter the ``num.txt`` file of every quarter folder.

    Parameters
    ----------
    data_path : str, optional
        Folder containing one sub-folder per quarter. Defaults to the
        module-level ``DATA_PATH``.
    required_tags : iterable of str, optional
        Tags to keep. Defaults to the module-level ``REQUIRED_TAGS``.

    Returns
    -------
    pandas.DataFrame
        Columns ``adsh, tag, ddate, qtrs, value`` for the rows whose tag is
        required, whose ``uom`` is ``"USD"`` and whose ``qtrs`` is 0 or 4,
        concatenated over all quarters with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If no sub-folder of ``data_path`` contains a ``num.txt`` file.
    """
    if data_path is None:
        data_path = DATA_PATH
    if required_tags is None:
        required_tags = REQUIRED_TAGS
    wanted_tags = list(required_tags)

    keep_cols = ["adsh", "tag", "ddate", "qtrs", "value"]
    all_data = []

    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the result (and any later "first"-style aggregation over it)
    # identical on every run.
    for quarter_folder in sorted(os.listdir(data_path)):
        quarter_path = os.path.join(data_path, quarter_folder, "num.txt")
        if not os.path.isfile(quarter_path):
            continue

        print(f"Loading {quarter_folder}...")

        # Only the columns used below are parsed, which keeps memory down.
        df = pd.read_csv(
            quarter_path,
            sep="\t",
            low_memory=False,
            usecols=keep_cols + ["uom"],
        )

        # Required tag, USD values, and qtrs of 0 or 4.
        mask = (
            df["tag"].isin(wanted_tags)
            & (df["uom"] == "USD")
            & df["qtrs"].isin([0, 4])
        )
        all_data.append(df.loc[mask, keep_cols])

    if not all_data:
        # pd.concat([]) would raise a ValueError with an unhelpful message;
        # keep the exception type but say what is actually wrong.
        raise ValueError(f"No quarter folder with a num.txt found under {data_path!r}")

    return pd.concat(all_data, ignore_index=True)


# Run the loader, preview the first rows and report the total row count.
# NOTE(review): inside a notebook kernel __name__ is normally "__main__", so
# this guard does not stop the load from running here — confirm intent.
if __name__ == "__main__":
    df = load_all_quarters()
    print(df.head())
    print("Total rows:", len(df))


In [None]:
import os
import pandas as pd

# REQUIRED_TAGS and load_all_quarters() are already defined in the cell above,
# and the function performs exactly this load/filter/concat loop. Reusing it
# avoids keeping a second copy of the tag list and the loop in sync by hand.
final_df = load_all_quarters()

print("Final shape:", final_df.shape)


### Pivot to Structured Company-Level Data

In [None]:
# Long -> wide: one row per (adsh, ddate), one column per tag.
# When a tag occurs more than once for the same key, the first value is kept.
structured_df = (
    final_df
    .pivot_table(
        values="value",
        index=["adsh", "ddate"],
        columns="tag",
        aggfunc="first",
    )
    .reset_index()
)

print("Structured shape:", structured_df.shape)
structured_df.head()

In [None]:
# Missing-value count per column of the pivoted table.
structured_df.isnull().sum()

In [None]:
# Row-count funnel: how many rows survive as each required tag is enforced.
print("Total rows:", len(structured_df))

step1 = structured_df.dropna(subset=["Assets", "Liabilities"])
print("After keeping Assets & Liabilities:", len(step1))

step2 = step1.dropna(subset=["Revenues"])
print("After keeping Revenues:", len(step2))

step3 = step2.dropna(subset=["NetIncomeLoss"])
print("After keeping NetIncomeLoss:", len(step3))

step4 = step3.dropna(subset=["StockholdersEquity"])
print("After keeping Equity:", len(step4))


### Create Clean Modeling Dataset

In [None]:
# Keep only the rows where all five core tags are present.
clean_df = structured_df.loc[
    structured_df[
        ["Assets", "Liabilities", "Revenues", "NetIncomeLoss", "StockholdersEquity"]
    ].notna().all(axis=1)
]

print("Final clean dataset shape:", clean_df.shape)

clean_df.head()


### Financial Ratio Engineering

In [None]:
import numpy as np

# Build the feature table from the clean rows, adding every ratio in one pass
# (clean_df itself is left untouched).
feature_df = clean_df.assign(
    # Liquidity
    current_ratio=clean_df["AssetsCurrent"] / clean_df["LiabilitiesCurrent"],
    cash_ratio=(
        clean_df["CashAndCashEquivalentsAtCarryingValue"] / clean_df["LiabilitiesCurrent"]
    ),
    # Leverage
    debt_to_equity=clean_df["Liabilities"] / clean_df["StockholdersEquity"],
    debt_to_assets=clean_df["Liabilities"] / clean_df["Assets"],
    # Profitability
    net_profit_margin=clean_df["NetIncomeLoss"] / clean_df["Revenues"],
    return_on_assets=clean_df["NetIncomeLoss"] / clean_df["Assets"],
    return_on_equity=clean_df["NetIncomeLoss"] / clean_df["StockholdersEquity"],
    # Cash flow
    free_cash_flow=(
        clean_df["NetCashProvidedByUsedInOperatingActivities"]
        - clean_df["PaymentsToAcquirePropertyPlantAndEquipment"]
    ),
    operating_cf_ratio=(
        clean_df["NetCashProvidedByUsedInOperatingActivities"] / clean_df["Liabilities"]
    ),
)

# A zero denominator produces +/-inf; treat those cells as missing.
feature_df = feature_df.replace([np.inf, -np.inf], np.nan)

feature_df.head()


In [None]:
# Missing-value count per column after ratio engineering.
feature_df.isnull().sum()

### Handle Missing Values

In [None]:
# ---------------------------------------
# Median imputation for float64 / int64 columns
# ---------------------------------------

numeric_cols = feature_df.select_dtypes(include=["float64", "int64"]).columns

# Each column's median is independent of the others, so compute them up front.
column_medians = feature_df[numeric_cols].median()

for col in numeric_cols:
    feature_df[col] = feature_df[col].fillna(column_medians[col])

print("Remaining missing values:")
print(feature_df.isnull().sum().sum())


In [None]:
# (rows, columns) of the feature table after imputation.
feature_df.shape

In [None]:
# Total number of missing cells left in the feature table.
feature_df.isnull().sum().sum()

### Create Composite Risk Score

In [None]:
from sklearn.preprocessing import StandardScaler

# Ratios that enter the composite score.
risk_features = [
    "debt_to_equity",
    "debt_to_assets",
    "current_ratio",
    "cash_ratio",
    "net_profit_margin",
    "return_on_assets",
    "return_on_equity",
    "free_cash_flow",
    "operating_cf_ratio"
]

scaler = StandardScaler()
scaled_values = scaler.fit_transform(feature_df[risk_features])

# feature_df still carries the row labels of the dropna'd clean_df, which have
# gaps. The scaled frame must reuse that index: with a default 0..n-1 index the
# sum below would be aligned to the wrong rows (or to NaN) when it is assigned
# back into feature_df.
scaled_df = pd.DataFrame(
    scaled_values,
    columns=risk_features,
    index=feature_df.index
)

# Leverage raises the score; liquidity, profitability and cash flow lower it.
feature_df["risk_score"] = (
    scaled_df["debt_to_equity"] +
    scaled_df["debt_to_assets"] -
    scaled_df["current_ratio"] -
    scaled_df["cash_ratio"] -
    scaled_df["net_profit_margin"] -
    scaled_df["return_on_assets"] -
    scaled_df["return_on_equity"] -
    scaled_df["free_cash_flow"] -
    scaled_df["operating_cf_ratio"]
)

feature_df["risk_score"].describe()


In [None]:
# (rows, columns) after adding risk_score.
feature_df.shape


In [None]:
from sklearn.preprocessing import StandardScaler

# Ratios that enter the composite score.
risk_features = [
    "debt_to_equity",
    "debt_to_assets",
    "current_ratio",
    "cash_ratio",
    "net_profit_margin",
    "return_on_assets",
    "return_on_equity",
    "free_cash_flow",
    "operating_cf_ratio"
]

scaler = StandardScaler()
scaled_values = scaler.fit_transform(feature_df[risk_features])

# Reuse feature_df's own row labels. They come from the dropna'd clean_df and
# are not 0..n-1, so a default index here would misalign the score (or leave
# NaN) when it is assigned back into feature_df.
scaled_df = pd.DataFrame(
    scaled_values,
    columns=risk_features,
    index=feature_df.index
)

# Leverage raises the score; liquidity, profitability and cash flow lower it.
feature_df["risk_score"] = (
    scaled_df["debt_to_equity"] +
    scaled_df["debt_to_assets"] -
    scaled_df["current_ratio"] -
    scaled_df["cash_ratio"] -
    scaled_df["net_profit_margin"] -
    scaled_df["return_on_assets"] -
    scaled_df["return_on_equity"] -
    scaled_df["free_cash_flow"] -
    scaled_df["operating_cf_ratio"]
)

feature_df["risk_score"].describe()


In [None]:
# (rows, columns) of the feature table at this point.
feature_df.shape

In [None]:
from sklearn.preprocessing import StandardScaler

# Score inputs: the first two entries raise risk, every later entry lowers it.
risk_features = [
    "debt_to_equity", "debt_to_assets",
    "current_ratio", "cash_ratio",
    "net_profit_margin", "return_on_assets", "return_on_equity",
    "free_cash_flow", "operating_cf_ratio",
]

scaler = StandardScaler()
scaled_values = scaler.fit_transform(feature_df[risk_features])

# Same row labels as feature_df so the assignment below lines up row-for-row.
scaled_df = pd.DataFrame(scaled_values, index=feature_df.index, columns=risk_features)

# Add the leverage terms, then subtract the remaining terms in list order.
composite = scaled_df["debt_to_equity"] + scaled_df["debt_to_assets"]
for protective_feature in risk_features[2:]:
    composite = composite - scaled_df[protective_feature]
feature_df["risk_score"] = composite

feature_df["risk_score"].describe()


### Create Risk Categories (Stable Method)

In [None]:
# 33rd and 66th percentiles of the score are the category cut-offs.
low_thresh, high_thresh = feature_df["risk_score"].quantile([0.33, 0.66])

print("Low threshold:", low_thresh)
print("High threshold:", high_thresh)


### Assign Risk Categories

In [None]:
def assign_risk(x, low=None, high=None):
    """Map a risk score to a category: 0 (low), 1 (medium) or 2 (high).

    Parameters
    ----------
    x : float
        Risk score of one row.
    low, high : float, optional
        Upper bounds (inclusive) of the low and medium bands. When omitted,
        the module-level ``low_thresh`` / ``high_thresh`` are looked up at
        call time, so ``Series.apply(assign_risk)`` keeps working unchanged.

    Returns
    -------
    int or float
        0, 1 or 2; NaN when ``x`` is missing.
    """
    if low is None:
        low = low_thresh
    if high is None:
        high = high_thresh

    if pd.isna(x):
        # Every comparison with NaN is False, so a missing score used to fall
        # through to "High Risk". A missing score is not evidence of risk.
        return float("nan")

    if x <= low:
        return 0   # Low Risk
    if x <= high:
        return 1   # Medium Risk
    return 2       # High Risk

# Label every row from its score, then show how many rows fall in each class.
feature_df["risk_category"] = feature_df["risk_score"].map(assign_risk)

feature_df["risk_category"].value_counts()


In [None]:
# Preview the first rows, now including risk_score and risk_category.
feature_df.head()

In [None]:
# Missing-value count per column before saving.
feature_df.isnull().sum()

## saving CSV file

In [None]:
# Write the feature table to CSV without the row index.
# NOTE(review): absolute, machine-specific path — confirm it exists where this runs.
feature_df.to_csv(
    r"D:\Resume\KPMG\Automated Financial Risk Scoring Platform (Machine Learning + LLM)\final_engineered_financial_dataset.csv",
    index=False
)


In [None]:
import os

# Check that the CSV path written above exists on disk (displays True/False).
os.path.exists(
    r"D:\Resume\KPMG\Automated Financial Risk Scoring Platform (Machine Learning + LLM)\final_engineered_financial_dataset.csv"
)


### EDA

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the number of rows per risk category.
fig, ax = plt.subplots()
feature_df["risk_category"].value_counts().plot(kind="bar", ax=ax)
ax.set_title("Risk Category Distribution")
ax.set_xlabel("Risk Category")
ax.set_ylabel("Count")
plt.show()


### Risk Score Distribution (Understand Spread)

In [None]:
# 50-bin histogram of the composite risk score.
fig, ax = plt.subplots()
feature_df["risk_score"].hist(bins=50, ax=ax)
ax.set_title("Distribution of Risk Score")
ax.set_xlabel("Risk Score")
ax.set_ylabel("Frequency")
plt.show()


### Boxplot of Risk Score (Outlier Detection)

In [None]:
# Box plot of the composite risk score.
fig, ax = plt.subplots()
ax.boxplot(feature_df["risk_score"])
ax.set_title("Risk Score Boxplot")
plt.show()


### Feature Distribution (Example: Debt to Equity)

In [None]:
# 50-bin histogram of debt_to_equity as currently stored in feature_df.
fig, ax = plt.subplots()
feature_df["debt_to_equity"].hist(bins=50, ax=ax)
ax.set_title("Debt to Equity Distribution")
ax.set_xlabel("Debt to Equity")
ax.set_ylabel("Frequency")
plt.show()


### Professional Fix — Cap Extreme Ratios (Winsorization)

In [None]:
# Winsorize the two leverage ratios: clip each to its own 1st-99th percentile range.

for col in ("debt_to_equity", "debt_to_assets"):
    lower, upper = feature_df[col].quantile([0.01, 0.99])
    feature_df[col] = feature_df[col].clip(lower=lower, upper=upper)

feature_df["debt_to_equity"].describe()


### Professional Fix — Log Transform (Stable Version)

In [None]:
import numpy as np

# Signed log transform: sign(x) * log(1 + |x|), which keeps the sign of
# negative ratios while compressing large magnitudes.
# The column is overwritten, so re-running this cell transforms it again.
feature_df["debt_to_equity"] = feature_df["debt_to_equity"].pipe(
    lambda ratio: np.sign(ratio) * np.log1p(ratio.abs())
)

feature_df["debt_to_equity"].describe()


Important: We Must Recompute Risk Score

Because we changed debt_to_equity, the previous risk_score is no longer valid.

So now we recompute it.

In [None]:
from sklearn.preprocessing import StandardScaler

# Score inputs: the first two entries raise risk, every later entry lowers it.
risk_features = [
    "debt_to_equity", "debt_to_assets",
    "current_ratio", "cash_ratio",
    "net_profit_margin", "return_on_assets", "return_on_equity",
    "free_cash_flow", "operating_cf_ratio",
]

# Re-standardize, because debt_to_equity / debt_to_assets changed above.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(feature_df[risk_features])

# Same row labels as feature_df so the assignment below lines up row-for-row.
scaled_df = pd.DataFrame(scaled_values, index=feature_df.index, columns=risk_features)

# Add the leverage terms, then subtract the remaining terms in list order.
composite = scaled_df["debt_to_equity"] + scaled_df["debt_to_assets"]
for protective_feature in risk_features[2:]:
    composite = composite - scaled_df[protective_feature]
feature_df["risk_score"] = composite

feature_df["risk_score"].describe()


### Recreate Risk Categories

In [None]:
# Cut-offs from the recomputed score: its 33rd and 66th percentiles.
low_thresh, high_thresh = feature_df["risk_score"].quantile([0.33, 0.66])

print("Low threshold:", low_thresh)
print("High threshold:", high_thresh)


### Final Step — Assign Risk Categories Again

In [None]:
# assign_risk is already defined in the first "Assign Risk Categories" cell and
# looks up low_thresh / high_thresh when it is called, so re-applying it here
# uses the thresholds recomputed just above. Redefining an identical function
# in every cell only invites the copies drifting apart.
feature_df["risk_category"] = feature_df["risk_score"].apply(assign_risk)

feature_df["risk_category"].value_counts()


In [None]:
# Histogram of debt_to_equity after capping and the signed log transform.
fig, ax = plt.subplots()
feature_df["debt_to_equity"].hist(bins=50, ax=ax)
ax.set_title("Debt to Equity Distribution")
ax.set_xlabel("Debt to Equity")
ax.set_ylabel("Frequency")
plt.show()


### Final Sanity Check

In [None]:
# Mean of three key ratios within each risk category.
(
    feature_df
    .groupby("risk_category")[["debt_to_equity", "current_ratio", "return_on_assets"]]
    .agg("mean")
)


### Professional Fix — Log Transform Current Ratio

In [None]:
import numpy as np

# log(1 + x) on both liquidity ratios.
# The columns are overwritten, so re-running this cell transforms them again.
feature_df[["current_ratio", "cash_ratio"]] = np.log1p(
    feature_df[["current_ratio", "cash_ratio"]]
)

feature_df[["current_ratio", "cash_ratio"]].describe()


### Fill Any New NaNs

In [None]:
# The log1p step above yields NaN for inputs below -1 and -inf for exactly -1.
# fillna alone leaves the -inf in place, and StandardScaler then rejects the
# column, so infinities are converted to NaN before the median fill.
for ratio_col in ["current_ratio", "cash_ratio"]:
    finite_or_nan = feature_df[ratio_col].replace([np.inf, -np.inf], np.nan)
    feature_df[ratio_col] = finite_or_nan.fillna(finite_or_nan.median())

feature_df[["current_ratio", "cash_ratio"]].isnull().sum()


### Recompute Risk Score (Final Version)

In [None]:
from sklearn.preprocessing import StandardScaler

# Score inputs: the first two entries raise risk, every later entry lowers it.
risk_features = [
    "debt_to_equity", "debt_to_assets",
    "current_ratio", "cash_ratio",
    "net_profit_margin", "return_on_assets", "return_on_equity",
    "free_cash_flow", "operating_cf_ratio",
]

# Re-standardize, because current_ratio / cash_ratio changed above.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(feature_df[risk_features])

# Same row labels as feature_df so the assignment below lines up row-for-row.
scaled_df = pd.DataFrame(scaled_values, index=feature_df.index, columns=risk_features)

# Add the leverage terms, then subtract the remaining terms in list order.
composite = scaled_df["debt_to_equity"] + scaled_df["debt_to_assets"]
for protective_feature in risk_features[2:]:
    composite = composite - scaled_df[protective_feature]
feature_df["risk_score"] = composite

feature_df["risk_score"].describe()


### Now Recreate Risk Categories (Final Clean Version)

In [None]:
# Final cut-offs: 33rd and 66th percentiles of the final score.
low_thresh, high_thresh = feature_df["risk_score"].quantile([0.33, 0.66])

print("Low threshold:", low_thresh)
print("High threshold:", high_thresh)


### Final Label Assignment (Clean Version)

In [None]:
# assign_risk is already defined in the first "Assign Risk Categories" cell and
# looks up low_thresh / high_thresh when it is called, so re-applying it here
# uses the final thresholds computed just above. A third identical definition
# is not needed.
feature_df["risk_category"] = feature_df["risk_score"].apply(assign_risk)

feature_df["risk_category"].value_counts()


### Correlation Heatmap (Very Important)

In [None]:
import seaborn as sns

# Pairwise correlation of the nine score inputs. risk_features (defined with
# the risk score) lists exactly these columns in this order.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(feature_df[risk_features].corr(), annot=True, ax=ax)
ax.set_title("Feature Correlation Matrix")
plt.show()


### Feature Behavior Across Risk Categories

In [None]:
# DataFrame.boxplot(by=...) creates its own figure, so a preceding
# plt.figure() only leaves a stray empty figure in the output. It is omitted.
feature_df.boxplot(column="debt_to_equity", by="risk_category")
plt.title("Debt to Equity by Risk Category")
plt.suptitle("")  # drop the automatic "Boxplot grouped by ..." super-title
plt.show()


### what we changed

### save new CSV file

In [None]:
# Write the transformed feature table to the v2 CSV without the row index.
# NOTE(review): absolute, machine-specific path — confirm it exists where this runs.
feature_df.to_csv(
    r"D:\Resume\KPMG\Automated Financial Risk Scoring Platform (Machine Learning + LLM)\engineered_financial_dataset_v2.csv",
    index=False
)


In [None]:
import os

# Check that the v2 CSV path written above exists on disk (displays True/False).
os.path.exists(
    r"D:\Resume\KPMG\Automated Financial Risk Scoring Platform (Machine Learning + LLM)\engineered_financial_dataset_v2.csv"
)


## importing the data set

In [None]:
import pandas as pd

# Reload the v2 dataset from disk; this rebinds feature_df to the CSV contents.
feature_df = pd.read_csv(
    r"D:\Resume\KPMG\Automated Financial Risk Scoring Platform (Machine Learning + LLM)\engineered_financial_dataset_v2.csv"
)

print("Dataset loaded.")
print("Shape:", feature_df.shape)
feature_df.head()


In [None]:
# List the column names of the reloaded dataset.
print(feature_df.columns)


## modeling phase.

### Prepare X and y

In [None]:
# Model inputs: the nine engineered ratios. The target is risk_category.
model_features = [
    "debt_to_equity", "debt_to_assets",
    "current_ratio", "cash_ratio",
    "net_profit_margin", "return_on_assets", "return_on_equity",
    "free_cash_flow", "operating_cf_ratio",
]

X = feature_df[model_features]
y = feature_df["risk_category"]

for label, frame in (("X shape:", X), ("y shape:", y)):
    print(label, frame.shape)


In [None]:
# Number of rows per target class.
print(y.value_counts())

### Train/Test Split

In [None]:
from sklearn.model_selection import train_test_split

# 80/20 split, stratified on the target and seeded for repeatability.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("X_train:", X_train.shape)
print("X_test:", X_test.shape)
print("y_train distribution:\n", y_train.value_counts())
print("y_test distribution:\n", y_test.value_counts())


### Train XGBoost Classifier

In [None]:
from xgboost import XGBClassifier

# Hyper-parameters of the three-class gradient-boosted classifier.
xgb_params = {
    "objective": "multi:softprob",
    "num_class": 3,
    "n_estimators": 300,
    "max_depth": 4,
    "learning_rate": 0.05,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 42,
    "eval_metric": "mlogloss",
}

model = XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

print("Model training completed.")


### Model Evaluation

In [None]:
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.preprocessing import label_binarize

# Hard class predictions and per-class probabilities on the hold-out set
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)

# Overall accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class precision / recall / F1
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

# One-vs-rest ROC-AUC on the binarized labels
y_test_bin = label_binarize(y_test, classes=[0, 1, 2])
roc_auc = roc_auc_score(y_test_bin, y_pred_prob, multi_class="ovr")
print("\nROC-AUC:", roc_auc)


### Check Training Accuracy

In [None]:
# Accuracy on the training rows, printed next to the earlier test accuracy
# to gauge overfitting.
y_train_pred = model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)

for label, score in (("Training Accuracy:", train_accuracy), ("Test Accuracy:", accuracy)):
    print(label, score)


### Let's Check More Properly

Accuracy alone is not enough.

In [None]:
from sklearn.metrics import classification_report

# Per-class metrics on the training rows and on the hold-out rows.
train_report = classification_report(y_train, y_train_pred)
test_report = classification_report(y_test, y_pred)

print("Train Classification Report:\n")
print(train_report)

print("\nTest Classification Report:\n")
print(test_report)


### Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Counts of actual (rows) versus predicted (columns) classes on the test set.
cm = confusion_matrix(y_test, y_pred)
class_ticks = [0, 1, 2]

fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_ticks,
    yticklabels=class_ticks,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()


### Crosstab (Actual vs Predicted)

In [None]:
import pandas as pd

# Actual-by-predicted count table for the test set.
crosstab = pd.crosstab(
    index=y_test,
    columns=y_pred,
    rownames=["Actual"],
    colnames=["Predicted"],
)

print(crosstab)


### Crosstab with Percentages (Even Better)

In [None]:
# Same table with each row normalized to sum to 1.
crosstab_percent = pd.crosstab(index=y_test, columns=y_pred, normalize="index")

print(crosstab_percent)


### Cross-Validation (More Serious Overfitting Check)

In [None]:
from sklearn.model_selection import cross_val_score
import numpy as np

# 5-fold cross-validated accuracy of the same estimator on the full X / y.
cv_scores = cross_val_score(model, X, y, scoring="accuracy", cv=5)

print("Cross-validation accuracy scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())
print("Std CV accuracy:", cv_scores.std())


## model explainability

### XGBoost Feature Importance

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Importance scores exposed by the fitted model, paired with feature names
importance = model.feature_importances_

importance_df = (
    pd.DataFrame({"Feature": model_features, "Importance": importance})
    .sort_values(by="Importance", ascending=False)
)

print(importance_df)

# Horizontal bars, most important feature at the top
fig, ax = plt.subplots()
ax.barh(importance_df["Feature"], importance_df["Importance"])
ax.invert_yaxis()
ax.set_title("Feature Importance (XGBoost)")
plt.show()


## SHAP explainability

this is what makes your project enterprise-grade.

### Create Explainer

In [None]:
import shap
# Reached only if the import on the line above succeeded.
print("SHAP imported successfully")

In [None]:
import shap

# Create TreeExplainer for XGBoost model
explainer = shap.TreeExplainer(model)

# Compute SHAP values for test set
# NOTE(review): the container type/shape returned here depends on the installed
# SHAP version; later cells index it as [sample, feature, class] — confirm.
shap_values = explainer.shap_values(X_test)

print("SHAP values computed.")


### Global Summary Plot

This shows which features contribute most to predictions overall.

In [None]:
# Global SHAP summary over the test set.
shap.summary_plot(shap_values, X_test)


### Select One Test Example

In [None]:
# Take the first row of the test split, kept as a one-row DataFrame.
sample_index = X_test.index[0]
sample_data = X_test.loc[[sample_index]]

actual_label = y_test.loc[sample_index]
predicted_label = model.predict(sample_data)[0]
print("Actual Risk:", actual_label)
print("Predicted Risk:", predicted_label)

sample_data


### SHAP Explanation for This Company

In [None]:
# Inspect the container type of the computed SHAP values.
type(shap_values)


In [None]:
import numpy as np
# Inspect the overall shape of the SHAP values once converted to an array.
np.array(shap_values).shape


In [None]:
# Find sample position in test set
sample_position = list(X_test.index).index(sample_index)

# Extract SHAP values for class 1
# NOTE(review): this indexing assumes shap_values is a 3-D array laid out as
# [sample, feature, class]; confirm against the installed SHAP version.
shap_values_class1 = shap_values[sample_position, :, 1]

# Force plot for class 1 only, regardless of which class was predicted.
shap.force_plot(
    explainer.expected_value[1],  # expected value for class 1
    shap_values_class1,
    sample_data,
    matplotlib=True
)


### Extract Top Contributing Features

In [None]:
import numpy as np

def get_top_shap_features(sample_position, class_index=1, top_n=3, values=None, data=None):
    """Return the ``top_n`` features with the largest absolute SHAP value.

    Parameters
    ----------
    sample_position : int
        Positional row of the sample inside ``data``.
    class_index : int, default 1
        Output class whose SHAP values are ranked.
    top_n : int, default 3
        Number of rows to return.
    values : array-like or list, optional
        SHAP values; defaults to the module-level ``shap_values``. Accepted
        layouts are a 3-D array indexed ``[sample, feature, class]`` and a
        list holding one ``[sample, feature]`` array per class.
    data : pandas.DataFrame, optional
        Feature rows the SHAP values were computed for; defaults to the
        module-level ``X_test``.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature, shap_value, feature_value, abs_shap``, sorted by
        ``abs_shap`` in descending order.
    """
    if values is None:
        values = shap_values
    if data is None:
        data = X_test

    if isinstance(values, list):
        # Per-class list layout: pick the class first, then the row.
        shap_vals = np.asarray(values[class_index])[sample_position]
    else:
        shap_vals = np.asarray(values)[sample_position, :, class_index]

    shap_df = pd.DataFrame({
        "feature": data.columns,
        "shap_value": shap_vals,
        "feature_value": data.iloc[sample_position].values
    })

    shap_df["abs_shap"] = np.abs(shap_df["shap_value"])
    shap_df = shap_df.sort_values("abs_shap", ascending=False)

    return shap_df.head(top_n)

# Rank the SHAP values of the class the model actually predicted for this
# sample. The prompt built from top_features says it explains the predicted
# class, so a fixed class_index=1 explains the wrong output whenever the
# prediction is 0 or 2.
top_features = get_top_shap_features(
    sample_position,
    class_index=int(model.predict(sample_data)[0]),
)
top_features


Now we convert this into an LLM-generated explanation.

### Build Structured Prompt for LLM

In [None]:
def build_risk_prompt(predicted_class, top_features):
    """Build the LLM prompt that explains one risk classification.

    ``predicted_class`` is 0, 1 or 2 and is rendered as Low / Medium / High
    Risk. ``top_features`` is a DataFrame with ``feature``, ``feature_value``
    and ``shap_value`` columns; each row becomes one bullet in the prompt.
    Returns the prompt text.
    """
    labels = {0: "Low Risk", 1: "Medium Risk", 2: "High Risk"}

    # One bullet per feature: value to 2 decimals, SHAP impact to 3.
    explanation_points = "".join(
        f"- {name} = {value:.2f} "
        f"(impact: {impact:.3f})\n"
        for name, value, impact in zip(
            top_features["feature"],
            top_features["feature_value"],
            top_features["shap_value"],
        )
    )

    return f"""
You are a financial risk analyst.

The model has classified this company as: {labels[predicted_class]}.

Key contributing financial indicators:

{explanation_points}

Generate a professional business-level explanation describing:
- Why the company falls into this risk category
- What financial weaknesses or strengths are driving this decision
- Keep it concise and suitable for credit committee review.
"""

# Predict the class of the selected sample, build the prompt from it and the
# top SHAP features, and print the prompt for inspection.
predicted_class = model.predict(sample_data)[0]
prompt_text = build_risk_prompt(predicted_class, top_features)

print(prompt_text)


### Call OpenAI API

In [None]:

# Original note: use the code "above" rather than "below", and supply your own API key.
# NOTE(review): which code "above"/"below" refers to is unclear — confirm.
# NOTE(review): OpenAI() is called with no arguments, so the key is presumably
# read from the environment rather than hard-coded here — confirm.
from openai import OpenAI
client = OpenAI()

In [None]:
from openai import OpenAI

# Create the client (no arguments are passed here).
client = OpenAI()

# Send the system role plus the structured prompt built earlier; this is a
# network call to the chat completions endpoint.
response = client.chat.completions.create(
    model="gpt-4o-mini",
    messages=[
        {"role": "system", "content": "You are an expert financial risk analyst."},
        {"role": "user", "content": prompt_text}
    ],
    temperature=0.3
)

# Text of the first returned choice.
llm_explanation = response.choices[0].message.content

print(llm_explanation)


## Saving the model

In [None]:
import os

# Same target path as before. The folder is created first because save_model
# does not create missing directories and would fail on a fresh checkout.
model_path = r"D:\Resume\KPMG\Automated Financial Risk Scoring Platform (Machine Learning + LLM)\financial-risk-platform\models\xgboost_risk_model.json"
os.makedirs(os.path.dirname(model_path), exist_ok=True)

model.save_model(model_path)

print("Model saved successfully.")


## Interview questions and answers