<a href="https://colab.research.google.com/github/venkat-nallapu09/Credict_risk_assesment_by_using_statistics_-_probability/blob/main/Credict_risk_assesment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Building a Probability-Based Credit Risk Model for the German Credit Dataset**

# **Step 1: Data Preparation**

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Read the numeric German credit file; per the notebook's own description it
# holds 24 feature columns followed by the target in the last column.
data = np.loadtxt('german.data-numeric')
X = data[:, :-1]
# Re-encode the target as a binary flag: 1 (good) -> 0, anything else (2 = bad) -> 1
y = np.where(data[:, -1] != 1, 1, 0)

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.3
)

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# **Step 2: Model Training (Logistic Regression)**

In [None]:
# NOTE(review): `model` is only created in the training cell that appears
# further down this notebook, so on a fresh kernel (Restart & Run All) this
# cell raises NameError; it has to run after that cell.
# The 1-based feature index is derived from the fitted coefficients rather than
# a hard-coded 24, so the cell keeps working if the number of features changes.
feature_importance = pd.Series(
    model.coef_[0],
    index=range(1, model.coef_.shape[1] + 1),
)
# Rank features by the absolute size of their coefficient (largest first)
feature_importance.abs().sort_values(ascending=False)

Unnamed: 0,0
1,0.680828
3,0.44369
10,0.329386
16,0.316904
17,0.314009
21,0.294645
15,0.285113
2,0.272552
11,0.207359
6,0.205582


In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression on the training split; 'balanced' class weights
# are requested to compensate for the class imbalance.
model = LogisticRegression(
    random_state=42,
    class_weight='balanced',
    solver='liblinear',
).fit(X_train, y_train)
# Leave the fitted estimator as the cell's last expression so it is displayed
model

# **Step 3: Cost-Optimal Threshold Calculation**

In [None]:
from sklearn.metrics import confusion_matrix

# Predicted probabilities for the training data; these are what the threshold
# sweep in this cell is scored on.
# Column index 1 is treated as P(bad) — presumably the column for class label 1;
# verify against model.classes_.
train_probs = model.predict_proba(X_train)[:, 1]  # P(bad)

# Define cost function
def calculate_cost(y_true, y_pred, fp_cost=1, fn_cost=5):
    """Return the total misclassification cost of binary predictions.

    Labels are expected to be 0 (good) and 1 (bad). A false positive is a
    good case predicted as bad; a false negative is a bad case predicted as
    good.

    Args:
        y_true: Array-like of actual 0/1 labels.
        y_pred: Array-like of predicted 0/1 labels, same shape as ``y_true``.
        fp_cost: Cost charged per false positive (default 1).
        fn_cost: Cost charged per false negative (default 5).

    Returns:
        ``fp * fp_cost + fn * fn_cost``.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` have different shapes.
    """
    actual = np.asarray(y_true)
    predicted = np.asarray(y_pred)
    if actual.shape != predicted.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {actual.shape} and {predicted.shape}"
        )

    # Count the two error types directly. This stays well defined when only one
    # class is present (e.g. a threshold that predicts the same class for every
    # sample), where unpacking a confusion matrix into four values can fail.
    fp = np.sum((actual == 0) & (predicted == 1))
    fn = np.sum((actual == 1) & (predicted == 0))
    return fp * fp_cost + fn * fn_cost

# Sweep 101 evenly spaced candidate cut-offs between 0 and 1 and score each
# one by its total cost on the training data.
thresholds = np.linspace(0, 1, 101)
costs = [
    calculate_cost(y_train, (train_probs >= thresh).astype(int))
    for thresh in thresholds
]

# Keep the cheapest cut-off (np.argmin picks the first one when several tie)
optimal_threshold = thresholds[np.argmin(costs)]
print(f"Optimal threshold: {optimal_threshold:.2f}")  # Typically ~0.3-0.4

Optimal threshold: 0.39


# **Step 4: Model Evaluation**

In [None]:
# Score the held-out set and apply the cut-off selected on the training data
test_probs = model.predict_proba(X_test)[:, 1]
test_preds = (test_probs >= optimal_threshold).astype(int)

# Total misclassification cost on the test set
test_cost = calculate_cost(y_test, test_preds)
print(f"Test set cost: {test_cost}")

# Key metrics
# labels=[0, 1] keeps the matrix 2x2 even if a class is missing from the split,
# so the four-way unpack below cannot fail.
tn, fp, fn, tp = confusion_matrix(y_test, test_preds, labels=[0, 1]).ravel()
# Rows are the actual class and columns the predicted class: actual-bad cases
# are tp (caught) and fn (missed); actual-good cases are fp (false alarm) and tn.
print(f"""
Confusion Matrix:
|  Bad | Good |  <- Predicted
| {tp:4d} | {fn:4d} |  (Actual Bad)
| {fp:4d} | {tn:4d} |  (Actual Good)
""")
# FNR = share of actual-bad cases predicted good; FPR = share of actual-good cases predicted bad
print(f"False Negative Rate: {fn/(fn+tp):.2%}")
print(f"False Positive Rate: {fp/(fp+tn):.2%}")

Test set cost: 149

Confusion Matrix:
|   76 |   74 |  (Actual Bad)
|   15 |  135 |  (Actual Good)

False Negative Rate: 16.48%
False Positive Rate: 35.41%


# **Step 5: Risk Tiers Based on Probabilities**

In [None]:
# Create risk categories
def risk_tier(prob, low_cutoff=0.2, high_cutoff=0.5):
    """Map a predicted probability of default to a named risk tier.

    Args:
        prob: Predicted probability of the "bad" class.
        low_cutoff: Probabilities strictly below this are "Low" (default 0.2).
        high_cutoff: Probabilities from ``low_cutoff`` up to, but excluding,
            this value are "Medium"; everything else is "High" (default 0.5).

    Returns:
        One of "Low", "Medium" or "High".

    Raises:
        ValueError: If ``low_cutoff`` is greater than ``high_cutoff``.
    """
    if low_cutoff > high_cutoff:
        raise ValueError("low_cutoff must not exceed high_cutoff")

    if prob < low_cutoff:
        return "Low"
    if prob < high_cutoff:
        return "Medium"
    # Anything not below high_cutoff lands here; that includes NaN, because
    # both comparisons above evaluate to False for it.
    return "High"

# Assign a tier to every test-set probability, preserving the order of test_probs
risk_tiers = list(map(risk_tier, test_probs))

In [None]:
# Show only a short preview: printing one line per test sample floods the
# notebook output without adding information.
PREVIEW_ROWS = 10
for index, tier in enumerate(risk_tiers[:PREVIEW_ROWS]):
    print(f"Index: {index}, Risk Tier: {tier}")
if len(risk_tiers) > PREVIEW_ROWS:
    print(f"... ({len(risk_tiers) - PREVIEW_ROWS} more rows not shown)")

# Summarize how many test samples fall into each tier (displayed as the cell result)
pd.Series(risk_tiers, name="Risk Tier").value_counts()

Index: 0, Risk Tier: High
Index: 1, Risk Tier: High
Index: 2, Risk Tier: High
Index: 3, Risk Tier: Medium
Index: 4, Risk Tier: Low
Index: 5, Risk Tier: High
Index: 6, Risk Tier: Low
Index: 7, Risk Tier: Medium
Index: 8, Risk Tier: Low
Index: 9, Risk Tier: Low
Index: 10, Risk Tier: Medium
Index: 11, Risk Tier: High
Index: 12, Risk Tier: Medium
Index: 13, Risk Tier: High
Index: 14, Risk Tier: High
Index: 15, Risk Tier: High
Index: 16, Risk Tier: Low
Index: 17, Risk Tier: Low
Index: 18, Risk Tier: High
Index: 19, Risk Tier: High
Index: 20, Risk Tier: Low
Index: 21, Risk Tier: High
Index: 22, Risk Tier: Low
Index: 23, Risk Tier: Low
Index: 24, Risk Tier: Low
Index: 25, Risk Tier: High
Index: 26, Risk Tier: High
Index: 27, Risk Tier: High
Index: 28, Risk Tier: Low
Index: 29, Risk Tier: Medium
Index: 30, Risk Tier: Medium
Index: 31, Risk Tier: Medium
Index: 32, Risk Tier: Medium
Index: 33, Risk Tier: Medium
Index: 34, Risk Tier: Medium
Index: 35, Risk Tier: Medium
Index: 36, Risk Tier: High
