In [5]:
# Importance score per feature, limited to features whose importance exceeds 200.
# Entries are listed from the highest score to the lowest; the dict keeps that
# insertion order, which later determines the order of the prompt bullets.
feature_importances = dict(
    loan_amnt=2001,
    installment=1804,
    emp_length=806,
    annual_inc=637,
    dti=600,
    bc_util=549,
    fico_range_high=491,
    total_rev_hi_lim=485,
    purpose=484,
    total_bc_limit=449,
    acc_open_past_24mths=432,
    mo_sin_old_rev_tl_op=425,
    tot_hi_cred_lim=362,
    mo_sin_old_il_acct=339,
    mths_since_recent_bc=330,
    earliest_cr_line=307,
    pct_tl_nvr_dlq=292,
    percent_bc_gt_75=287,
    bc_open_to_buy=270,
    total_acc=255,
    mort_acc=248,
    total_il_high_credit_limit=227,
    mths_since_recent_inq=223,
)

# Description text for each feature, keyed by the same names (and in the same
# order) as feature_importances. Per the original note, the text was extracted
# from the dataset's dictionary.
# NOTE(review): total_rev_hi_lim carries only a placeholder description —
# confirm whether the dictionary really has no entry for it.
feature_descriptions = dict(
    loan_amnt=(
        "The listed amount of the loan applied for by the borrower. "
        "If at some point in time, the credit department reduces the loan amount, "
        "then it will be reflected in this value."
    ),
    installment="The monthly payment owed by the borrower if the loan originates.",
    emp_length=(
        "Employment length in years. "
        "Possible values are between 0 and 10 where 0 means less than one year "
        "and 10 means ten or more years."
    ),
    annual_inc="The self-reported annual income provided by the borrower during registration.",
    dti=(
        "A ratio calculated using the borrower’s total monthly debt payments "
        "on the total debt obligations, excluding mortgage and the requested LC loan, "
        "divided by the borrower’s self-reported monthly income."
    ),
    bc_util="Ratio of total current balance to high credit/credit limit for all bankcard accounts.",
    fico_range_high="The upper boundary range the borrower’s FICO at loan origination belongs to.",
    total_rev_hi_lim="No description available.",
    purpose="A category provided by the borrower for the loan request.",
    total_bc_limit="Total bankcard high credit/credit limit",
    acc_open_past_24mths="Number of trades opened in past 24 months.",
    mo_sin_old_rev_tl_op="Months since oldest revolving account opened",
    tot_hi_cred_lim="Total high credit/credit limit",
    mo_sin_old_il_acct="Months since oldest bank installment account opened",
    mths_since_recent_bc="Months since most recent bankcard account opened.",
    earliest_cr_line="The month the borrower's earliest reported credit line was opened",
    pct_tl_nvr_dlq="Percent of trades never delinquent",
    percent_bc_gt_75="Percentage of all bankcard accounts > 75% of limit.",
    bc_open_to_buy="Total open to buy on revolving bankcards.",
    total_acc="The total number of credit lines currently in the borrower's credit file",
    mort_acc="Number of mortgage accounts.",
    total_il_high_credit_limit="Total installment high credit/credit limit",
    mths_since_recent_inq="Months since most recent inquiry.",
)

# Render one "- name (Importance = score): description" bullet per feature,
# following the order of feature_importances, and join them with newlines.
# A feature missing from feature_descriptions raises KeyError.
features_prompt = "\n".join(
    f"- {name} (Importance = {score}): {feature_descriptions[name]}"
    for name, score in feature_importances.items()
)

# Final prompt generation function
def generate_final_prompt(features_prompt: str) -> str:
    return f"""
I am building a machine learning model for credit risk prediction. Below is a list of features that have been found to be highly important (importance > 200). Each feature is accompanied by a short description from the dataset documentation:

{features_prompt}

For each feature, please do the following:
1. Explain clearly why this feature may influence a borrower's likelihood of default.
2. Suggest one or more potential mathematical transformations or interactions (e.g., log, squared, ratio, interaction with other variables) that could enhance model performance.
3. Indicate whether the relationship is likely linear, nonlinear, or threshold-based.
4. If useful, mention which other features it might interact well with.
Return your answer in a structured format (e.g., a table or bullet points) for each feature so I can easily implement the suggestions in feature engineering.
"""

# Build the complete prompt text from the per-feature bullet list assembled above.
final_prompt = generate_final_prompt(features_prompt=features_prompt)


In [7]:
import openai
from datetime import datetime
# Create the API client. The key is intentionally NOT written in the notebook:
# when no api_key is passed, the SDK reads it from the OPENAI_API_KEY
# environment variable (and raises if that variable is unset).
client = openai.OpenAI()

response = client.chat.completions.create(
    model="gpt-3.5-turbo",  # Or "gpt-4" if you have access
    messages=[
        {"role": "system", "content": "You are a financial data scientist specializing in feature engineering for credit risk models."},
        {"role": "user", "content": final_prompt}
    ],
    temperature=0.3,
    max_tokens=3000
)

# Extract the reply text. `message.content` may be None, so fail with a clear
# message instead of an AttributeError on `.strip()`.
choice = response.choices[0]
if choice.message.content is None:
    raise RuntimeError(
        f"The model returned no text content (finish_reason={choice.finish_reason!r})."
    )
output_text = choice.message.content.strip()

# finish_reason == "length" means generation stopped at max_tokens, so the
# per-feature explanations are probably cut off part-way through the list.
if choice.finish_reason == "length":
    print("⚠️ The response reached the max_tokens limit and is likely truncated.")

# Save the reply to a timestamped markdown file in the working directory.
# UTF-8 is set explicitly so non-ASCII characters in the reply are written
# correctly regardless of the platform's default encoding.
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
filename = f"feature_explanations_{timestamp}.md"
with open(filename, "w", encoding="utf-8") as f:
    f.write(output_text)

print(f"✅ Explanation and suggestions saved to: {filename}")


✅ Explanation and suggestions saved to: feature_explanations_2025-04-04_15-36-27.md
