In [1]:
import pandas as pd
import numpy as np

# Seed NumPy's global random generator so the random draws made further down
# come out the same on every run
np.random.seed(42)

# Read the raw credit-risk records into a DataFrame
file_path = 'credit_risk_dataset.csv'  # Update this path to your actual file path
df = pd.read_csv(filepath_or_buffer=file_path)

# Function to map existing loan intents to new categories
def map_loan_intent(intent):
    """Pick a replacement category for one original loan intent.

    Every original intent has a fixed list of candidate categories; one
    candidate is drawn with ``np.random.choice``, which uses NumPy's global
    random state.

    Parameters
    ----------
    intent : str
        One of 'PERSONAL', 'VENTURE', 'DEBTCONSOLIDATION', 'MEDICAL',
        'HOMEIMPROVEMENT' or 'EDUCATION'.

    Returns
    -------
    The drawn category name.

    Raises
    ------
    KeyError
        If ``intent`` is not one of the six keys listed above.
    """
    # Candidate lists that are reused by more than one original intent
    women_or_small = ['WOMAN_ENTREPRENEUR', 'SMALL_BUSINESS']
    minority_or_gig = ['MINORITY_BUSINESS', 'FREELANCE_AND_GIG_WORKERS']
    farming_or_other = ['SMALL_FARMERS_AND_AGRICULTURE', 'OTHERS']

    candidates_by_intent = {
        'PERSONAL': women_or_small,
        'VENTURE': minority_or_gig,
        'DEBTCONSOLIDATION': farming_or_other,
        # MEDICAL can land in any category except 'OTHERS'
        'MEDICAL': women_or_small + minority_or_gig + ['SMALL_FARMERS_AND_AGRICULTURE'],
        'HOMEIMPROVEMENT': farming_or_other,
        'EDUCATION': women_or_small,
    }

    candidates = candidates_by_intent[intent]
    return np.random.choice(candidates)

# Destination files for the re-labelled data and for the category tally
output_file_path = 'randomised_LI_credit_risk_dataset_final.csv'  # Update this path to your desired output file path
counts_output_file_path = 'loan_intent_counts.csv'  # Update this path to your desired output file path

# Swap each row's original intent for a randomly drawn replacement category
df['loan_intent'] = df['loan_intent'].apply(map_loan_intent)

# Tally how many rows ended up in each new category
loan_intent_counts = df['loan_intent'].value_counts()

# Write the re-labelled data (row index omitted), then report the tally
df.to_csv(path_or_buf=output_file_path, index=False)
print("Loan intents updated and saved to:", output_file_path)
print("\nCounts of each loan intent category:")
print(loan_intent_counts)

# Persist the tally as well, keeping its header row
loan_intent_counts.to_csv(path_or_buf=counts_output_file_path, header=True)
print("Counts of each loan intent category saved to:", counts_output_file_path)


Loan intents updated and saved to: randomised_LI_credit_risk_dataset_final.csv

Counts of each loan intent category:
loan_intent
SMALL_BUSINESS                   7267
WOMAN_ENTREPRENEUR               7139
SMALL_FARMERS_AND_AGRICULTURE    5608
OTHERS                           4444
MINORITY_BUSINESS                4067
FREELANCE_AND_GIG_WORKERS        4056
Name: count, dtype: int64
Counts of each loan intent category saved to: loan_intent_counts.csv


In [1]:
import pandas as pd

# Read the untouched source data
df = pd.read_csv(filepath_or_buffer='credit_risk_dataset.csv')

# One fixed replacement label per original intent.
# PERSONAL and VENTURE share the same target label, so the result has five
# distinct categories rather than six.
intent_mapping = {
    'PERSONAL': 'WOMAN_ENTREPRENEUR',
    'VENTURE': 'WOMAN_ENTREPRENEUR',
    'DEBTCONSOLIDATION': 'LGBTQ_ENTREPRENEURS',
    'MEDICAL': 'MINORITY_BUSINESS',
    'HOMEIMPROVEMENT': 'VETERAN_OWNED_BUSINESS',
    'EDUCATION': 'GREEN_ENERGY_PROJECTS',
}

# Re-label the column. Series.map turns any intent that is missing from the
# mapping into NaN, and value_counts() leaves NaN out of the printed tally.
df = df.assign(loan_intent=df['loan_intent'].map(intent_mapping))

# Show how many rows fall under each new label
intent_counts = df['loan_intent'].value_counts()
print(intent_counts)

# Write the re-labelled data without the row index
df.to_csv(path_or_buf='credit_risk_dataset_manipulated.csv', index=False)


loan_intent
WOMAN_ENTREPRENEUR        11240
GREEN_ENERGY_PROJECTS      6453
MINORITY_BUSINESS          6071
LGBTQ_ENTREPRENEURS        5212
VETERAN_OWNED_BUSINESS     3605
Name: count, dtype: int64


In [3]:
# business_proposal_analysis with basic correlations
import pandas as pd

# Read the re-labelled dataset from 'credit_risk_dataset_manipulated.csv'
df = pd.read_csv(filepath_or_buffer='credit_risk_dataset_manipulated.csv')

# Define function to categorize business proposal analysis
def categorize_business_plan(row):
    """Rate one loan record as 'High', 'Medium' or 'Low'.

    Parameters
    ----------
    row : mapping indexed by column name
        Must provide 'loan_status', 'loan_int_rate' and 'loan_percent_income'.

    Returns
    -------
    str
        'High'   - loan_status is 0, interest rate is at most 12.5 and
                   loan_percent_income is at most 0.5.
        'Low'    - loan_status is 1 and the interest rate is above 20.
        'Medium' - every other record.
    """
    status = row['loan_status']

    if status == 0:
        # Only status-0 loans with a modest rate and burden can be 'High'
        if row['loan_int_rate'] <= 12.5 and row['loan_percent_income'] <= 0.5:
            return 'High'
        return 'Medium'

    if status == 1 and row['loan_int_rate'] > 20:
        return 'Low'

    return 'Medium'

# Rate every record: axis='columns' hands each row to the classifier in turn
df['business_proposal_analysis'] = df.apply(categorize_business_plan, axis='columns')

# Show how many records received each rating
rating_counts = df['business_proposal_analysis'].value_counts()
print(rating_counts)

# Write the rated data out without the row index
df.to_csv(path_or_buf='credit_risk_dataset_with_analysis_general.csv', index=False)

business_proposal_analysis
High      16595
Medium    15923
Low          63
Name: count, dtype: int64


In [4]:
# business_proposal_analysis with advanced correlations
import pandas as pd

# Read the re-labelled dataset from 'credit_risk_dataset_manipulated.csv'
df = pd.read_csv(filepath_or_buffer='credit_risk_dataset_manipulated.csv')

def categorize_business_plan(row):
    """Rate one loan record as 'High', 'Medium' or 'Low' using five columns.

    Parameters
    ----------
    row : mapping indexed by column name
        Must provide 'loan_status', 'loan_int_rate', 'loan_percent_income',
        'cb_person_default_on_file' and 'cb_person_cred_hist_length'.

    Returns
    -------
    str
        For loan_status 1: 'Medium' when the interest rate is at most 20,
        otherwise 'Low'.
        For loan_status 0: 'High' when the rate is at most 12.5,
        loan_percent_income is at most 0.5, no default is on file ('N') and
        the credit history is longer than 4; 'Medium' when the rate lies in
        (12.5, 20] and loan_percent_income lies in (0.5, 0.9]; otherwise 'Low'.
        Any other loan_status: 'Low'.
    """
    status = row['loan_status']

    if status == 1:
        return 'Medium' if row['loan_int_rate'] <= 20 else 'Low'

    if status == 0:
        rate = row['loan_int_rate']
        if (rate <= 12.5
                and row['loan_percent_income'] <= 0.5
                and row['cb_person_default_on_file'] == 'N'
                and row['cb_person_cred_hist_length'] > 4):
            return 'High'
        # Mid-range rate combined with a mid-range income share
        if 12.5 < rate <= 20 and 0.5 < row['loan_percent_income'] <= 0.9:
            return 'Medium'

    # Everything not matched above falls back to 'Low'
    return 'Low'

# Rate every record with the refined classifier, one row at a time
df['business_proposal_analysis'] = df.apply(categorize_business_plan, axis='columns')

# Show how many records received each rating
rating_counts = df['business_proposal_analysis'].value_counts()
print(rating_counts)

# Write the rated data out without the row index
df.to_csv(path_or_buf='credit_risk_dataset_with_analysis_advanced.csv', index=False)


business_proposal_analysis
Low       18778
High       7394
Medium     6409
Name: count, dtype: int64


In [5]:
import pandas as pd

# Read the re-labelled dataset from 'credit_risk_dataset_manipulated.csv'
df = pd.read_csv(filepath_or_buffer='credit_risk_dataset_manipulated.csv')
def categorize_business_plan(row):
    """Rate one loan record as 'High', 'Medium' or 'Low'.

    Parameters
    ----------
    row : mapping indexed by column name
        Must provide 'loan_status', 'loan_int_rate', 'loan_percent_income',
        'cb_person_default_on_file' and 'cb_person_cred_hist_length'.

    Returns
    -------
    str
        'High'   - loan_status is 0, interest rate at most 12.5,
                   loan_percent_income at most 0.5, no default on file ('N')
                   and credit history longer than 4.
        'Low'    - loan_status is 1 and the interest rate is above 20.
        'Medium' - every other record.
    """
    status = row['loan_status']

    if (status == 0
            and row['loan_int_rate'] <= 12.5
            and row['loan_percent_income'] <= 0.5
            and row['cb_person_default_on_file'] == 'N'
            and row['cb_person_cred_hist_length'] > 4):
        return 'High'

    if status == 1 and row['loan_int_rate'] > 20:
        return 'Low'

    # 'Medium' is the default case, chosen to balance the distribution.
    # An earlier explicit 'Medium' condition was dropped because it returned
    # the same label as this fallback and therefore never affected the result.
    return 'Medium'

# Rate every record with the adjusted classifier, one row at a time
df['business_proposal_analysis'] = df.apply(categorize_business_plan, axis='columns')

# Show how many records received each rating
rating_counts = df['business_proposal_analysis'].value_counts()
print(rating_counts)

# Write the rated data out without the row index
df.to_csv(path_or_buf='credit_risk_dataset_with_analysis_advanced_2.csv', index=False)


business_proposal_analysis
Medium    25124
High       7394
Low          63
Name: count, dtype: int64


In [4]:
import pandas as pd

# Read the randomised-intent dataset.
# NOTE(review): no cell in view writes 'randomised_LI_credit_risk_dataset.csv';
# the cell that randomises loan_intent saves to
# 'randomised_LI_credit_risk_dataset_final.csv' - confirm which file is intended.
df = pd.read_csv(filepath_or_buffer='randomised_LI_credit_risk_dataset.csv')
def categorize_business_plan(row):
    """Rate one loan record as 'High', 'Medium' or 'Low'.

    Parameters
    ----------
    row : mapping indexed by column name
        Must provide 'loan_status', 'loan_int_rate', 'loan_percent_income',
        'cb_person_default_on_file' and 'cb_person_cred_hist_length'.

    Returns
    -------
    str
        'High'   - loan_status is 0 or 1, interest rate at most 12.5,
                   loan_percent_income at most 0.5, no default on file ('N')
                   and credit history longer than 4.
        'Low'    - loan_status is 1 and ANY of: rate above 17, credit history
                   of at most 3, default on file ('Y');
                   or loan_status is 0 and ALL three of those hold.
        'Medium' - every other record.
    """
    status = row['loan_status']

    # The 'High' criteria are the same for both loan statuses, so they are
    # checked once instead of in two copy-pasted branches.
    if ((status == 0 or status == 1)
            and row['loan_int_rate'] <= 12.5
            and row['loan_percent_income'] <= 0.5
            and row['cb_person_default_on_file'] == 'N'
            and row['cb_person_cred_hist_length'] > 4):
        return 'High'

    # Status 1: a single risk flag (high rate, short history, prior default)
    # is enough for 'Low'.
    if status == 1 and (row['loan_int_rate'] > 17
                        or row['cb_person_cred_hist_length'] <= 3
                        or row['cb_person_default_on_file'] == 'Y'):
        return 'Low'

    # Status 0: all three risk flags must be present for 'Low'.
    if status == 0 and (row['loan_int_rate'] > 17
                        and row['cb_person_cred_hist_length'] <= 3
                        and row['cb_person_default_on_file'] == 'Y'):
        return 'Low'

    # 'Medium' is the default case, chosen to balance the distribution.
    # An earlier explicit 'Medium' condition was dropped because it returned
    # the same label as this fallback and therefore never affected the result.
    return 'Medium'

# Rate every record with the adjusted classifier, one row at a time
df['business_proposal_analysis'] = df.apply(categorize_business_plan, axis='columns')

# Show how many records received each rating
rating_counts = df['business_proposal_analysis'].value_counts()
print(rating_counts)

# Write the rated data out without the row index
df.to_csv(path_or_buf='randomised_LI_credit_risk_dataset_with_prop.csv', index=False)

business_proposal_analysis
Medium    19895
High       8375
Low        4311
Name: count, dtype: int64


In [7]:
import pandas as pd

# Read the randomised-intent dataset.
# NOTE(review): no cell in view writes 'randomised_LI_credit_risk_dataset.csv';
# the cell that randomises loan_intent saves to
# 'randomised_LI_credit_risk_dataset_final.csv' - confirm which file is intended.
df = pd.read_csv(filepath_or_buffer='randomised_LI_credit_risk_dataset.csv')

def categorize_business_plan(row):
    """Rate one loan record as 'High', 'Medium' or 'Low'.

    Parameters
    ----------
    row : mapping indexed by column name
        Must provide 'loan_status', 'loan_int_rate', 'loan_percent_income',
        'cb_person_default_on_file' and 'cb_person_cred_hist_length'.

    Returns
    -------
    str
        'High'   - loan_status is 0 or 1, interest rate at most 12.5,
                   loan_percent_income at most 0.5, no default on file ('N')
                   and credit history longer than 4.
        'Low'    - not 'High', and any of: rate above 17, credit history of
                   at most 2, default on file ('Y').
        'Medium' - every other record.
    """
    status = row['loan_status']

    # The 'High' criteria are the same for both loan statuses, so they are
    # checked once instead of in two copy-pasted branches.
    if ((status == 0 or status == 1)
            and row['loan_int_rate'] <= 12.5
            and row['loan_percent_income'] <= 0.5
            and row['cb_person_default_on_file'] == 'N'
            and row['cb_person_cred_hist_length'] > 4):
        return 'High'

    # Any single risk flag yields 'Low'. The label never depended on
    # loan_status here, so it is returned directly; the previous nested
    # status check had no else and returned None for any status other
    # than 0 or 1.
    if (row['loan_int_rate'] > 17
            or row['cb_person_cred_hist_length'] <= 2
            or row['cb_person_default_on_file'] == 'Y'):
        return 'Low'

    return 'Medium'

# Rate every record with the adjusted classifier, one row at a time
df['business_proposal_analysis'] = df.apply(categorize_business_plan, axis='columns')

# Show how many records received each rating
rating_counts = df['business_proposal_analysis'].value_counts()
print(rating_counts)

# Write the rated data out without the row index
df.to_csv(path_or_buf='randomised_LI_credit_risk_dataset_with_prop_2.csv', index=False)


business_proposal_analysis
Medium    13176
Low       11030
High       8375
Name: count, dtype: int64
