In [153]:
import pandas as pd
import numpy as np

In [154]:
print("\033[1m"+"Starting Synthetic Data Generation"+"\033[0m")


[1mStarting Synthetic Data Generation[0m


In [155]:
# Setting random seed for reproducibility
np.random.seed(0)

In [156]:
# Number of records
n_records = 500

In [157]:
# Function to generate random data for a categorical variable
def generate_categorical_data(categories, n_records):
    return np.random.choice(categories, n_records)

In [158]:
# Function to generate random data for a continuous variable
def generate_continuous_data(range_start, range_end, n_records):
    return np.random.uniform(range_start, range_end, n_records)

In [159]:
# Generate data for each variable
data = {
    'Age': generate_continuous_data(18, 70, n_records),
    'Customer_Type': generate_categorical_data(['Individual', 'Non Individual'], n_records),
    'Gender': generate_categorical_data(['Male', 'Female', 'Other'], n_records),
    'Location': generate_categorical_data(['Rural', 'Semi Urban', 'Urban', 'Metro'], n_records),
    'Income': generate_continuous_data(10000, 1000000, n_records),
    'Occupation': generate_categorical_data(['Salaried','Non Salaried'], n_records),
    'Type_of_Profession': generate_categorical_data(['Public', 'Private', 'Professional', 'Business','Pensioner','Student','Others'], n_records),
    'Credit_Score': generate_continuous_data(300, 850, n_records),
    'Number_of_Products': np.random.randint(1, 10, n_records),
    'Tenure_of_Association': generate_continuous_data(0, 30, n_records),
    'Cheque_Bounce_Indicator': generate_categorical_data(['Yes', 'No'], n_records),
    'Min_Avg_Balance': generate_continuous_data(0, 500000, n_records),
    'Last_FY_Credit': generate_continuous_data(1000, 5000000, n_records),
    'Last_Account_Opened': generate_continuous_data(0, 3650, n_records), 
    'Account_Nature': generate_categorical_data(['Standard', 'Stressed'], n_records),
    'prd': generate_categorical_data(['TL^AGRI^ALLIED^ALLIED_ANIMAL_HUSBANDRY',
'TL^AGRI^ALLIED^ALLIED_DAIRY',
'TL^AGRI^ALLIED^ALLIED_FISHERY',
'TL^AGRI^ALLIED^ALLIED_POULTRY',
'TL^AGRI^ALLIED^ALLIED_SERICULTURE',
'TL^AGRI^ANCL^AGRI_CLINIC',
'TL^AGRI^ANCL^ANCL_AGRO_PROCESS',
'TL^AGRI^CROP^AGRI_JL',
'CC^AGRI^CROP^AGRI_KCC',
'TL^AGRI^INFRA^COLD_GODOWN',
'TL^AGRI^INFRA^LAND_DEVELOPMENT',
'TL^AGRI^INV^FARM_MECH',
'TL^AGRI^INV^MINOR_IRRIGATION',
'TL^AGRI^INV^SHG',
'TL^CORPORATE^CORPORATE^MFG',
'TL^CORPORATE^LARGE_CORP^MFG',
'TL^CORPORATE^MID_CORP^MFG',
'TL^CORPORATE^CORPORATE^SRV',
'TL^CORPORATE^LARGE_CORP^SRV',
'TL^CORPORATE^MID_CORP^SRV',
'CC^CORPORATE^CORPORATE^MFG',
'CC^CORPORATE^LARGE_CORP^MFG',
'CC^CORPORATE^MID_CORP^MFG',
'CC^CORPORATE^CORPORATE^SRV',
'CC^CORPORATE^LARGE_CORP^SRV',
'CC^CORPORATE^MID_CORP^SRV',
'TL^MSME^MEDIUM^MFG',
'TL^MSME^MEDIUM^SRV',
'TL^MSME^MICRO^MFG',
'TL^MSME^MICRO^SRV',
'TL^MSME^SMALL^MFG',
'TL^MSME^SMALL^SRV',
'CC^MSME^MEDIUM^MFG',
'CC^MSME^MEDIUM^SRV',
'CC^MSME^MICRO^MFG',
'CC^MSME^MICRO^SRV',
'CC^MSME^SMALL^MFG',
'CC^MSME^SMALL^SRV',
'TL^RETAIL^EDUCATION_LOAN^EDUCATION',
'TL^RETAIL^EDUCATION_LOAN^EDUCATION_ABROAD',
'TL^RETAIL^HOME_LOAN^PLOT',
'TL^RETAIL^HOME_LOAN^BUILDING',
'TL^RETAIL^MORTGAGE^MORTGAGE',
'TL^RETAIL^PERSONAL_LOAN^JEWEL_NP',
'TL^RETAIL^PERSONAL_LOAN^LOD_ADV',
'TL^RETAIL^PERSONAL_LOAN^PENSION',
'TL^RETAIL^PERSONAL_LOAN^SALARY',
'TL^RETAIL^VEHICLE_LOAN^AUTO_VEHICLE_2W',
'TL^RETAIL^VEHICLE_LOAN^AUTO_VEHICLE_4W'], n_records),
    'Balance': generate_continuous_data(500, 500000, n_records),
    'Security_Amount': generate_continuous_data(1000, 50000000, n_records),
    'Loan_Repayment_Amount': generate_continuous_data(100, 200000, n_records),
    'Rate_of_Interest': generate_continuous_data(4, 15, n_records), 
    'Limit_Expired': generate_categorical_data(['Yes', 'No'], n_records),
    'Stock_Statement_Expiry_Date_Due': generate_categorical_data(['Yes', 'No'], n_records),
    'Loan_Disbursed_Amount': generate_continuous_data(5000, 50000000, n_records),
    'Restructured_Flag': generate_categorical_data(['Yes', 'No'], n_records),
    'NEG_MAB_CORR': generate_continuous_data(0, 1, n_records),
    'MAB_CORR': generate_continuous_data(0, 1, n_records),
    'REPAY_HIST_GOOD': generate_categorical_data(['Good', 'Bad'], n_records),
    'SMA_DAYS': np.random.randint(0, 365, n_records), 
    'NPA_TO_STD': np.random.randint(0, 4, n_records),
    'STD_TO_NPA': np.random.randint(0, 4, n_records),
    'BOUNCE_RATE': generate_continuous_data(0, 1, n_records),
    'AVG_WDL': generate_continuous_data(1000, 200000, n_records),
    'MORE_THAN_10P_CASH_WDL': generate_categorical_data(['Yes', 'No'], n_records),
    'Inflation_Rate': generate_categorical_data([0, 1], n_records), 
    'Benchmark_Lending_Rate': generate_categorical_data([0, 1], n_records), 
    'Unemployment_Rate': generate_categorical_data([0, 1], n_records), 
   
}

In [160]:
# Creating the DataFrame
df = pd.DataFrame(data)

In [161]:
df[['Type','Sector','Sub_sector','Product']] = df['prd'].str.split('^',expand=True)

In [162]:
# Determining delinquency based on various factors and hypothesis
def determine_delinquency(row):
    risk_score = 0
    
    if(row['Type'])=='CC':
    # Customer Behaviour Data
      risk_score += row['Credit_Score'] < 600 
      risk_score += row['Cheque_Bounce_Indicator'] == 'Yes'
      risk_score += row['Min_Avg_Balance'] < 10000
      risk_score += row['Last_FY_Credit'] < 20000
      risk_score += row['Income'] < 15000 

    # Account Level Data
      risk_score += row['Rate_of_Interest'] > 12  
      risk_score += row['Security_Amount']/row['Balance']  < 0.5  
      risk_score +=row['Restructured_Flag']=='Yes'
      risk_score +=row['Limit_Expired']=='Yes'
      risk_score +=row['Stock_Statement_Expiry_Date_Due']=='Yes'
 

    # Transaction Data
      risk_score += row['NEG_MAB_CORR'] > 0.5 
      risk_score += row['MAB_CORR'] > 0.5 
      risk_score +=row['REPAY_HIST_GOOD']=='BAD'
    
      risk_score += row['SMA_DAYS'] > 180
      risk_score += row['BOUNCE_RATE'] > 0.3 
    
      risk_score += row['NPA_TO_STD']  < 2
      risk_score += row['STD_TO_NPA'] > 2
    
      risk_score += row['AVG_WDL']<20000
    
   
      return 'Yes' if risk_score >= 8 else 'No'
    else:
    # Customer Behaviour Data
      risk_score += row['Credit_Score'] < 650 
      risk_score += row['Cheque_Bounce_Indicator'] == 'Yes'
      risk_score += row['Min_Avg_Balance'] < 10000
      risk_score += row['Last_FY_Credit'] < 20000
      risk_score += row['Income'] < 15000

    # Account Level Data
      risk_score += row['Loan_Repayment_Amount'] > 15000 
      risk_score += row['Rate_of_Interest'] > 12  
      risk_score += row['Security_Amount']/row['Balance']  < 0.5  
      risk_score +=row['Restructured_Flag']=='Yes'
      
    # Transaction Data
      risk_score += row['NEG_MAB_CORR'] > 0.5 
      risk_score += row['MAB_CORR'] > 0.5 
      risk_score +=row['REPAY_HIST_GOOD']=='BAD'
    
      risk_score += row['SMA_DAYS'] > 180
      risk_score += row['BOUNCE_RATE'] > 0.3 
    
      risk_score += row['NPA_TO_STD']  < 2
      risk_score += row['STD_TO_NPA'] > 2
    
         
      return 'Yes' if risk_score >= 7 else 'No'

In [163]:
# Apply the function to determine delinquency
df['Delinquent'] = df.apply(determine_delinquency, axis=1)

In [164]:
#Ading noise
# Setting random seed for reproducibility
np.random.seed(0)

In [165]:
# Number of records
n_records = 500

In [166]:
# Function to generate random data for a categorical variable
def generate_categorical_data(categories, n_records):
    return np.random.choice(categories, n_records)


In [167]:
# Function to generate random data for a continuous variable
def generate_continuous_data(range_start, range_end, n_records):
    return np.random.uniform(range_start, range_end, n_records)


In [168]:
# Generate data for each variable
data = {
    'Age': generate_continuous_data(0, 120, n_records),
    'Customer_Type': generate_categorical_data(['Individual', 'Non Individual'], n_records),
    'Gender': generate_categorical_data(['Male', 'Female', 'Other'], n_records),
    'Location': generate_categorical_data(['Rural', 'Semi Urban', 'Urban', 'Metro'], n_records),
    'Income': generate_continuous_data(0, 10000000, n_records),
    'Occupation': generate_categorical_data(['Salaried','Non Salaried'], n_records),
    'Type_of_Profession': generate_categorical_data(['Public', 'Private', 'Professional', 'Business','Pensioner','Student','Others'], n_records),
    'Credit_Score': generate_continuous_data(300, 850, n_records),
    'Number_of_Products': np.random.randint(1, 10, n_records),
    'Tenure_of_Association': generate_continuous_data(0, 30, n_records),
    'Cheque_Bounce_Indicator': generate_categorical_data(['Yes', 'No'], n_records),
    'Min_Avg_Balance': generate_continuous_data(500000, 5000000, n_records),
    'Last_FY_Credit': generate_continuous_data(0, 1000, n_records),
    'Last_Account_Opened': generate_continuous_data(0, 3650, n_records), 
    'Account_Nature': generate_categorical_data(['Standard', 'Stressed'], n_records),
    'prd': generate_categorical_data(['TL^AGRI^ALLIED^ALLIED_ANIMAL_HUSBANDRY',
'TL^AGRI^ALLIED^ALLIED_DAIRY',
'TL^AGRI^ALLIED^ALLIED_FISHERY',
'TL^AGRI^ALLIED^ALLIED_POULTRY',
'TL^AGRI^ALLIED^ALLIED_SERICULTURE',
'TL^AGRI^ANCL^AGRI_CLINIC',
'TL^AGRI^ANCL^ANCL_AGRO_PROCESS',
'TL^AGRI^CROP^AGRI_JL',
'CC^AGRI^CROP^AGRI_KCC',
'TL^AGRI^INFRA^COLD_GODOWN',
'TL^AGRI^INFRA^LAND_DEVELOPMENT',
'TL^AGRI^INV^FARM_MECH',
'TL^AGRI^INV^MINOR_IRRIGATION',
'TL^AGRI^INV^SHG',
'TL^CORPORATE^CORPORATE^MFG',
'TL^CORPORATE^LARGE_CORP^MFG',
'TL^CORPORATE^MID_CORP^MFG',
'TL^CORPORATE^CORPORATE^SRV',
'TL^CORPORATE^LARGE_CORP^SRV',
'TL^CORPORATE^MID_CORP^SRV',
'CC^CORPORATE^CORPORATE^MFG',
'CC^CORPORATE^LARGE_CORP^MFG',
'CC^CORPORATE^MID_CORP^MFG',
'CC^CORPORATE^CORPORATE^SRV',
'CC^CORPORATE^LARGE_CORP^SRV',
'CC^CORPORATE^MID_CORP^SRV',
'TL^MSME^MEDIUM^MFG',
'TL^MSME^MEDIUM^SRV',
'TL^MSME^MICRO^MFG',
'TL^MSME^MICRO^SRV',
'TL^MSME^SMALL^MFG',
'TL^MSME^SMALL^SRV',
'CC^MSME^MEDIUM^MFG',
'CC^MSME^MEDIUM^SRV',
'CC^MSME^MICRO^MFG',
'CC^MSME^MICRO^SRV',
'CC^MSME^SMALL^MFG',
'CC^MSME^SMALL^SRV',
'TL^RETAIL^EDUCATION_LOAN^EDUCATION',
'TL^RETAIL^EDUCATION_LOAN^EDUCATION_ABROAD',
'TL^RETAIL^HOME_LOAN^PLOT',
'TL^RETAIL^HOME_LOAN^BUILDING',
'TL^RETAIL^MORTGAGE^MORTGAGE',
'TL^RETAIL^PERSONAL_LOAN^JEWEL_NP',
'TL^RETAIL^PERSONAL_LOAN^LOD_ADV',
'TL^RETAIL^PERSONAL_LOAN^PENSION',
'TL^RETAIL^PERSONAL_LOAN^SALARY',
'TL^RETAIL^VEHICLE_LOAN^AUTO_VEHICLE_2W',
'TL^RETAIL^VEHICLE_LOAN^AUTO_VEHICLE_4W'], n_records),
    'Balance': generate_continuous_data(500000, 50000000, n_records),
    'Security_Amount': generate_continuous_data(1000, 50000000, n_records),
    'Loan_Repayment_Amount': generate_continuous_data(200000, 1000000, n_records),
    'Rate_of_Interest': generate_continuous_data(4, 15, n_records), 
    'Limit_Expired': generate_categorical_data(['Yes', 'No'], n_records),
    'Stock_Statement_Expiry_Date_Due': generate_categorical_data(['Yes', 'No'], n_records),
    'Loan_Disbursed_Amount': generate_continuous_data(5000, 50000000, n_records),
    'Restructured_Flag': generate_categorical_data(['Yes', 'No'], n_records),
    'NEG_MAB_CORR': generate_continuous_data(0, 1, n_records),
    'MAB_CORR': generate_continuous_data(0, 1, n_records),
    'REPAY_HIST_GOOD': generate_categorical_data(['Good', 'Bad'], n_records),
    'SMA_DAYS': np.random.randint(0, 365, n_records), 
    'NPA_TO_STD': np.random.randint(0, 4, n_records),
    'STD_TO_NPA': np.random.randint(0, 4, n_records),
    'BOUNCE_RATE': generate_continuous_data(0, 1, n_records),
    'AVG_WDL': generate_continuous_data(0, 1000000, n_records),
    'MORE_THAN_10P_CASH_WDL': generate_categorical_data(['Yes', 'No'], n_records),
    'Inflation_Rate': generate_categorical_data([0, 1], n_records), 
    'Benchmark_Lending_Rate': generate_categorical_data([0, 1], n_records), 
    'Unemployment_Rate': generate_categorical_data([0, 1], n_records), 
   
}

In [169]:
# Creating the DataFrame
df1 = pd.DataFrame(data)

In [170]:
df1[['Type','Sector','Sub_sector','Product']] = df1['prd'].str.split('^',expand=True)

In [171]:
# Determining delinquency based on various factors
def determine_delinquency(row):
    risk_score = 0
    
    if(row['Type'])=='CC':
    # Customer Behaviour Data
      risk_score += row['Credit_Score'] < 600 
      risk_score += row['Cheque_Bounce_Indicator'] == 'Yes'
      risk_score += row['Min_Avg_Balance'] < 10000
      risk_score += row['Last_FY_Credit'] < 20000
      risk_score += row['Income'] < 15000 

    # Account Level Data
      risk_score += row['Rate_of_Interest'] > 12  
      risk_score += row['Security_Amount']/row['Balance']  < 0.5  
      risk_score +=row['Restructured_Flag']=='Yes'
      risk_score +=row['Limit_Expired']=='Yes'
      risk_score +=row['Stock_Statement_Expiry_Date_Due']=='Yes'
 

    # Transaction Data
      risk_score += row['NEG_MAB_CORR'] > 0.5 
      risk_score += row['MAB_CORR'] > 0.5 
      risk_score +=row['REPAY_HIST_GOOD']=='BAD'
    
      risk_score += row['SMA_DAYS'] > 180
      risk_score += row['BOUNCE_RATE'] > 0.3 
    
      risk_score += row['NPA_TO_STD']  < 2
      risk_score += row['STD_TO_NPA'] > 2
    
      risk_score += row['AVG_WDL']<20000
    
   
      return 'Yes' if risk_score >= 10 else 'No'
    else:
    # Customer Behaviour Data
      risk_score += row['Credit_Score'] < 650 
      risk_score += row['Cheque_Bounce_Indicator'] == 'Yes'
      risk_score += row['Min_Avg_Balance'] < 10000
      risk_score += row['Last_FY_Credit'] < 20000
      risk_score += row['Income'] < 15000

    # Account Level Data
      risk_score += row['Loan_Repayment_Amount'] > 15000 
      risk_score += row['Rate_of_Interest'] > 12  
      risk_score += row['Security_Amount']/row['Balance']  < 0.5  
      risk_score +=row['Restructured_Flag']=='Yes'
      
    # Transaction Data
      risk_score += row['NEG_MAB_CORR'] > 0.5 
      risk_score += row['MAB_CORR'] > 0.5 
      risk_score +=row['REPAY_HIST_GOOD']=='BAD'
    
      risk_score += row['SMA_DAYS'] > 180
      risk_score += row['BOUNCE_RATE'] > 0.3 
    
      risk_score += row['NPA_TO_STD']  < 2
      risk_score += row['STD_TO_NPA'] > 2
    
         
      return 'Yes' if risk_score >= 10 else 'No'

In [172]:
# Apply the function to determine delinquency
df1['Delinquent'] = df1.apply(determine_delinquency, axis=1)

In [174]:
df1.loc[df1['Age'] < 18, 'Age'] = np.nan
df1.loc[df1['Age'] > 75, 'Age'] = np.nan

df1.loc[df1['Income'] < 10000, 'Income'] = np.nan
df1.loc[df1['Income'] > 1000000, 'Income'] = np.nan

df1.loc[df1['AVG_WDL'] < 1000, 'AVG_WDL'] = np.nan
df1.loc[df1['AVG_WDL'] > 200000, 'AVG_WDL'] = np.nan

df1['Type_of_Profession'] = None

In [175]:
df2=pd.concat([df,df1])

In [176]:
print("\033[1m"+"Synthetic Data Generated"+"\033[0m")
df2.head

[1mSynthetic Data Generated[0m


<bound method NDFrame.head of            Age   Customer_Type  Gender    Location         Income  \
0    46.538302  Non Individual    Male  Semi Urban  499842.513789   
1    55.189847  Non Individual    Male       Urban  674177.977692   
2    49.343696  Non Individual  Female  Semi Urban  144106.987537   
3    46.333926  Non Individual  Female       Metro  735266.930655   
4    40.030050      Individual    Male  Semi Urban  680529.365466   
..         ...             ...     ...         ...            ...   
495  32.598332      Individual   Other  Semi Urban            NaN   
496  54.653298      Individual    Male       Metro            NaN   
497  48.205624  Non Individual  Female       Metro            NaN   
498  29.809616      Individual   Other  Semi Urban            NaN   
499  60.703966      Individual  Female       Rural            NaN   

       Occupation Type_of_Profession  Credit_Score  Number_of_Products  \
0    Non Salaried           Business    760.974792                 

In [177]:
df2.to_csv("synthetic_data_withnoise.csv",index=False)

In [178]:
print("\033[1m"+"Data Generation Step Completed"+"\033[0m")

[1mData Generation Step Completed[0m
