#### Customer Info

In [194]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta

In [195]:
np.random.seed(42)

In [196]:
#Customer ID
num_customers = 10000  #number of customers

customer_ids = np.arange(1, num_customers + 1)

In [197]:
#Age
age = np.random.normal(40, 15, num_customers)
age = np.clip(age, 18, 80).astype(int)  # Ensure age is between 18 and 80

In [198]:
#gender
# 0 --> Male
# 1 --> Female

options = [0,1]
gender = np.random.choice(options, num_customers, p=[0.55, 0.45]) 

In [199]:
#location
# 0 --> Urban
# 1 --> Suburb
# 2 --> Rural


locations = [0,1,2]
location_probs = [0.6, 0.3, 0.1]
location = np.random.choice(locations, num_customers, location_probs)

In [200]:
#income
income = np.random.lognormal(14, 0.5, num_customers)

#add correlation between age and income
correlation = 0.5 
income = (income * (1 - correlation) + 
          income * correlation * 
          (age / np.mean(age)))
income = np.round(income, -3) #round to the nearest thousand

In [201]:
income

array([1907000., 2361000., 1285000., ..., 1881000., 1115000., 1175000.])

In [202]:
df = pd.DataFrame({
    'CustomerID': customer_ids,
    'Age': age,
    'Gender': gender,
    'Income': income,
    'Location': location
})

# df = pd.concat([df, product_data, channel_interactions, recent_transactions], axis=1)

In [204]:
# df.corr()

In [109]:
#hishest education of customers
# 0 --> High School
# 1 --> Graduation
# 2 --> Masters
# 3 --> Post Doc

edu_choices = [0,1,2,3]
edu_probs = [0.3, 0.45, 0.2, 0.05]
highestEducation = np.random.choice(edu_choices, num_customers, p=edu_probs)

In [110]:
df["HighestEducation"]= highestEducation

In [111]:
df.head(15)

Unnamed: 0,CustomerID,Age,Gender,Income,Location,HighestEducation
0,1,47,0,1907000.0,2,1
1,2,37,0,2361000.0,0,2
2,3,49,0,1285000.0,2,0
3,4,62,0,1345000.0,2,1
4,5,36,1,1514000.0,1,0
5,6,36,0,408000.0,2,2
6,7,63,0,1001000.0,0,1
7,8,51,1,869000.0,1,0
8,9,32,0,1427000.0,2,1
9,10,48,1,1426000.0,1,3


In [112]:
#sanity check age and highest education
for idx, row in df.iterrows():
    if row['Age']<=21:
        if row['HighestEducation']>1:
            df.loc[idx,'HighestEducation']=1


In [113]:
df.head(15)

Unnamed: 0,CustomerID,Age,Gender,Income,Location,HighestEducation
0,1,47,0,1907000.0,2,1
1,2,37,0,2361000.0,0,2
2,3,49,0,1285000.0,2,0
3,4,62,0,1345000.0,2,1
4,5,36,1,1514000.0,1,0
5,6,36,0,408000.0,2,2
6,7,63,0,1001000.0,0,1
7,8,51,1,869000.0,1,0
8,9,32,0,1427000.0,2,1
9,10,48,1,1426000.0,1,3


In [114]:
#add correlation between highest education and income
edu_corr = 0.3
income = (income*(1-edu_corr)+ income*edu_corr*(highestEducation+1)/float(2.5))

In [115]:
income = np.round(income, -3)

In [116]:
income

array([1793000., 2503000., 1054000., ..., 1542000.,  914000., 1104000.])

In [117]:
df['Income']=income

In [118]:
df.head(15)

Unnamed: 0,CustomerID,Age,Gender,Income,Location,HighestEducation
0,1,47,0,1793000.0,2,1
1,2,37,0,2503000.0,0,2
2,3,49,0,1054000.0,2,0
3,4,62,0,1264000.0,2,1
4,5,36,1,1241000.0,1,0
5,6,36,0,432000.0,2,2
6,7,63,0,941000.0,0,1
7,8,51,1,713000.0,1,0
8,9,32,0,1341000.0,2,1
9,10,48,1,1683000.0,1,3


In [119]:
df.corr()

Unnamed: 0,CustomerID,Age,Gender,Income,Location,HighestEducation
CustomerID,1.0,-0.00842,0.026943,-0.017256,-0.005225,-0.002937
Age,-0.00842,1.0,0.003921,0.310515,-0.000706,0.069119
Gender,0.026943,0.003921,1.0,-0.000398,0.014312,0.001668
Income,-0.017256,0.310515,-0.000398,1.0,-0.011697,0.206347
Location,-0.005225,-0.000706,0.014312,-0.011697,1.0,-0.007712
HighestEducation,-0.002937,0.069119,0.001668,0.206347,-0.007712,1.0


In [120]:
#credit score
creditScore = np.random.normal(700, 50, num_customers)
creditScore = np.clip(creditScore, 300, 850).astype(int)
creditScore

array([668, 689, 635, ..., 754, 629, 713])

In [121]:
df['CreditScore'] = creditScore
df.head(15)

Unnamed: 0,CustomerID,Age,Gender,Income,Location,HighestEducation,CreditScore
0,1,47,0,1793000.0,2,1,668
1,2,37,0,2503000.0,0,2,689
2,3,49,0,1054000.0,2,0,635
3,4,62,0,1264000.0,2,1,661
4,5,36,1,1241000.0,1,0,711
5,6,36,0,432000.0,2,2,711
6,7,63,0,941000.0,0,1,721
7,8,51,1,713000.0,1,0,666
8,9,32,0,1341000.0,2,1,753
9,10,48,1,1683000.0,1,3,692


In [122]:
# professions
# 0 --> Salaried
# 1 --> Self-employed
# 2 --> Business
# 3 --> Retired

professions = [0,1,2,3]
prof_probs = [0.55, 0.3, 0.1, 0.05]
profession = np.random.choice(professions, num_customers, p = prof_probs)

df['Profession']=profession

#sanity check age and salaried professionals
for idx, row in df.iterrows():
    if row['Age']>60 and row['Profession']==0:
        df.loc[idx, 'Profession']=3

In [124]:
df.head(15)

Unnamed: 0,CustomerID,Age,Gender,Income,Location,HighestEducation,CreditScore,Profession
0,1,47,0,1793000.0,2,1,668,1
1,2,37,0,2503000.0,0,2,689,2
2,3,49,0,1054000.0,2,0,635,0
3,4,62,0,1264000.0,2,1,661,3
4,5,36,1,1241000.0,1,0,711,0
5,6,36,0,432000.0,2,2,711,0
6,7,63,0,941000.0,0,1,721,3
7,8,51,1,713000.0,1,0,666,2
8,9,32,0,1341000.0,2,1,753,0
9,10,48,1,1683000.0,1,3,692,0


In [129]:
#disposable income

percentage = np.random.lognormal(3,0.5,num_customers)
percentage = np.round(percentage, 0)

In [134]:
df['DisposableIncome']= np.round(percentage*0.01*df['Income'], -3)

df.head(15)

Unnamed: 0,CustomerID,Age,Gender,Income,Location,HighestEducation,CreditScore,Profession,DisposableIncome
0,1,47,0,1793000.0,2,1,668,1,126000.0
1,2,37,0,2503000.0,0,2,689,2,275000.0
2,3,49,0,1054000.0,2,0,635,0,316000.0
3,4,62,0,1264000.0,2,1,661,3,442000.0
4,5,36,1,1241000.0,1,0,711,0,285000.0
5,6,36,0,432000.0,2,2,711,0,186000.0
6,7,63,0,941000.0,0,1,721,3,207000.0
7,8,51,1,713000.0,1,0,666,2,100000.0
8,9,32,0,1341000.0,2,1,753,0,215000.0
9,10,48,1,1683000.0,1,3,692,0,151000.0


In [135]:
df.to_csv('CustomerInfo.csv', index = False)

In [136]:
df.corr()

Unnamed: 0,CustomerID,Age,Gender,Income,Location,HighestEducation,CreditScore,Profession,DisposableIncome
CustomerID,1.0,-0.00842,0.026943,-0.017256,-0.005225,-0.002937,-0.013352,-0.008864,-0.022152
Age,-0.00842,1.0,0.003921,0.310515,-0.000706,0.069119,0.004461,0.275512,0.202682
Gender,0.026943,0.003921,1.0,-0.000398,0.014312,0.001668,0.012299,-0.000277,0.002454
Income,-0.017256,0.310515,-0.000398,1.0,-0.011697,0.206347,-0.004981,0.065605,0.683195
Location,-0.005225,-0.000706,0.014312,-0.011697,1.0,-0.007712,-0.010295,-0.004681,-0.003359
HighestEducation,-0.002937,0.069119,0.001668,0.206347,-0.007712,1.0,-0.016985,0.004362,0.126775
CreditScore,-0.013352,0.004461,0.012299,-0.004981,-0.010295,-0.016985,1.0,0.017085,-0.009595
Profession,-0.008864,0.275512,-0.000277,0.065605,-0.004681,0.004362,0.017085,1.0,0.038788
DisposableIncome,-0.022152,0.202682,0.002454,0.683195,-0.003359,0.126775,-0.009595,0.038788,1.0


#### Products 

In [186]:
#product owned and amount
products = ['PersonalLoan', 'HomeLoan', 'VehicleLoan', 'EducationLoan', 'Insurance', 'VehicleInsurance', 'MutualFund', 'FixedDeposit']

def generate_product_data(product, ownership_rate, amount_mean, amount_std):
    owned = np.random.choice([0, 1], num_customers, p=[1-ownership_rate, ownership_rate])
    amount = np.random.lognormal(amount_mean, amount_std, num_customers) * owned
    amount = np.round(amount, -2) 
    return pd.DataFrame({
        f'{product}_Owned': owned,
        f'{product}_Amount': amount
    })

product_data = pd.concat([
    generate_product_data('PersonalLoan', 0.3, 13, 0.5),
    generate_product_data('HomeLoan', 0.5, 15, 0.5),
    generate_product_data('VehicleLoan', 0.4, 13, 0.6),
    generate_product_data('EducationLoan', 0.3, 12, 0.4),
    generate_product_data('Insurance', 0.7, 10, 0.5),
    generate_product_data('VehicleInsurance', 0.4, 10, 0.5),
    generate_product_data('MutualFund', 0.5, 12, 0.5),
    generate_product_data('FixedDeposit', 0.4, 12, 0.5)
], axis=1)

In [187]:
df2 = pd.DataFrame({
    "CustomerId": customer_ids,
    })
df2 = pd.concat([df2, product_data], axis =1)
df2.head(15)


Unnamed: 0,CustomerId,PersonalLoan_Owned,PersonalLoan_Amount,HomeLoan_Owned,HomeLoan_Amount,VehicleLoan_Owned,VehicleLoan_Amount,EducationLoan_Owned,EducationLoan_Amount,Insurance_Owned,Insurance_Amount,VehicleInsurance_Owned,VehicleInsurance_Amount,MutualFund_Owned,MutualFund_Amount,FixedDeposit_Owned,FixedDeposit_Amount
0,1,0,0.0,0,0.0,1,430900.0,0,0.0,1,18400.0,0,0.0,1,105400.0,1,156100.0
1,2,0,0.0,0,0.0,0,0.0,1,196100.0,1,15300.0,0,0.0,0,0.0,1,323400.0
2,3,0,0.0,0,0.0,1,811400.0,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
3,4,1,436100.0,0,0.0,0,0.0,1,216800.0,0,0.0,1,14900.0,0,0.0,0,0.0
4,5,0,0.0,1,3695900.0,1,527800.0,0,0.0,1,26900.0,1,19400.0,1,185700.0,0,0.0
5,6,1,445500.0,0,0.0,1,342000.0,0,0.0,0,0.0,1,42300.0,0,0.0,1,178200.0
6,7,0,0.0,0,0.0,0,0.0,0,0.0,1,20400.0,0,0.0,0,0.0,0,0.0
7,8,0,0.0,1,2071000.0,1,562400.0,0,0.0,0,0.0,1,17300.0,0,0.0,0,0.0
8,9,0,0.0,1,2685900.0,1,323500.0,0,0.0,1,20000.0,1,16500.0,0,0.0,0,0.0
9,10,0,0.0,0,0.0,0,0.0,1,92100.0,1,25900.0,1,18600.0,1,256400.0,0,0.0


In [188]:
products = ['PersonalLoan', 'HomeLoan', 'VehicleLoan', 'EducationLoan', 'Insurance', 'VehicleInsurance', 'MutualFund', 'FixedDeposit']

product_params = {
    'PersonalLoan': {'amount_mean':13, 'amount_std':0.5},
    'HomeLoan': {'amount_mean':15, 'amount_std':0.5},
    'VehicleLoan': {'amount_mean':13, 'amount_std':0.6},
    'EducationLoan': {'amount_mean':12, 'amount_std':0.4},
    'Insurance': {'amount_mean':10, 'amount_std':0.3},
    'VehicleInsurance': {'amount_mean':10, 'amount_std':0.5},
    'MutualFund': {'amount_mean':12, 'amount_std':0.6},
    'FixedDeposit': {'amount_mean':12, 'amount_std':0.6},
}


def generate_product_data():
    product_data = []

    num_products = num_products_probs = [0.3,0.45,0.25]
    
    for _ in range(num_customers):
        # Randomly choose the number of products (0 to 3) based on the avove probabilities
        num_products = np.random.choice(np.arange(1,4), p=num_products_probs)
        
        # Randomly select products
        selected_products = np.random.choice(products, num_products, replace=False)
        
        customer_products = {}
        for product in products:
            if product in selected_products:
                customer_products[f'{product}_Owned'] = 1
                amount = np.random.lognormal(
                    product_params[product]['amount_mean'],
                    product_params[product]['amount_std']
                )
                customer_products[f'{product}_Amount'] = round(amount, -2)
            else:
                customer_products[f'{product}_Owned'] = 0
                customer_products[f'{product}_Amount'] = 0
        
        product_data.append(customer_products)
    
    return pd.DataFrame(product_data)


product_data = generate_product_data()

# Verify the distribution of number of products per customer
product_counts = product_data[[f'{p}_Owned' for p in products]].sum(axis=1)
print(product_counts.value_counts(normalize=True).sort_index())


1    0.3023
2    0.4472
3    0.2505
Name: proportion, dtype: float64


In [189]:
df2 = pd.concat([pd.DataFrame({'CustomerID':customer_ids}), product_data], axis=1)

In [190]:
df2.head(10)

Unnamed: 0,CustomerID,PersonalLoan_Owned,PersonalLoan_Amount,HomeLoan_Owned,HomeLoan_Amount,VehicleLoan_Owned,VehicleLoan_Amount,EducationLoan_Owned,EducationLoan_Amount,Insurance_Owned,Insurance_Amount,VehicleInsurance_Owned,VehicleInsurance_Amount,MutualFund_Owned,MutualFund_Amount,FixedDeposit_Owned,FixedDeposit_Amount
0,1,0,0.0,0,0.0,1,129300.0,0,0.0,0,0.0,0,0.0,0,0.0,1,149200.0
1,2,0,0.0,0,0.0,0,0.0,1,189000.0,0,0.0,0,0.0,0,0.0,1,230800.0
2,3,0,0.0,0,0.0,1,1676000.0,0,0.0,0,0.0,0,0.0,0,0.0,1,182100.0
3,4,0,0.0,0,0.0,1,332000.0,0,0.0,0,0.0,1,35500.0,0,0.0,0,0.0
4,5,0,0.0,0,0.0,0,0.0,0,0.0,1,23400.0,0,0.0,0,0.0,0,0.0
5,6,1,798400.0,0,0.0,1,317900.0,0,0.0,0,0.0,0,0.0,1,239200.0,0,0.0
6,7,1,296900.0,1,4075200.0,0,0.0,0,0.0,1,25900.0,0,0.0,0,0.0,0,0.0
7,8,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,19000.0,0,0.0,0,0.0
8,9,0,0.0,0,0.0,0,0.0,1,187200.0,0,0.0,1,37800.0,0,0.0,1,66700.0
9,10,0,0.0,1,5420300.0,0,0.0,0,0.0,1,30300.0,0,0.0,0,0.0,0,0.0


In [191]:
#last product purchased
last_purchase=[]
for idx, row in df2.iterrows():
    owned_products = [p for p in products if row[f'{p}_Owned']==1]
    last_product_bought = np.random.choice(owned_products)
    last_purchase.append(last_product_bought)
recent_transaction = pd.DataFrame({'LastProductPurchased':last_purchase})
df2 = pd.concat([df2, recent_transaction], axis =1)
df2.head(10)


Unnamed: 0,CustomerID,PersonalLoan_Owned,PersonalLoan_Amount,HomeLoan_Owned,HomeLoan_Amount,VehicleLoan_Owned,VehicleLoan_Amount,EducationLoan_Owned,EducationLoan_Amount,Insurance_Owned,Insurance_Amount,VehicleInsurance_Owned,VehicleInsurance_Amount,MutualFund_Owned,MutualFund_Amount,FixedDeposit_Owned,FixedDeposit_Amount,LastProductPurchased
0,1,0,0.0,0,0.0,1,129300.0,0,0.0,0,0.0,0,0.0,0,0.0,1,149200.0,VehicleLoan
1,2,0,0.0,0,0.0,0,0.0,1,189000.0,0,0.0,0,0.0,0,0.0,1,230800.0,EducationLoan
2,3,0,0.0,0,0.0,1,1676000.0,0,0.0,0,0.0,0,0.0,0,0.0,1,182100.0,VehicleLoan
3,4,0,0.0,0,0.0,1,332000.0,0,0.0,0,0.0,1,35500.0,0,0.0,0,0.0,VehicleInsurance
4,5,0,0.0,0,0.0,0,0.0,0,0.0,1,23400.0,0,0.0,0,0.0,0,0.0,Insurance
5,6,1,798400.0,0,0.0,1,317900.0,0,0.0,0,0.0,0,0.0,1,239200.0,0,0.0,VehicleLoan
6,7,1,296900.0,1,4075200.0,0,0.0,0,0.0,1,25900.0,0,0.0,0,0.0,0,0.0,PersonalLoan
7,8,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0,1,19000.0,0,0.0,0,0.0,VehicleInsurance
8,9,0,0.0,0,0.0,0,0.0,1,187200.0,0,0.0,1,37800.0,0,0.0,1,66700.0,FixedDeposit
9,10,0,0.0,1,5420300.0,0,0.0,0,0.0,1,30300.0,0,0.0,0,0.0,0,0.0,HomeLoan


In [192]:
df2.to_csv('ProductInfo.csv', index = False)