<h1>Data Generation Code</h1>

<h3> Summary </h3>
<b>In this notebook, I find the number of data points that fall in each range and then randomly generate that many data points within the range.</b>

In [1]:
import math # lets us use ceil() or floor() functions
import random # lets us seed the random generator so that runs are reproducible
from random import randint # import a random integer generator

import numpy as np # imports a fast numerical programming library
import pandas as pd # lets us handle data as dataframes
import scipy as sp # imports stats functions, amongst other things

<h4> Data Generation for Active Users </h4>
<b>I start by reading in the pre-processed data sets as pandas dataframes and then rename the columns so that the labels are easy to parse in Python.</b>

In [2]:
# Load the two pre-processed tables for active users:
#   df_act_1 -- Balance ranges (rows) by Age ranges (columns)
#   df_act_2 -- Age ranges (rows) by Title (columns)
df_act_1, df_act_2 = [
    pd.read_csv(csv_path)
    for csv_path in ('./data/act_sal_age.csv', './data/act_age_lev.csv')
]

In [3]:
# Relabel the columns with names that are easy to parse: the Balance-range
# label, one "<min age>_<max age>" column per age range, and a final
# "sum_sal" column (presumably the per-row total). The age labels are split
# on "_" later to recover the numeric bounds of each range.
df_act_1.columns = [
    "sal_range",
    "21_29", "30_39", "40_49", "50_59",
    "60_69", "70_79", "80_89", "90_100",
    "sum_sal",
]

In [4]:
# Preview the first five rows of the relabelled table
# (head() returns five rows when no count is given)
df_act_1.head()

Unnamed: 0,sal_range,21_29,30_39,40_49,50_59,60_69,70_79,80_89,90_100,sum_sal
0,1000_4999,0.0019,0.0014,0.0013,0.0021,0.0015,0.0,0.0,0.0,0.0083
1,5000_9999,0.0153,0.0062,0.0057,0.0089,0.0065,0.0038,0.0022,0.0,0.0486
2,10000_24999,0.0185,0.0095,0.0087,0.0137,0.01,0.0058,0.0034,0.0,0.0696
3,25000_49999,0.0145,0.0112,0.0102,0.0161,0.0118,0.0068,0.004,0.0,0.0745
4,50000_99999,0.0327,0.0252,0.0231,0.0364,0.0266,0.0153,0.0091,0.0,0.1683


<b>Here I retrieve the age ranges and the corresponding age percentages as arrays.</b>

In [5]:
# Age-range labels: the column labels of the table, excluding the leading
# Balance-range column and the trailing "sum_sal" column
age_ranges = df_act_1.columns[1:-1]
# Percentage of users in each age range, read from the last row of the table
# (first and last columns excluded). The row is addressed as -1 rather than by
# a fixed position so that it stays in step with the generation cell, which
# treats every row except the last one as a Balance range.
age_percents = df_act_1.iloc[-1, 1:-1].to_numpy()
age_ranges

Index(['21_29', '30_39', '40_49', '50_59', '60_69', '70_79', '80_89',
       '90_100'],
      dtype='object')

<b>This is the main portion of the code for random data generation. I start by iterating through ```age_percents```, which gives me the percentage of people in each age range. Then I randomly generate ```1234*age_percents[i]``` (rounded to the nearest integer) ages within the corresponding entry of ```age_ranges```. Balance and Title data are generated in a similar way.</b>

In [6]:
# Number of synthetic active users to generate in total
N_ACTIVE_USERS = 1234
# Fixed seed so that re-running this cell reproduces the same random draws
ACTIVE_SEED = 42
random.seed(ACTIVE_SEED)

# Blank lists that are used to store generated data later
age_list = []
sal_list = []
lev_list = []

# The Balance range labels (every row except the last) and the Title names do
# not depend on the age range, so they are looked up once, outside the loop
sal_ranges = df_act_1.iloc[0:-1, 0].to_numpy()
levels = df_act_2.columns[1:-1]

# Iterate over the list containing Age percentages
for i in range(len(age_percents)):

    # Find the corresponding Age range; labels look like "<floor>_<ceiling>"
    range_age = age_ranges[i].split('_')
    floor_age = int(range_age[0])
    ceiling_age = int(range_age[1])
    # Number of users that fall in this age range
    number_age = int(round(age_percents[i] * N_ACTIVE_USERS))

    # An age range that receives no users contributes nothing. Skipping it
    # here also avoids dividing by a zero age percentage further down.
    if number_age == 0:
        continue

    # Percentages of each Balance range for this specific age range (eg. 21-29)
    sal_percents = df_act_1.iloc[0:-1, i+1].to_numpy()
    # Percentages of each Title for this specific age range
    # NOTE(review): assumes row i of df_act_2 describes the same age range as
    # column i+1 of df_act_1 -- confirm against the CSV files
    lev_percents = df_act_2.iloc[i, 1:-1].to_numpy()

    # Randomly generate ages (#: number_age) within the age range, both bounds
    # included
    age_list.extend(randint(floor_age, ceiling_age) for _ in range(number_age))

    # Balance values generated so far for this age range; used to cap the
    # total at number_age when rounding would otherwise overshoot
    number_sal_in_age_range = 0
    # Iterate over the Balance ranges together with their percentages
    for sal_range, sal_percent in zip(sal_ranges, sal_percents):
        # Find the corresponding Balance range
        range_sal = sal_range.split('_')
        floor_sal = int(range_sal[0])
        ceiling_sal = int(range_sal[1])
        # Number of Balance data points in this particular range: the share of
        # the range within the age group, scaled to the size of the age group.
        # NOTE(review): this rounds to nearest while the Title counts below
        # round up, so the Balance total can fall short of number_age.
        number_sal = int(round(sal_percent / age_percents[i] * number_age))
        # Never generate more than what is still missing for this age range
        number_sal = max(0, min(number_sal, number_age - number_sal_in_age_range))
        sal_list.extend(randint(floor_sal, ceiling_sal) for _ in range(number_sal))
        number_sal_in_age_range += number_sal

    # Titles generated so far for this age range (same capping as above)
    number_lev_in_age_range = 0
    # Iterate over the Titles together with their percentages
    for level, lev_percent in zip(levels, lev_percents):
        # Rounded up, so that the capped total reaches number_age
        number_lev = int(math.ceil(lev_percent / age_percents[i] * number_age))
        # Never generate more than what is still missing for this age range
        number_lev = max(0, min(number_lev, number_age - number_lev_in_age_range))
        # Titles are labels, not random draws: repeat the name number_lev times
        lev_list.extend([level] * number_lev)
        number_lev_in_age_range += number_lev

# The three lists become the columns of one table, so they have to line up
assert len(age_list) == len(sal_list) == len(lev_list), (
    "generated lists differ in length: {} ages, {} balances, {} titles".format(
        len(age_list), len(sal_list), len(lev_list)
    )
)


In [7]:
# Combine the generated lists into one table and export it as a .CSV file.
# IDs run from 1 to the number of generated users; deriving them from the list
# length (instead of a hard-coded 1..1234) keeps the ID column the same length
# as the data columns.
index = list(range(1, len(age_list) + 1))
result_act = pd.DataFrame({
    'ID': index,
    'Age': age_list,
    'Balance': sal_list,
    'Title': lev_list,
})
# The default index=True is kept, so the row index is written as a leading
# unnamed column in addition to 'ID'
result_act.to_csv("active_generated.csv")
# Preview the first rows as the cell output
result_act.head()

<h4> Data Generation for Inactive Users </h4>

<b>The remaining part generates the data for Inactive Users. The explanation is omitted because the rationale is very similar to the one above.</b>

In [8]:
# Load the two pre-processed tables for inactive users:
#   df_inact_1 -- Balance ranges (rows) by type (columns)
#   df_inact_2 -- type (rows) by race (columns)
df_inact_1, df_inact_2 = [
    pd.read_csv(csv_path)
    for csv_path in ('./data/inact_sal_type.csv', './data/inact_race_type.csv')
]

In [9]:
# Show the whole table: the rows are the inactive-user types followed by a
# final "sum_race" row, the columns are the races followed by "sum_type"
df_inact_2

Unnamed: 0,#,Unknown,AA,Hispanic,Caucasian,Asian,sum_type
0,Cash Out,0.2803,0.0096,0.0153,0.0955,0.0353,0.436
1,Keep in Plan,0.1955,0.0067,0.0106,0.0666,0.0246,0.304
2,Transfer to IRA,0.108,0.0037,0.0059,0.0368,0.0136,0.168
3,Transfer to Plan,0.0592,0.002,0.0032,0.0201,0.0075,0.092
4,sum_race,0.643,0.022,0.035,0.219,0.081,1.0


In [10]:
# Sanity check: drop the last row and the first/last columns, then add up the
# remaining race percentages of each type. The row sums should agree with the
# per-type totals used for the generation below.
a = df_inact_2.iloc[:-1, 1:-1].to_numpy()
np.sum(a, axis=1)

array([0.436, 0.304, 0.168, 0.092])

In [11]:
# Type names: the column labels of the table, excluding the first and the
# last column
type_names = df_inact_1.columns[1:-1]
# Percentage of users of each type, read from the last row of the table
# (first and last columns excluded). The row is addressed as -1 rather than by
# a fixed position so that it stays in step with the generation cell, which
# treats every row except the last one as a Balance range.
type_percents = df_inact_1.iloc[-1, 1:-1].to_numpy()
type_percents

array([0.436, 0.304, 0.168, 0.092], dtype=object)

In [12]:
# Number of synthetic inactive users to generate in total
N_INACTIVE_USERS = 1234
# Fixed seed so that re-running this cell reproduces the same random draws
INACTIVE_SEED = 43
random.seed(INACTIVE_SEED)

# Blank lists that are used to store generated data later.
# NOTE: sal_list is the same name the active-user section uses, so from here
# on it holds the inactive balances; the active export has to run before this.
type_list = []
sal_list = []
race_list = []

# The Balance range labels (every row except the last) and the race names do
# not depend on the type, so they are looked up once, outside the loop
sal_ranges = df_inact_1.iloc[0:-1, 0].to_numpy()
race_names = df_inact_2.columns[1:-1]

# Iterate over the list containing type percentages
for i in range(len(type_percents)):
    # Number of users of this type
    number_type = int(round(type_percents[i] * N_INACTIVE_USERS))

    # A type that receives no users contributes nothing. Skipping it here
    # also avoids dividing by a zero type percentage further down.
    if number_type == 0:
        continue

    # Types are labels, not random draws: repeat the name number_type times
    type_list.extend([type_names[i]] * number_type)

    # about Balance: percentages of each Balance range for this type
    sal_percents = df_inact_1.iloc[0:-1, i+1].to_numpy()

    # Balance values generated so far for this type; used to cap the total at
    # number_type, because rounding up every range would otherwise overshoot
    number_sal_in_type_range = 0
    for sal_range, sal_percent in zip(sal_ranges, sal_percents):
        # Balance labels look like "<floor>_<ceiling>"
        range_sal = sal_range.split('_')
        floor_sal = int(range_sal[0])
        ceiling_sal = int(range_sal[1])
        # Share of the range within the type, scaled to the size of the type
        # and rounded up
        number_sal = int(math.ceil(sal_percent / type_percents[i] * number_type))
        # Never generate more than what is still missing for this type
        number_sal = max(0, min(number_sal, number_type - number_sal_in_type_range))
        sal_list.extend(randint(floor_sal, ceiling_sal) for _ in range(number_sal))
        number_sal_in_type_range += number_sal

    # about race: percentages of each race for this type
    # NOTE(review): assumes row i of df_inact_2 describes the same type as
    # column i+1 of df_inact_1 -- confirm against the CSV files
    race_percents = df_inact_2.iloc[i, 1:-1].to_numpy()

    # Races generated so far for this type (same capping as above)
    number_race_in_type_range = 0
    for race, race_percent in zip(race_names, race_percents):
        number_race = int(math.ceil(race_percent / type_percents[i] * number_type))
        # Never generate more than what is still missing for this type
        number_race = max(0, min(number_race, number_type - number_race_in_type_range))
        race_list.extend([race] * number_race)
        number_race_in_type_range += number_race

# The three lists become the columns of one table, so they have to line up
assert len(type_list) == len(sal_list) == len(race_list), (
    "generated lists differ in length: {} types, {} balances, {} races".format(
        len(type_list), len(sal_list), len(race_list)
    )
)
    

In [13]:
# Combine the generated lists into one table and export it as a .CSV file.
# IDs run from 1 to the number of generated users; deriving them from the list
# length (instead of a hard-coded 1..1234) keeps the ID column the same length
# as the data columns.
index = list(range(1, len(type_list) + 1))
result_inact = pd.DataFrame({
    'ID': index,
    'Category': type_list,
    'Balance': sal_list,
    'Race': race_list,
})
# The default index=True is kept, so the row index is written as a leading
# unnamed column in addition to 'ID'
result_inact.to_csv("inactive_generated.csv")
# Preview the first rows as the cell output
result_inact.head()