In [2]:
import numpy as np
import pandas as pd
import random
import string
import lightgbm as lgb

In [2]:
''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(10))

'K86YZDQX8J'

In [5]:
def assign_strategy_based_on_conditions(row, conditions, strategy, strategies):
    if all(condition(row) for condition in conditions):
        return strategy if random.random() < 0.9 else random.choice([s for s in strategies if s != strategy])
    return None

def assign_strategy(row):
    """
    Assigns an investment strategy based on investor profile.
    This is a simplified example and should be more nuanced in practice.
    """
    # Define the thresholds for the numeric variables
    age_threshold = 50
    net_worth_threshold = 300000

    # Define the strategies
    strategies = ['Aggressive', 'Conservative', 'Growth-oriented', 'Growth', 'Income', 'Preservation', 'Balanced']

    strategy_conditions = [
        {
            'strategy': 'Aggressive',
            'conditions': [
                lambda row: row['Risk_Tolerance'] == 'High',
                lambda row: row['Financial_Knowledge'] == 'High'
            ]
        },
        {
            'strategy': 'Conservative',
            'conditions': [
                lambda row: row['Risk_Tolerance'] == 'Low',
                lambda row: row['Time_Horizon'] == 'Short-term'
            ]
        },
        {
            'strategy': 'Growth-oriented',
            'conditions': [
                lambda row: row['Income_Level'] == 'High',
                lambda row: row['Credit_Score'] > 750
            ]
        },
        {
            'strategy': 'Growth',
            'conditions': [
                lambda row: row['Age'] > age_threshold,
                lambda row: row['Net_Worth'] > net_worth_threshold
            ]
        },
        {
            'strategy': 'Income',
            'conditions': [
                lambda row: row['Age'] <= age_threshold,
                lambda row: row['Net_Worth'] > net_worth_threshold
            ]
        },
        {
            'strategy': 'Preservation',
            'conditions': [
                lambda row: row['Age'] > age_threshold,
                lambda row: row['Net_Worth'] <= net_worth_threshold
            ]
        }
    ]

    for strategy_condition in strategy_conditions:
        strategy = assign_strategy_based_on_conditions(row, strategy_condition['conditions'], strategy_condition['strategy'], strategies)
        if strategy is not None:
            return strategy

    return 'Balanced' if random.random() < 0.9 else random.choice([s for s in strategies if s != 'Balanced'])

In [18]:
def generate_synthetic_data(n_samples=10000):
    """
    Generates a synthetic dataset of investor profiles and their corresponding investment strategies.
    """
    np.random.seed(0)
    data = pd.DataFrame({
        'Age': np.random.randint(18, 70, n_samples),
        'Income_Level': np.random.choice(['Low', 'Medium', 'High'], n_samples),
        'Credit_Score': np.random.normal(700, 50, n_samples).astype(int),
        'Investment_Experience': np.random.choice(['Novice', 'Intermediate', 'Experienced'], n_samples),
        'Financial_Knowledge': np.random.choice(['Low', 'Medium', 'High'], n_samples),
        'Risk_Tolerance': np.random.choice(['Low', 'Medium', 'High'], n_samples),
        'Investment_Goals': np.random.choice(['Preservation', 'Income', 'Growth', 'Speculation'], n_samples),
        'Time_Horizon': np.random.choice(['Short-term', 'Medium-term', 'Long-term'], n_samples),
        # New numeric columns
        'Total_Assets': np.random.normal(500000, 150000, n_samples).astype(int),
        'Total_Liabilities': np.random.normal(200000, 100000, n_samples).astype(int),
        'Number_of_Dependents': np.random.randint(0, 5, n_samples),
        'Years_of_Investing': np.random.randint(0, 40, n_samples)
    })

    # Calculating Net Worth
    data['Net_Worth'] = data['Total_Assets'] - data['Total_Liabilities']

    # Assigning Investment Strategies
    data['Investment_Strategy'] = data.apply(assign_strategy, axis=1)

    return data

def encode_data(data):
    """
    Encodes categorical variables in the dataset using one-hot encoding.
    """
    for column in data.columns:
        if data[column].dtype == 'object' and column != "Investment_Strategy" :
            # Perform one-hot encoding and drop the original column
            dummies = pd.get_dummies(data[column], prefix=column)
            data = pd.concat([data, dummies], axis=1)
            data = data.drop(column, axis=1)
    return data

# Generate and encode the synthetic dataset
synthetic_data = generate_synthetic_data()
encoded_data = encode_data(synthetic_data)

synthetic_data.head()  # Display the first few rows of the encoded dataset



Unnamed: 0,Age,Income_Level,Credit_Score,Investment_Experience,Financial_Knowledge,Risk_Tolerance,Investment_Goals,Time_Horizon,Total_Assets,Total_Liabilities,Number_of_Dependents,Years_of_Investing,Net_Worth,Investment_Strategy
0,62,Medium,716,Experienced,High,Low,Income,Medium-term,411537,310754,3,34,100783,Preservation
1,65,Low,631,Experienced,Medium,Medium,Speculation,Short-term,459601,147518,3,13,312083,Growth-oriented
2,18,High,740,Intermediate,High,Medium,Preservation,Short-term,485092,256569,0,33,228523,Balanced
3,21,Low,733,Intermediate,Low,High,Income,Short-term,529798,85701,1,17,444097,Income
4,21,Medium,738,Novice,Medium,Low,Growth,Medium-term,665016,75881,1,29,589135,Income


In [20]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import lightgbm as lgb

# Define X (features) and y (target)
X = encoded_data.drop('Investment_Strategy', axis=1)
y = encoded_data['Investment_Strategy']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the LightGBM model
model = lgb.LGBMClassifier()

# Fit the model to the training data
model.fit(X_train, y_train)

# Predict the risk profile on the test data
y_pred = model.predict(X_test)

# Calculate and print the accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000255 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1153
[LightGBM] [Info] Number of data points in the train set: 8000, number of used features: 26
[LightGBM] [Info] Start training from score -2.177054
[LightGBM] [Info] Start training from score -1.975983
[LightGBM] [Info] Start training from score -2.167180
[LightGBM] [Info] Start training from score -2.469526
[LightGBM] [Info] Start training from score -1.142172
[LightGBM] [Info] Start training from score -1.962548
[LightGBM] [Info] Start training from score -2.416314
Accuracy:  0.905


In [4]:
df = pd.read_csv("/workspaces/robo_advisor/data/raw/synthetic_data.csv")

for col in df.columns:
    if df[col].dtype == "float64" or df[col].dtype == "int64":
        print(col)

Age
Credit_Score
Total_Assets
Total_Liabilities
Number_of_Dependents
Years_of_Investing
Net_Worth
