# 1. Generating Synthetic Expense Data


In [33]:
import pandas as pd
import numpy as np
import random

# Expense categories that the synthetic records are drawn from
categories = [
    'Salaries',
    'Marketing & Advertising',
    'Office Supplies & Equipment',
    'Entertainment',
    'Rent & Lease',
    'Insurance',
    'Professional Services',
    'Software & Subscriptions',
    'Taxes & Regulatory Fees',
    'Maintenance & Repairs',
]


def generate_expense_data(n_records=1000, seed=None):
    """Generate a DataFrame of synthetic expense records dated within 2023.

    Each attempt picks a random date and category, draws an expense amount
    bounded by that category's remaining budget, and deducts it from both the
    running total budget and the category budget. The budgets stored in a row
    are the values *after* that row's expense has been deducted.

    Args:
        n_records: Number of expense draws to attempt. An attempt whose
            category has less than 500 left is skipped, so the result may
            contain fewer than ``n_records`` rows.
        seed: Optional seed for reproducible output. When ``None`` (the
            default) the module-level ``random`` generator is used, so a
            prior ``random.seed(...)`` call is still honoured.

    Returns:
        A DataFrame with columns ``Date``, ``Category``, ``Amount``,
        ``Total_Budget`` and ``Category_Allocated_Budget``.
    """
    rng = random if seed is None else random.Random(seed)

    start_date = pd.Timestamp('2023-01-01')
    # Number of valid day offsets inside the year (365 for 2023).
    days_in_year = (pd.Timestamp('2024-01-01') - start_date).days

    total_budget = 1000000  # Start with a total budget of $1,000,000
    category_budgets = {category: rng.randint(50000, 200000) for category in categories}

    data = []
    for _ in range(n_records):
        # randrange excludes its upper bound, so every date stays inside 2023;
        # an inclusive 0..365 draw could land on 2024-01-01.
        date = start_date + pd.Timedelta(days=rng.randrange(days_in_year))
        category = rng.choice(categories)
        allocated_budget = category_budgets[category]

        # Below 500 the upper bound (budget / 5) would drop under the lower
        # bound of 100 and make the randint range invalid, so skip the draw.
        if allocated_budget < 500:
            continue

        # Cap a single expense at a fifth of the remaining category budget,
        # and never above 10,000.
        amount = rng.randint(100, min(allocated_budget // 5, 10000))

        # Deduct the expense, clamping both budgets at zero.
        total_budget = max(total_budget - amount, 0)
        category_budgets[category] = max(allocated_budget - amount, 0)

        data.append([date, category, amount, total_budget, category_budgets[category]])

    df = pd.DataFrame(data, columns=['Date', 'Category', 'Amount', 'Total_Budget', 'Category_Allocated_Budget'])

    return df

# Build the synthetic dataset from 1,000 attempted expense draws
expense_data = generate_expense_data(n_records=1000)

# Preview the first five rows as a quick sanity check
print(expense_data.head(n=5))


        Date                 Category  Amount  Total_Budget  \
0 2023-09-09  Taxes & Regulatory Fees    4539        995461   
1 2023-05-18                 Salaries    7522        987939   
2 2023-05-12                Insurance    2458        985481   
3 2023-03-26             Rent & Lease    6702        978779   
4 2023-06-30    Professional Services     493        978286   

   Category_Allocated_Budget  
0                     114420  
1                     110110  
2                     106106  
3                      44678  
4                      94724  


# 2. Data Processing


In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Assumes `expense_data` has already been generated by the previous cell.

# Target variable: the expense amount.
# NOTE(review): Total_Budget and Category_Allocated_Budget are recorded after
# the row's expense has been deducted, so they carry information about the
# target; confirm this is intended before trusting the model's scores.
y = expense_data['Amount']

# Build the feature frame in one step. `.assign` returns a new DataFrame, so
# the derived date columns are not written into a slice of `expense_data`
# (which would trigger pandas' SettingWithCopyWarning). The raw Date column is
# left out because only its calendar parts are used as features.
dates = expense_data['Date']
X = expense_data[['Category', 'Total_Budget', 'Category_Allocated_Budget']].assign(
    Month=dates.dt.month,
    Year=dates.dt.year,
    Weekday=dates.dt.weekday,
    Quarter=dates.dt.quarter,
)

# Calendar parts are treated as categories (one-hot encoded), budgets stay numeric
categorical_features = ['Category', 'Month', 'Year', 'Weekday', 'Quarter']
numeric_features = ['Total_Budget', 'Category_Allocated_Budget']

# sparse_threshold=0 makes the transformer always return a dense array.
# With the default threshold the output is sparse only while its density is
# low enough, so an unconditional `.toarray()` call could fail.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_features),
        ('cat', OneHotEncoder(), categorical_features),
    ],
    sparse_threshold=0,
)

# Transform the data
X_transformed = preprocessor.fit_transform(X)

# Rebuild a labelled DataFrame: numeric columns come first, then the one-hot
# columns, matching the order of the transformers above. Reusing X's index
# keeps the rows aligned with `y`.
encoded_columns = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_features)
columns = list(numeric_features) + list(encoded_columns)
X_transformed_df = pd.DataFrame(X_transformed, columns=columns, index=X.index)

# Split the dataset into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X_transformed_df, y, test_size=0.2, random_state=42)


# 3. Model Training

In [39]:
import numpy as np
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV

# An earlier RandomForestRegressor baseline (n_estimators=100) was replaced
# by the tuned AdaBoost model below.

# Initialize the AdaBoost model
ada_model = AdaBoostRegressor(random_state=42)

# Hyperparameters to tune (3 x 3 x 3 = 27 possible combinations)
param_dist = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 1.0],
    'loss': ['linear', 'square', 'exponential'],
}

# Randomized search: evaluates 20 sampled combinations with 5-fold
# cross-validation, using all available cores.
random_search = RandomizedSearchCV(
    ada_model,
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    random_state=42,
    n_jobs=-1,
)

# Fit the search on the training data
random_search.fit(X_train, y_train)

# Check the best hyperparameters
print("Best hyperparameters:", random_search.best_params_)

# Evaluate the best estimator on the held-out test data
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

# Measure performance. RMSE is taken as the square root of the MSE because
# the `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6.
print("R-squared:", r2_score(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))


Best hyperparameters: {'n_estimators': 50, 'loss': 'square', 'learning_rate': 0.1}
R-squared: 0.5230718823858396
RMSE: 1828.6154963372387


# 4. Model Evaluation

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Predict with the tuned estimator selected by the randomized search.
# The name `model` is only assigned in commented-out code, so calling
# `model.predict` here would raise NameError; `best_model` is the estimator
# the training cell actually defines.
y_pred = best_model.predict(X_test)

# Evaluate the model on the held-out test set
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is in the same units as the expense amounts
r2 = r2_score(y_test, y_pred)

# Print the evaluation metrics
print(f'Mean Absolute Error (MAE): {mae}')
print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'R-squared (R²): {r2}')
