In [None]:
import os

import h2o
import numpy as np
import pandas as pd
import psycopg2
from h2o.automl import H2OAutoML
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [None]:
def query_into_df(query, cursor):
    """

    Description: Executes a SQL query and returns its result set as a pandas DataFrame.

    Parameters:

    query (str): The SQL statement to execute;
    cursor (database cursor): An open cursor used to run the query.

    Returns:

    pandas.DataFrame: One row per fetched record; the column labels are the
    first element of each entry in cursor.description.

    """
    cursor.execute(query)
    rows = cursor.fetchall()

    # Each description entry describes one result column; its first item is the name.
    headers = []
    for column_meta in cursor.description:
        headers.append(column_meta[0])

    return pd.DataFrame(rows, columns=headers)

In [None]:
def add_suffix_to_duplicates(lst):
    """
    Description: Returns a copy of a list of names in which every repeated name
    is made unique by appending "_dupplicated_<n>".

    The first occurrence of a name is kept unchanged; the k-th repetition gets
    the suffix with n = k. If a generated name is already present in the output
    (for example the input itself contains "a_dupplicated_1"), the counter keeps
    increasing until an unused name is found, so the returned names are always
    unique.

    Parameters:
        lst (list): A list of strings (e.g. column names).

    Returns:
        list: A new list with the same length and order as lst.
    """
    # The spelling "dupplicated" is kept on purpose: other cells of this
    # notebook refer to generated names such as "date_dupplicated_1".
    frequency = {}
    used_names = set()
    result = []

    for item in lst:
        if item in frequency:
            frequency[item] += 1
            candidate = item + "_dupplicated_" + str(frequency[item])
        else:
            frequency[item] = 0
            candidate = item

        # Bump the counter while the candidate clashes with a name already emitted.
        while candidate in used_names:
            frequency[item] += 1
            candidate = item + "_dupplicated_" + str(frequency[item])

        used_names.add(candidate)
        result.append(candidate)

    return result

# Example usage: repeated names receive a numbered "_dupplicated_<n>" suffix, e.g.
# ['apple', 'orange', 'banana', 'apple_dupplicated_1', 'apple_dupplicated_2', 'banana_dupplicated_1']
my_list = ['apple', 'orange', 'banana', 'apple', 'apple', 'banana']
result_list = add_suffix_to_duplicates(my_list)
print(result_list)


In [None]:
def query_into_h2o(query, cursor):
    """

    Description: Executes a SQL query and loads its result set into an H2OFrame.

    Parameters:

    query (str): The SQL statement to execute;
    cursor (database cursor): An open cursor used to run the query.

    Returns:

    The frame built by h2o.H2OFrame from the fetched rows. Column names come
    from cursor.description and are passed through add_suffix_to_duplicates,
    so repeated names (e.g. from a join) receive a suffix.

    """
    cursor.execute(query)
    rows = cursor.fetchall()

    # Each description entry describes one result column; its first item is the name.
    raw_names = []
    for column_meta in cursor.description:
        raw_names.append(column_meta[0])
    unique_names = add_suffix_to_duplicates(raw_names)

    return h2o.H2OFrame(rows, column_names=unique_names)

In [None]:
def data_split(split_time, df, drop_from_features, target, date_col="date"):
    """
    Description: Splits data into training and test sets around a split date.

    Rows whose date is strictly before the split date go to the training set;
    rows on or after it go to the test set.

    Parameters:
        split_time (str): The split date, in a format accepted by pandas.to_datetime (e.g. Y-m-d);
        df (h2o.H2OFrame): The frame containing the data to be split;
        drop_from_features (list): Column names to leave out of the feature list. The target
            is only excluded if it is listed here;
        target (str): The name of the target variable;
        date_col (str): The name of the column holding the dates. Defaults to "date".

    Returns:
        train (h2o.H2OFrame): The rows dated before split_time;
        test (h2o.H2OFrame): The rows dated on or after split_time;
        features (list): The columns of train that are not in drop_from_features;
        target (str): The target name, returned unchanged.

    """

    split_time = pd.to_datetime(split_time)

    train = df[df[date_col] < split_time]
    test = df[df[date_col] >= split_time]

    # Keep the frame's column order; only membership in drop_from_features matters.
    features = [feature_col for feature_col in train.columns if feature_col not in drop_from_features]

    return train, test, features, target

In [None]:
# exporting models metrics and results

def calculate_metrics(actual_frame, predicted_frame):
    """
    Compute RMSE, MAE and R-squared between actual and predicted values.

    Both inputs are converted with as_data_frame() and flattened to
    one-dimensional arrays before the metrics are evaluated.

    Parameters:
        actual_frame (H2OFrame): The actual target values;
        predicted_frame (H2OFrame): The predicted target values.

    Returns:
        dict: The metrics under the keys "RMSE", "MAE" and "R-squared".

    """
    y_true = actual_frame.as_data_frame().values.flatten()
    y_pred = predicted_frame.as_data_frame().values.flatten()

    metrics = {}
    # RMSE is the square root of the mean squared error.
    metrics["RMSE"] = np.sqrt(mean_squared_error(y_true, y_pred))
    metrics["MAE"] = mean_absolute_error(y_true, y_pred)
    metrics["R-squared"] = r2_score(y_true, y_pred)

    return metrics

In [None]:
def display_metrics(models, actual_values, predicted_values):
    """
    Build a table with the performance metrics of each model.

    Every model is scored against the same actual_values frame. Names and
    predictions are paired by position; if the two lists differ in length,
    the surplus entries of the longer one are ignored.

    Parameters:
    models (list): A list of model names.
    actual_values (H2OFrame): H2OFrame with the actual target values.
    predicted_values (list): A list of H2OFrames with the predicted target values for each model.

    Returns:
    pandas.DataFrame: One row per model name, one column per metric returned by calculate_metrics.
    """
    per_model = {
        model_name: calculate_metrics(actual_values, predicted)
        for model_name, predicted in zip(models, predicted_values)
    }

    return pd.DataFrame.from_dict(per_model, orient='index')

In [None]:
def generate_quarterly_series(start_date, end_date):
    """
    Description: Builds a series of quarter-start dates lying between two dates.

    Parameters:
        start_date (str): The first date of the range, in the format 'YYYY-MM-DD'.
        end_date (str): The last date of the range, in the format 'YYYY-MM-DD'.

    Returns:
        pandas.Series: The dates produced by pandas.date_range with freq='QS',
        indexed from 0.
    """
    # 'QS' is the quarter-start frequency; wrapping the index in a Series gives a 0..n-1 index.
    return pd.Series(pd.date_range(start=start_date, end=end_date, freq='QS'))

In [None]:
# Built with os.path.join so the separator matches the host OS (the value is unchanged on Windows).
# NOTE(review): presumably the folder where results are exported — no visible cell reads it; confirm.
RESULTS_FOLDER = os.path.join("alternative_economic_data", "data", "ml_data")

In [None]:
# Set up the H2O session that the H2OFrame and H2OAutoML calls in the following cells rely on.
h2o.init()

In [None]:
# Connection settings are read from the environment (PGHOST, PGDATABASE,
# PGUSER, PGPASSWORD) so that no credentials are stored in the notebook.
# Host, database and user fall back to the project's defaults; the password
# has no fallback and must be provided.
db_password = os.environ.get("PGPASSWORD")
if not db_password:
    raise RuntimeError("Set the PGPASSWORD environment variable before running this cell.")

try:
    db = psycopg2.connect(
        host=os.environ.get("PGHOST", "aws-california.caclwjj7hnoo.us-east-2.rds.amazonaws.com"),
        database=os.environ.get("PGDATABASE", "california"),
        user=os.environ.get("PGUSER", "postgres"),
        password=db_password,
    )
    cursor = db.cursor()
    print("Connected to the database!")

except psycopg2.Error as e:
    print("Error connecting to the database:", e)
    # Re-raise: the following cells need `db` and `cursor`, so continuing
    # would only fail later with a less helpful NameError.
    raise


In [None]:
# Traditional indicators only.
query_trad_data = "SELECT trad.* " \
                  "FROM traditional_data AS trad"

# Alternative indicators plus the GDP target, matched on date.
query_alt_data = "SELECT alt.*, trad.gdp " \
                 "FROM traditional_data AS trad " \
                 "JOIN alternative_data AS alt ON trad.date = alt.date"

# Both sets of indicators; dates present in only one table are kept.
query_all_data = "SELECT trad.*, alt.* " \
                 "FROM traditional_data AS trad " \
                 "FULL OUTER JOIN alternative_data AS alt " \
                 "ON trad.date = alt.date"

# Close the connection even if a query or an H2O upload fails.
try:
    traditional_data = query_into_h2o(query_trad_data, cursor)
    alternative_data = query_into_h2o(query_alt_data, cursor)
    all_data = query_into_h2o(query_all_data, cursor)
finally:
    db.close() # don't want to kill your finances in aws haha

# Both tables expose a "date" column, so query_into_h2o renames the second
# one to "date_dupplicated_1"; drop it.
all_data = all_data.drop("date_dupplicated_1")

In [None]:
import sys

In [None]:
# Sort each frame by its 'date' column.

traditional_data, alternative_data, all_data = (
    frame.sort(by='date')
    for frame in (traditional_data, alternative_data, all_data)
)


<br>

### Predicting Expenditure and Disposable Income

<br>

Expenditure:

- Remove `gdp` and `disposable_income` from the features

Disposable income:

- Remove `gdp` and `expenditure` from the features

Create train and test data based on split time and columns to drop

In [30]:
from pympler import asizeof, tracker

In [31]:
# pympler summary tracker. NOTE(review): no visible cell uses it afterwards —
# presumably left over from memory profiling; confirm before removing.
mem_tracker = tracker.SummaryTracker()

In [32]:
## Expenditure data

split_date = "2019-01-01"
drop_from_expend = ["gdp", "disposable_income", "expenditure"]

# Train/test split of the traditional data, with expenditure as the target
train_expend, test_expend, features_expend, expend = data_split(
    split_time=split_date,
    df=traditional_data,
    drop_from_features=drop_from_expend,
    target="expenditure",
)

# Fit AutoML on the training rows (60 second budget)
expendt_model = H2OAutoML(max_runtime_secs=60)
expendt_model.train(training_frame=train_expend, x=features_expend, y=expend)

# Predict expenditure for the test rows; the model object is discarded afterwards
predictions_expend = expendt_model.predict(test_expend[features_expend])
del expendt_model

AutoML progress: |
13:59:29.912: AutoML: XGBoost is not available; skipping it.


13:59:30.266: _min_rows param, The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 56.0.

███████████████████████████████████████████████████████████████| (done) 100%
gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%
                               types |   # objects |   total size
                                list |       23479 |      1.98 MB
                                 str |       25230 |      1.81 MB
  h2o.backend.connection.H2OResponse |         677 |    192.36 KB
                                 int |        5704 |    156.90 KB
                               tuple |         418 |     42.76 KB
                                dict |          80 |     16.61 KB
                                type |          14 |      5.58 KB
                                code |          -4 |      4.31 KB

In [None]:
## Disposable Income

drop_from_disp = ["gdp", "disposable_income", "expenditure"]

# Train/test split of the traditional data, with disposable income as the target
train_disp, test_disp, features_disp, disp = data_split(
    split_time=split_date,
    df=traditional_data,
    drop_from_features=drop_from_disp,
    target="disposable_income",
)

# Fit AutoML on the training rows (10 second budget)
disp_model = H2OAutoML(max_runtime_secs=10)
disp_model.train(training_frame=train_disp, x=features_disp, y=disp)

# Predict disposable income for the test rows; the model object is discarded afterwards
predictions_disp = disp_model.predict(test_disp[features_disp])
del disp_model

In [None]:
## Traditional Data


drop_from_trad = ["gdp"]

train_trad, test_trad, features_trad, gdp = data_split(split_date, traditional_data, drop_from_trad, "gdp")

# Replacing the expenditure and disposable income columns of the test set
# with the values predicted by the two models above

test_trad["expenditure"] = predictions_expend
test_trad["disposable_income"] = predictions_disp

# Initializing and training model

trad_model = H2OAutoML(max_runtime_secs=10)
trad_model.train(x=features_trad, y=gdp, training_frame=train_trad)

# Predictions

predictions_trad = trad_model.predict(test_trad[features_trad])

# Discard this cell's model. (disp_model was already deleted in its own cell,
# so deleting it again here would raise a NameError.)
del trad_model

In [None]:
## Alternative Data

drop_from_alt = ["gdp"]

# Train/test split of the alternative data, with GDP as the target
train_alt, test_alt, features_alt, gdp = data_split(
    split_time=split_date,
    df=alternative_data,
    drop_from_features=drop_from_alt,
    target="gdp",
)

# Fit AutoML on the training rows (10 second budget)
alt_model = H2OAutoML(max_runtime_secs=10)
alt_model.train(training_frame=train_alt, x=features_alt, y=gdp)

# Predict GDP for the test rows
predictions_alt = alt_model.predict(test_alt[features_alt])

In [None]:
## All Data

drop_from_all = ["gdp"]

# Train/test split of the combined data, with GDP as the target
train_all, test_all, features_all, gdp = data_split(
    split_time=split_date,
    df=all_data,
    drop_from_features=drop_from_all,
    target="gdp",
)

# Overwrite expenditure and disposable income in the test set with the
# values predicted by the two dedicated models
test_all["expenditure"] = predictions_expend
test_all["disposable_income"] = predictions_disp

# Fit AutoML on the training rows (10 second budget)
all_model = H2OAutoML(max_runtime_secs=10)
all_model.train(training_frame=train_all, x=features_all, y=gdp)

# Predict GDP for the test rows
predictions_all = all_model.predict(test_all[features_all])

In [None]:
# Observed GDP used as the reference for every model.
# NOTE(review): taken from test_all only; presumably test_trad and test_alt
# cover the same rows in the same order — confirm.
actual_gdp = test_all["gdp"]

# Model names and their predictions, paired by position.
models = ["trad_model", "alt_model", "all_model"]
predictions = [predictions_trad, predictions_alt, predictions_all]

models_metrics = display_metrics(
    models=models,
    actual_values=actual_gdp,
    predicted_values=predictions,
)

In [None]:
# One entry per output column: quarter dates, observed GDP and each model's predictions.
flattened_dict = {
    "date": generate_quarterly_series(split_date, "2022-10-01"),
    "real_gdp": test_all["gdp"].as_data_frame(),
    "trad_predictions": predictions_trad.as_data_frame(),
    "alt_predictions": predictions_alt.as_data_frame(),
    "all_predictions": predictions_all.as_data_frame(),
}

# Place the pieces side by side, labelling each one with its dictionary key.
prediction_results = pd.concat(
    list(flattened_dict.values()),
    axis=1,
    keys=list(flattened_dict.keys()),
)

# Keep only the first level of the column labels (the dictionary keys).
prediction_results.columns = prediction_results.columns.get_level_values(0)

In [None]:
# Output paths are built with os.path.join: the previous literals relied on
# backslashes inside ordinary strings ("\d", "\m", "\p" are invalid escape
# sequences) and only worked on Windows.
prediction_file = os.path.join("..", "data", "ml_data", "predictions.csv")
metrics_file = os.path.join("..", "data", "ml_data", "models_metrics.csv")

# The predictions table has no meaningful index; the metrics table is indexed by model name.
prediction_results.to_csv(prediction_file, index=False)
models_metrics.to_csv(metrics_file)