### Process the Data

In [18]:
import pandas as pd
import numpy as np

hrs = pd.read_csv("life_expectancy_CleanedHRSdata.csv")
pd.set_option('display.max_columns', None)

### These are functions from the raw_data_processing.ipynb
def standardize_dataframe(df, numerical_columns, exclude_columns):
    """
    A function to standardize specified numerical values in a DataFrame using z-score normalization,
    excluding specified columns.
    
    Parameters:
        df (pandas DataFrame): The input DataFrame.
        numerical_columns (list): A list of numerical column names to standardize.
        exclude_columns (list): A list of column names to exclude from standardization.
        
    Returns:
        pandas DataFrame: A DataFrame with standardized numerical values.
    """
    # Exclude columns specified in exclude_columns
    columns_to_standardize = [col for col in numerical_columns if col not in exclude_columns]
    
    # Initialize the StandardScaler
    scaler = StandardScaler()
    
    # Fit the scaler to the selected columns and transform the values
    standardized_values = scaler.fit_transform(df[columns_to_standardize])
    
    # Create a new DataFrame with the standardized values and the same index and columns as the original DataFrame
    standardized_df = pd.DataFrame(standardized_values, index=df.index, columns=columns_to_standardize)
    
    # Combine the standardized numerical columns with non-numerical columns from the original DataFrame
    for col in df.columns:
        if col not in columns_to_standardize:
            standardized_df[col] = df[col]
    
    return standardized_df

def encode_categorical(df, categorical_vars):
    """
    A function to perform one-hot encoding for categorical variables in a DataFrame.
    
    Parameters:
        df (pandas DataFrame): The input DataFrame.
        categorical_columns (list): A list of column names containing categorical variables to be one-hot encoded.
        
    Returns:
        pandas DataFrame: A DataFrame with one-hot encoded categorical variables.
    """
    print(f"encodable categorical vars: {categorical_vars}")
    
    # Convert numeric categorical variables to categorical type
    for col in categorical_vars:
        df[col] = df[col].astype('category')
        
    # Extract categorical variables
    categorical_df = df[categorical_vars]
    
    # Perform one-hot encoding for categorical variables, drop first ensures there is no multicolinearity
    result = pd.get_dummies(categorical_df, dtype=float, drop_first=True)
    
    return result

def replace_encoded_categorical(df, encoded_categorical_df, categorical_columns):
    """
    A function to replace original categorical columns in a DataFrame with one-hot encoded columns.
    
    Parameters:
        df (pandas DataFrame): The original DataFrame.
        encoded_categorical_df (pandas DataFrame): The DataFrame with one-hot encoded categorical variables.
        categorical_columns (list): A list of column names containing original categorical variables.
        
    Returns:
        pandas DataFrame: A DataFrame with original categorical columns replaced by one-hot encoded columns.
    """
    # Drop original categorical columns from the original DataFrame
    df = df.drop(columns=categorical_columns)
    
    # Concatenate the original DataFrame with the one-hot encoded DataFrame
    df = pd.concat([df, encoded_categorical_df], axis=1)
    return df
#columns_to_drop = ['Unnamed: 0', 'nt','n2','dage_y', 'rarelig']
columns_to_drop = ['Unnamed: 0', 'nt','n2','dage_y', 'rarelig']
hrs_cleaned = hrs.drop(columns=columns_to_drop, errors='ignore')
    # Replace empty strings with NaN to handle both empty strings and NaN values uniformly
hrs_cleaned = hrs_cleaned.replace('', np.nan)
    
    # Drop rows with any NaN values
hrs_cleaned = hrs_cleaned.dropna()
def random_sample_per_group(df, group_col):
    """
    Randomly selects one observation per group from a DataFrame.

    Parameters:
    df (pd.DataFrame): The DataFrame to sample from.
    group_col (str): The name of the column to group by.

    Returns:
    pd.DataFrame: A DataFrame with one randomly selected row per group.
    """
    # Group by the specified column and apply the sampling
    return df.groupby(group_col).apply(lambda x: x.sample(1)).reset_index(drop=True)

def is_categorical(df):
    """
    Determines which columns in a DataFrame are categorical based on data type being a string.
    
    Parameters:
        df (pandas DataFrame): The DataFrame to check.
        
    Returns:
        list: A list of column names that are considered categorical because they are of string type.
    """
    categorical_vars = [col for col in df.columns if df[col].dtype == 'object']
    return categorical_vars

# Determine categorical columns
categorical_columns = is_categorical(hrs_cleaned)  # Adjust threshold as necessary
print("Categorical columns:", categorical_columns)
# To remove columns
categorical_columns = [col for col in categorical_columns if col not in ['raedyrs']]
# Encode categorical variables
encoded_df = encode_categorical(hrs_cleaned.copy(), categorical_columns)

# Replace original categorical columns with encoded ones
final_df = replace_encoded_categorical(hrs_cleaned, encoded_df, categorical_columns)

# Convert all values to integers, setting '17.17+ yrs' specifically to 17
final_df['raedyrs'] = final_df['raedyrs'].replace('17.17+ yrs', '17').astype(int)

# Create a binary indicator for whether the education years are 17 and above
final_df['raedyrs_17plus'] = (final_df['raedyrs'] >= 17).astype(int)

# Check to make sure each "hhidpn" appears only once
def random_sample_per_group(df, group_col):
    """
    Randomly selects one observation per group from a DataFrame.

    Parameters:
    df (pd.DataFrame): The DataFrame to sample from.
    group_col (str): The name of the column to group by.

    Returns:
    pd.DataFrame: A DataFrame with one randomly selected row per group.
    """
    # Group by the specified column and apply the sampling
    return df.groupby(group_col).apply(lambda x: x.sample(1)).reset_index(drop=True)

# Randomly sample one observation per individual
final_df = random_sample_per_group(final_df, 'hhidpn')

# Check to make sure each "hhidpn" appears only once
def check_unique_hhidpn(sampled_df, group_col):
    """
    Checks if each group identifier appears only once in the DataFrame.

    Parameters:
    sampled_df (pd.DataFrame): The DataFrame to check.
    group_col (str): The name of the column to group by.
    """
    if sampled_df[group_col].duplicated().any():
        print("Some hhidpn values appear more than once.")
    else:
        print("Each hhidpn value appears only once.")

# Perform the check
check_unique_hhidpn(final_df, 'hhidpn')

columns_to_drop = ['hhidpn', 'iwbeg', 'id']
final_df = final_df.drop(columns=columns_to_drop, errors='ignore')


# Shuffle the columns randomly
shuffled_columns = np.random.permutation(final_df.columns)

# Split the columns approximately in half
midpoint = len(shuffled_columns) // 2
first_half_columns = shuffled_columns[:midpoint]
second_half_columns = shuffled_columns[midpoint:]

# Create two new DataFrames based on these split columns
df_first_half = final_df[first_half_columns]
df_second_half = final_df[second_half_columns]
output_dir = 'Coding/Git/Economics_Research/HRS Data Processing'
# Print out the columns to verify
print("First half columns:", df_first_half.columns)
print("Second half columns:", df_second_half.columns)

df_first_half.to_csv(os.path.join(output_dir, 'first_half.csv'), index=False)
df_second_half.to_csv(os.path.join(output_dir, 'second_half.csv'), index=False)

Categorical columns: ['mstat', 'cendiv', 'gender', 'rahispan', 'raracem', 'raedyrs', 'ravetrn', 'shlt', 'shltc', 'depres', 'effort', 'sleepr', 'smokev', 'smoken', 'hibp', 'diab', 'cancr', 'lung', 'heart', 'strok', 'psych', 'arthr', 'slfmem', 'pstmem', 'spcfac', 'puffpos', 'covs', 'hiltc', 'lbrf']
encodable categorical vars: ['mstat', 'cendiv', 'gender', 'rahispan', 'raracem', 'ravetrn', 'shlt', 'shltc', 'depres', 'effort', 'sleepr', 'smokev', 'smoken', 'hibp', 'diab', 'cancr', 'lung', 'heart', 'strok', 'psych', 'arthr', 'slfmem', 'pstmem', 'spcfac', 'puffpos', 'covs', 'hiltc', 'lbrf']
Each hhidpn value appears only once.
First half columns: Index(['raedyrs', 'mstat_5.divorced', 'effort_1.yes', 'logisret',
       'strok_4.disp prev record and no cond', 'cendiv_6.es central',
       'psych_4.disp prev record and no cond',
       'raracem_2.black/african american', 'cancr_1.yes', 'hiltc_1.yes',
       'loghatotb', 'shltc_-4', 'hiltc_0.no', 'shltc_-2', 'slfmem_3.good',
       'hibp_1.yes',

OSError: Cannot save file into a non-existent directory: 'Coding\Git\Economics_Research\HRS Data Processing'

In [11]:
### These are functions from the raw_data_processing.ipynb
def standardize_dataframe(df, numerical_columns, exclude_columns):
    """
    A function to standardize specified numerical values in a DataFrame using z-score normalization,
    excluding specified columns.
    
    Parameters:
        df (pandas DataFrame): The input DataFrame.
        numerical_columns (list): A list of numerical column names to standardize.
        exclude_columns (list): A list of column names to exclude from standardization.
        
    Returns:
        pandas DataFrame: A DataFrame with standardized numerical values.
    """
    # Exclude columns specified in exclude_columns
    columns_to_standardize = [col for col in numerical_columns if col not in exclude_columns]
    
    # Initialize the StandardScaler
    scaler = StandardScaler()
    
    # Fit the scaler to the selected columns and transform the values
    standardized_values = scaler.fit_transform(df[columns_to_standardize])
    
    # Create a new DataFrame with the standardized values and the same index and columns as the original DataFrame
    standardized_df = pd.DataFrame(standardized_values, index=df.index, columns=columns_to_standardize)
    
    # Combine the standardized numerical columns with non-numerical columns from the original DataFrame
    for col in df.columns:
        if col not in columns_to_standardize:
            standardized_df[col] = df[col]
    
    return standardized_df

def encode_categorical(df, categorical_vars):
    """
    A function to perform one-hot encoding for categorical variables in a DataFrame.
    
    Parameters:
        df (pandas DataFrame): The input DataFrame.
        categorical_columns (list): A list of column names containing categorical variables to be one-hot encoded.
        
    Returns:
        pandas DataFrame: A DataFrame with one-hot encoded categorical variables.
    """
    print(f"encodable categorical vars: {categorical_vars}")
    
    # Convert numeric categorical variables to categorical type
    for col in categorical_vars:
        df[col] = df[col].astype('category')
        
    # Extract categorical variables
    categorical_df = df[categorical_vars]
    
    # Perform one-hot encoding for categorical variables, drop first ensures there is no multicolinearity
    result = pd.get_dummies(categorical_df, dtype=float, drop_first=True)
    
    return result

def replace_encoded_categorical(df, encoded_categorical_df, categorical_columns):
    """
    A function to replace original categorical columns in a DataFrame with one-hot encoded columns.
    
    Parameters:
        df (pandas DataFrame): The original DataFrame.
        encoded_categorical_df (pandas DataFrame): The DataFrame with one-hot encoded categorical variables.
        categorical_columns (list): A list of column names containing original categorical variables.
        
    Returns:
        pandas DataFrame: A DataFrame with original categorical columns replaced by one-hot encoded columns.
    """
    # Drop original categorical columns from the original DataFrame
    df = df.drop(columns=categorical_columns)
    
    # Concatenate the original DataFrame with the one-hot encoded DataFrame
    df = pd.concat([df, encoded_categorical_df], axis=1)
    return df
#columns_to_drop = ['Unnamed: 0', 'nt','n2','dage_y', 'rarelig']
columns_to_drop = ['Unnamed: 0', 'nt','n2','dage_y', 'rarelig']
hrs_cleaned = hrs.drop(columns=columns_to_drop, errors='ignore')
    # Replace empty strings with NaN to handle both empty strings and NaN values uniformly
hrs_cleaned = hrs_cleaned.replace('', np.nan)
    
    # Drop rows with any NaN values
hrs_cleaned = hrs_cleaned.dropna()
def random_sample_per_group(df, group_col):
    """
    Randomly selects one observation per group from a DataFrame.

    Parameters:
    df (pd.DataFrame): The DataFrame to sample from.
    group_col (str): The name of the column to group by.

    Returns:
    pd.DataFrame: A DataFrame with one randomly selected row per group.
    """
    # Group by the specified column and apply the sampling
    return df.groupby(group_col).apply(lambda x: x.sample(1)).reset_index(drop=True)

def is_categorical(df):
    """
    Determines which columns in a DataFrame are categorical based on data type being a string.
    
    Parameters:
        df (pandas DataFrame): The DataFrame to check.
        
    Returns:
        list: A list of column names that are considered categorical because they are of string type.
    """
    categorical_vars = [col for col in df.columns if df[col].dtype == 'object']
    return categorical_vars

# Determine categorical columns
categorical_columns = is_categorical(hrs_cleaned)  # Adjust threshold as necessary
print("Categorical columns:", categorical_columns)
# To remove columns
categorical_columns = [col for col in categorical_columns if col not in ['raedyrs']]
# Encode categorical variables
encoded_df = encode_categorical(hrs_cleaned.copy(), categorical_columns)

# Replace original categorical columns with encoded ones
final_df = replace_encoded_categorical(hrs_cleaned, encoded_df, categorical_columns)

# Convert all values to integers, setting '17.17+ yrs' specifically to 17
final_df['raedyrs'] = final_df['raedyrs'].replace('17.17+ yrs', '17').astype(int)

# Create a binary indicator for whether the education years are 17 and above
final_df['raedyrs_17plus'] = (final_df['raedyrs'] >= 17).astype(int)

# Check to make sure each "hhidpn" appears only once
def random_sample_per_group(df, group_col):
    """
    Randomly selects one observation per group from a DataFrame.

    Parameters:
    df (pd.DataFrame): The DataFrame to sample from.
    group_col (str): The name of the column to group by.

    Returns:
    pd.DataFrame: A DataFrame with one randomly selected row per group.
    """
    # Group by the specified column and apply the sampling
    return df.groupby(group_col).apply(lambda x: x.sample(1)).reset_index(drop=True)

# Randomly sample one observation per individual
final_df = random_sample_per_group(final_df, 'hhidpn')

# Check to make sure each "hhidpn" appears only once
def check_unique_hhidpn(sampled_df, group_col):
    """
    Checks if each group identifier appears only once in the DataFrame.

    Parameters:
    sampled_df (pd.DataFrame): The DataFrame to check.
    group_col (str): The name of the column to group by.
    """
    if sampled_df[group_col].duplicated().any():
        print("Some hhidpn values appear more than once.")
    else:
        print("Each hhidpn value appears only once.")

# Perform the check
check_unique_hhidpn(final_df, 'hhidpn')

columns_to_drop = ['hhidpn', 'iwbeg', 'id']
final_df = final_df.drop(columns=columns_to_drop, errors='ignore')


# Shuffle the columns randomly
shuffled_columns = np.random.permutation(final_df.columns)

# Split the columns approximately in half
midpoint = len(shuffled_columns) // 2
first_half_columns = shuffled_columns[:midpoint]
second_half_columns = shuffled_columns[midpoint:]

# Create two new DataFrames based on these split columns
df_first_half = final_df[first_half_columns]
df_second_half = final_df[second_half_columns]

# Print out the columns to verify
print("First half columns:", df_first_half.columns)
print("Second half columns:", df_second_half.columns)

Categorical columns: ['mstat', 'cendiv', 'gender', 'rahispan', 'raracem', 'raedyrs', 'ravetrn', 'shlt', 'shltc', 'depres', 'effort', 'sleepr', 'smokev', 'smoken', 'hibp', 'diab', 'cancr', 'lung', 'heart', 'strok', 'psych', 'arthr', 'slfmem', 'pstmem', 'spcfac', 'puffpos', 'covs', 'hiltc', 'lbrf']
encodable categorical vars: ['mstat', 'cendiv', 'gender', 'rahispan', 'raracem', 'ravetrn', 'shlt', 'shltc', 'depres', 'effort', 'sleepr', 'smokev', 'smoken', 'hibp', 'diab', 'cancr', 'lung', 'heart', 'strok', 'psych', 'arthr', 'slfmem', 'pstmem', 'spcfac', 'puffpos', 'covs', 'hiltc', 'lbrf']
Each hhidpn value appears only once.
First half columns: Index(['heart_4.disp prev record and no cond', 'smoken_1.yes',
       'arthr_4.disp prev record and no cond', 'mstat_4.separated',
       'slfmem_3.good', 'lbrf_4.partly retired', 'shlt_4.fair',
       'mstat_5.divorced', 'effort_1.yes', 'spcfac_0.no', 'raedyrs_17plus',
       'shltc_-4', 'hiltc_0.no', 'raracem_2.black/african american',
       'can

### VAE Process

In [15]:
from multiprocessing import freeze_support
import os
import sys
import numpy as np
import pandas as pd
import torch
import torch.multiprocessing as mp
import statsmodels.api as sm

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

current_directory = os.getcwd()
projects_directory = os.path.dirname(current_directory)
sys.path.append(projects_directory)

from Vectorize import VAE
from Preprocess import raw_dataframe_preprocessor, column_optimizer
from Predict import predictors as oracle
from Vectorize import Encoder
from HNSW import Row_Matcher

def main():
    # Assume df_first_half and df_second_half are already in memory
    first_half_preprocessed = df_first_half
    first_half_X = first_half_preprocessed.drop('target_variable', axis=1)  # Replace 'target_variable' with the actual target variable name
    
    print("First half dataset loaded")
    
    second_half_preprocessed_whole = df_second_half
    second_half_X_whole = second_half_preprocessed_whole.drop('target_variable', axis=1)  # Replace 'target_variable' with the actual target variable name
    second_half_y_whole = second_half_preprocessed_whole['target_variable']  # Replace 'target_variable' with the actual target variable name
    second_half_X_whole_tensor = torch.tensor(second_half_X_whole.values, dtype=torch.float)
    second_half_y_whole_tensor = torch.tensor(second_half_y_whole.values, dtype=torch.float)
    
    print("Second half dataset loaded")
    
    second_half_X_whole_with_target = oracle.run_model_pipeline_and_return_final_heart_predictors(None, None, second_half_X_whole)
    
    print("Model pipeline run on second half dataset")
    
    ### Column rearranging 
    column_rearranger = column_optimizer.ColumnRearranger()
    
    # Reduce row number of second half table to match that of first half via bootstrapping
    second_half_X = column_rearranger.bootstrap_to_match(first_half_X, second_half_X_whole_with_target)
    
    print("Second half dataset bootstrapped to match first half")
    
    raw_dataframe_preprocessor.save_dataframe(second_half_X, current_directory+"/PVM/Datasets", "second_half_preprocessed_X.csv")
    
    # pre-rearrangement
    average_correlation_pre = column_rearranger.compute_average_correlation(first_half_X, second_half_X)
    print(f"Pre-operation average correlation: {average_correlation_pre}")
    
    # Rearrange columns of the right table such that the average correlation between every column i from the left table and every column j from the right table where i=j is maximized
    second_half_X_rearranged = column_rearranger.return_optimal_rearrangement(first_half_X, second_half_X)
    
    print("Second half dataset columns rearranged")
    
    # post-rearrangement
    average_correlation_post = column_rearranger.compute_average_correlation(first_half_X, second_half_X_rearranged)
    print(f"Post-operation average correlation: {average_correlation_post}")
    
    column_rearranger.visualize_comparison(average_correlation_pre, average_correlation_post)
    
    # Update global data
    raw_dataframe_preprocessor.update_heart_final_predictors(second_half_X_rearranged, list(second_half_X_rearranged.columns))
    
    print("Global data updated with rearranged second half dataset")
    
    # Visualize if rearrangement was done correctly
    raw_dataframe_preprocessor.save_dataframe(second_half_X_rearranged, current_directory+"/PVM/Datasets", "second_half_preprocessed_X_rearranged.csv")
    
    ### Adding the Vector Encoded Column that summarizes each row's data using VAE
    encoder = Encoder.DataFrameEncoder()
    # Train the models
    encoder.train_and_assign_models(first_half_preprocessed, second_half_X_rearranged, second_half_preprocessed_whole)
    # Save the models
    encoder.save_model(encoder.first_half_model, 'first_half_model.pth')
    encoder.save_model(encoder.second_half_model, 'second_half_model.pth')
    # Add a vector encoding column to first half and second half dataframes
    encoded_first_half_df, encoded_second_half_df = encoder.load_and_encode_dataframes(first_half_X, second_half_X_rearranged)
    
    print("Vector encoding added to first half and second half datasets")
    
    raw_dataframe_preprocessor.save_dataframe(encoded_first_half_df, current_directory+"/PVM/Datasets", "first_half_predictors.csv")
    print(f"First half predictors:\n{encoded_first_half_df.head()}")
    
    raw_dataframe_preprocessor.save_dataframe(encoded_second_half_df, current_directory+"/PVM/Datasets", "second_half_predictors.csv")
    print(f"Second half predictors:\n{encoded_second_half_df.head()}")
    
    ### Probabilistic Vectorized Matching
    
    # HYPERPARAMETERS
    batch_size, num_training_updates, num_hiddens, embedding_dim, learning_rate = VAE.return_hyperparameters()
    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
    
    first_half_model = VAE.Model(encoded_first_half_df.shape[1], num_hiddens, embedding_dim).to(device)
    first_half_model.load_state_dict(torch.load('first_half_model.pth'))
    first_half_model.eval()
    
    second_half_model = VAE.Model(encoded_second_half_df.shape[1], num_hiddens, embedding_dim).to(device)
    second_half_model.load_state_dict(torch.load('second_half_model.pth'))
    second_half_model.eval()
    
    print("VAE models loaded")

    def match_rows(first_half_df, second_half_df):
        row_matcher = Row_Matcher.RowMatcher()
        return row_matcher.retrieve_similar(first_half_df, second_half_df)
    
    # Perform row matching and store results
    combined_data = match_rows(encoded_first_half_df, encoded_second_half_df)
    
    print("Row matching performed")
    
    ### Prepare Data for Final OLS Regression
    final_first_half_regressors, final_second_half_regressors, final_first_half_y, final_second_half_y = raw_dataframe_preprocessor.return_final_variables()
    combined_data[final_first_half_y] = first_half_preprocessed[final_first_half_y]
    combined_data[final_second_half_y] = second_half_preprocessed_whole[final_second_half_y]
    
    print("Data prepared for final OLS regression")

    # Save and display results
    combined_data.to_csv(current_directory + "/PVM/Datasets/merged_predictors.csv")
    print(f"Merged predictors:\n{combined_data.head()}")
    
    """
        FINAL RESULTS
    """
    oracle.test_research_null(combined_data, combined_data[final_first_half_regressors], combined_data[final_second_half_regressors], combined_data[final_first_half_y], combined_data[final_second_half_y])
    

if __name__ == "__main__":
    ### Multiprocessing for deep learning
    freeze_support()
    main()

ModuleNotFoundError: No module named 'Vectorize'

In [14]:
sys.path.append(os.path.join(current_directory, 'modules'))