In [1]:
import yaml
import pandas as pd
import os
import numpy as np

In [2]:
## Variable
config_file_path = "config.yml"

In [3]:
# functions
def config_loading(config_path:str):
    with open(config_path, 'r') as f:
        config = yaml.safe_load(f)
    return config

def build_output_paths(config):
    """
    Build full paths for output files based on the configuration.

    Args:
        config (dict): Configuration dictionary with keys:
            - root_path: Root directory path.
            - output_folder: Folder where output files will be saved.
            - filtered_data_filename: Filename for filtered data.
            - no_null_imputed_data_filename: Filename for no-null imputed data.
            - scaled_data_filename: Filename for scaled data.

    Returns:
        dict: A dictionary with keys 'filtered_data_path', 'no_null_imputed_data_path',
              and 'scaled_data_path', containing the full paths for the respective files.
    """
    # Get the root path and output folder
    root_path = config["root_path"]
    output_folder = config["output_folder"]

    # Ensure the output folder exists
    output_path = os.path.join(root_path, output_folder)
    os.makedirs(output_path, exist_ok=True)

    # Build full paths for the output files
    paths = {
        "filtered_data_path": os.path.join(
            output_path, config["filtered_data"]
        ),
        "no_null_imputed_data_path": os.path.join(
            output_path, config["no_null_imputed_data"]
        ),
        "scaled_data_path": os.path.join(output_path, config["scaled_data"]),
    }

    return paths

def date_dv_columns_check(data: pd.DataFrame, config: dict):
    required_columns = [config['date_column'], config['dv_column']] + config['data_prep_group_var']
    missing_columns = [col for col in required_columns if col not in data.columns]

    if missing_columns:
        raise ValueError(f"The following required columns are missing from the DataFrame: {missing_columns}")
    else:
        print("All required columns are present in the DataFrame.")

def data_date_conversion(data:pd.DataFrame, config:dict):
    data[config['date_column']] = pd.to_datetime(data[config['date_column']], format=config['date_format'])
    return data

def idv_list_loading(config:dict):
    idv = pd.read_csv(config['idv_list'])
    return idv

def check_idv_columns_in_data(data:pd.DataFrame, idv_list:pd.DataFrame, column_name:str, config):
    """
    Checks if all columns in df1 are present as rows in a specified column of df2.

    Parameters:
    - data (pd.DataFrame): The first DataFrame whose columns need to be checked.
    - idv_list (pd.DataFrame): The second DataFrame with the reference column.
    - column_name (str): The column in df2 that should contain all column names of df1.

    Returns:
    - None: If all columns are found, the function silently passes.

    Raises:
    - ValueError: If any columns are missing, it raises an error with the missing columns.
    """
    # Get the list of columns from df1
    df1_columns = set(data.columns)

    # Get the unique values in the specified column of df2
    df2_values = set(idv_list[column_name])

    # Find the missing columns
    missing_columns = df2_values - df1_columns

    # Raise an error if there are missing columns
    if missing_columns:
        raise ValueError(f"The following independent variables are missing in data: {missing_columns}")
    else:
        print("All independent variables in idv_list are present in the data.")
    required_columns = [config['date_column'], config['dv_column']] + config['data_prep_group_var']
    return data[required_columns + list(df2_values)]

def data_loading(config:dict):

    input_data = pd.read_csv(config['input_data'])
    date_dv_columns_check(input_data, config)
    data = data_date_conversion(input_data, config)
    idv_list = idv_list_loading(config)
    data = check_idv_columns_in_data(data, idv_list, "idv", config)

    return data, idv_list

def column_arrangement(config: dict, idv_list:pd.DataFrame):
    sorted_idv_list = sorted(idv_list['idv'].tolist())
    sorted_idv_list
    column_arrangement = [config['date_column']] + config['data_prep_group_var'] + sorted_idv_list + [config['dv_column']]
    return column_arrangement

def filter_by_date_range(data:pd.DataFrame, config:dict):
    """Filtering date range for the data processing and further analysis

    Args:
        data (DataFrame): Harmonized_processed_data to filter out the date range
        config (dict): configuration dictionary
    Returns:
        DataFrame: Data with filtered date range
    """
    data["date"] = pd.to_datetime(data["date"], utc=False)

    # Print the minimum and maximum date values for verification
    print("Minimum date:", data["date"].min(skipna=True))
    print("Maximum date:", data["date"].max(skipna=True))

    # Define the date range from run_config
    date1 = pd.to_datetime(config['start_date'], format="%Y-%m-%d")
    date2 = pd.to_datetime(config['end_date'], format="%Y-%m-%d")

    # Filter the DataFrame based on the date range
    data = data[(data["date"] >= date1) & (data["date"] <= date2)]
    if data[config['dv_column']].isna().sum() != 0:
        raise ValueError(f"The dependent variable {config['dv_column']} is having null values")
    return data

### preprocessing functions ####

def cap_values(df, col, min_val, max_val):
    df[col] = df[col].astype(float).clip(lower=min_val, upper=max_val)
    return df


def get_unique_combinations(df: pd.DataFrame, columns: list):
    """
    Get unique combinations of values in the specified columns of a DataFrame.

    Parameters:
    - df (pd.DataFrame): The DataFrame to extract combinations from.
    - columns (list): A list of column names to consider.

    Returns:
    - list of dict: Each dictionary represents a unique combination of column-value pairs.
    """
    # Check if the specified columns exist in the DataFrame
    missing_columns = [col for col in columns if col not in df.columns]
    if missing_columns:
        raise ValueError(f"The following columns are missing from the DataFrame: {missing_columns}")

    # Extract unique rows for the specified columns
    unique_combinations = df[columns].drop_duplicates().to_dict(orient="records")

    return unique_combinations


def drop_rows_with_nulls(df: pd.DataFrame, group_by_columns: list, null_threshold: float):
    """
    Drop rows for specific combinations of columns where more than 50% of the 'value' column is null.

    Parameters:
    - df (pd.DataFrame): The DataFrame to process.
    - group_by_columns (list): List of columns to group by.

    Returns:
    - pd.DataFrame: The DataFrame with rows removed based on the condition.
    """
    # Step 1: Group by specified columns
    grouped = df.groupby(group_by_columns)

    # Step 2: Calculate the percentage of nulls for each group
    null_percentage = grouped['value'].apply(lambda x: x.isnull().mean())

    # Step 3: Identify groups with more than 50% null values
    groups_to_drop = null_percentage[null_percentage > null_threshold].index

    # Step 4: Filter out rows belonging to the identified groups
    filtered_df = df[~df.set_index(group_by_columns).index.isin(groups_to_drop)]

    return filtered_df


def impute_groups(df: pd.DataFrame, group_by_columns: list, null_threshold: float, imputation_method: str = 'mean'):
    """
    Drop rows for specific combinations of columns where more than the specified percentage of the 'value' column is null,
    and impute missing values for groups with less than the threshold.

    Parameters:
    - df (pd.DataFrame): The DataFrame to process.
    - group_by_columns (list): List of columns to group by.
    - null_threshold (float): The threshold of null percentage for which rows will be dropped.
    - imputation_method (str): The method for imputing missing values ('mean', 'median', 'mode').

    Returns:
    - pd.DataFrame: The DataFrame with rows removed and missing values imputed.
    """
    # Step 1: Group by specified columns
    grouped = df.groupby(group_by_columns)

    # Step 2: Calculate the percentage of nulls for each group
    null_percentage = grouped['value'].apply(lambda x: x.isnull().mean())

    # Step 3: Identify groups with more than the null threshold percentage of null values
    groups_to_drop = null_percentage[null_percentage > null_threshold].index
    groups_to_drop_list = groups_to_drop.to_list()

    print(groups_to_drop_list)
    # Step 4: Filter out rows belonging to the identified groups
    df_filtered = df[~df.set_index(group_by_columns).index.isin(groups_to_drop)]

    # Step 5: Impute missing values for the remaining groups (those with less than the threshold null percentage)
    for group, group_df in df_filtered.groupby(group_by_columns):
        if group not in groups_to_drop:
            if imputation_method == 'mean':
                fill_value = group_df['value'].mean()
            elif imputation_method == 'median':
                fill_value = group_df['value'].median()
            elif imputation_method == 'mode':
                fill_value = group_df['value'].mode()[0]
            else:
                raise ValueError(f"Unsupported imputation method: {imputation_method}")

            # Impute the missing values in the group
            df_filtered.loc[group_df.index, 'value'] = group_df['value'].fillna(fill_value)

    return df_filtered

# Scaling Function
def scale_metrics(melted_data, idv_list, config):
    def apply_scaling(group):
        metric_name = group['metric'].iloc[0]
        # Skip scaling for the dependent variable
        if metric_name == config['dv_column']:
            group['value'] = group['value']
            return group
        if config['scaling'] == 'minmax':
            # Min-Max Scaling: Scale between 0 and 1
            min_val = group['value'].min()
            max_val = group['value'].max()
            group['value'] = (group['value'] - min_val) / (max_val - min_val)

        elif config['scaling'] == 'standard':
            # Standard Scaling: Scale with mean and standard deviation
            mean_val = group['value'].mean()
            std_dev = group['value'].std()
            group['value'] = (group['value'] - mean_val) / std_dev

        elif config['scaling'] == 'custom':
            # Custom Scaling: Use min and max from df2
            custom_min = idv_list.loc[idv_list['idv'] == metric_name, 'min'].values[0]
            custom_max = idv_list.loc[idv_list['idv'] == metric_name, 'max'].values[0]
            print((metric_name, custom_max, custom_min))
            group['value'] = (group['value'] - custom_min) / (custom_max - custom_min)

        else:
            raise ValueError("Invalid scaling method. Choose 'minmax', 'standard', or 'custom'.")

        return group

    # Group by 'metric' and apply scaling
    scaled_df = melted_data.groupby('metric', group_keys=False).apply(apply_scaling)
    return scaled_df



In [63]:
config = config_loading(config_file_path)
data, idv_list = data_loading(config)
filtered_data = filter_by_date_range(data, config)

All required columns are present in the DataFrame.
All independent variables in idv_list are present in the data.
Minimum date: 2017-01-07 00:00:00
Maximum date: 2025-01-11 00:00:00


In [145]:
config

{'input_data': 'D:\\BRAND_HUB_PROJECT\\brandhub-capability\\src\\BHC_Capability\\data\\input_data_refined_5.csv',
 'idv_list': 'D:\\BRAND_HUB_PROJECT\\brandhub-capability\\src\\BHC_Capability\\data\\idv_list_.csv',
 'granularity': 'weekly',
 'data_prep_group_var': ['brand', 'category'],
 'date_column': 'date',
 'date_format': '%d-%m-%Y',
 'start_date': '2022-08-01',
 'end_date': '2024-06-01',
 'dv_column': 'market_share',
 'null_percentage': 0.5,
 'scaling': 'custom',
 'cfa_sampling_seeding': [2, 3, 5, 7, 11, 13, 17, 19],
 'model_type': 'RandomForest',
 'model_config': {'RandomForest': {'grid_search': {'max_depth': [2, 3, 4],
    'n_estimators': [15, 50, 100, 300, 500],
    'max_features': [2, 4, 10],
    'random_state': [42],
    'eval_metrics': []},
   'random_state': 42},
  'XGBoost': {'grid_search': {'max_depth': [2, 3],
    'n_estimators': [100, 500, 1000, 1500],
    'learning_rate': [0.01, 0.02],
    'random_state': [42],
    'eval_metrics': []},
   'random_state': 42},
  'RF_Rid

In [14]:
#Data Preprocessing
melted_df = pd.melt(
    filtered_data,
    id_vars=['date', 'brand', 'category'],  # Columns to keep as-is
    var_name='metric',  # Name of the new column for melted variable names
    value_name='value'  # Name of the new column for melted values
)
no_null_imputed_data = impute_groups(melted_df, ['brand', 'category', 'metric'], .5)
scaled_data = scale_metrics(no_null_imputed_data, idv_list, config)


[("BEGGIN'", 'DOG FOOD', 'directions_brand_attributes_always_seems_to_be_doing_something_new_mean'), ("BEGGIN'", 'DOG FOOD', 'directions_brand_attributes_creates_excitement_and_eagerness_at_mealtime_mean'), ("BEGGIN'", 'DOG FOOD', 'directions_brand_attributes_creates_playful_moments_with_my_cat_mean'), ("BEGGIN'", 'DOG FOOD', 'directions_brand_attributes_has_a_taste_my_cat_enjoys_mean'), ("BEGGIN'", 'DOG FOOD', 'directions_brand_attributes_has_a_taste_my_dog_enjoys_mean'), ("BEGGIN'", 'DOG FOOD', 'directions_brand_attributes_has_craveable_treats_that_my_cat_comes_running_for_mean'), ("BEGGIN'", 'DOG FOOD', 'directions_brand_attributes_has_the_most_satisfying_and_hearty_dog_treats_mean'), ("BEGGIN'", 'DOG FOOD', 'directions_brand_attributes_has_treats_that_contain_wholesome_ingredients_mean'), ("BEGGIN'", 'DOG FOOD', 'directions_brand_attributes_has_treats_that_have_an_appealing_package_mean'), ("BEGGIN'", 'DOG FOOD', 'directions_brand_attributes_is_a_brand_i_do_not_trust_to_feed_to_my_

  scaled_df = melted_data.groupby('metric', group_keys=False).apply(apply_scaling)


### CFA analysis

In [147]:
scaled_data

Unnamed: 0,date,brand,category,metric,value
0,2022-08-13,BEGGIN',DOG FOOD,market_share,0.008066
1,2022-08-13,PEDIGREE,DOG FOOD,market_share,0.070409
2,2022-08-13,TEMPTATIONS,CAT FOOD,market_share,0.055228
3,2022-08-20,BEGGIN',DOG FOOD,market_share,0.007958
4,2022-08-20,BLUE BUFFALO,DOG FOOD,market_share,0.074323
...,...,...,...,...,...
38395,2024-02-24,BLUE BUFFALO,CAT FOOD,directions_awareness_total_awareness_net_mentions,0.607638
38396,2024-02-24,TEMPTATIONS,CAT FOOD,directions_awareness_total_awareness_net_mentions,0.879747
38397,2024-03-02,PEDIGREE,DOG FOOD,directions_awareness_total_awareness_net_mentions,0.812161
38398,2024-03-09,BEGGIN',DOG FOOD,directions_awareness_total_awareness_net_mentions,0.900533


In [148]:
group_list = [config['date_column']] + config['data_prep_group_var']
for group, group_df in scaled_data.groupby(['brand', 'category']):
    if group == ("BEGGIN'", 'DOG FOOD'):
        cfa_data = group_df.pivot(index=group_list, columns='metric', values='value').reset_index()
        group_value_tuple = group

In [149]:
pillar_idv_dict = idv_list.groupby('equity_pillar')['idv'].apply(list).to_dict()

In [150]:
pillar = 'awareness'
col_available = [col for col in cfa_data if col in pillar_idv_dict[pillar]]
cfa_data_filtered = cfa_data[col_available]
factor_str = f"{pillar}_pillar =~ " + " + ".join(col_available)



In [151]:
# Generate samples based on seeds
cfa_data_filtered_list = []
for seed in config['cfa_sampling_seeding']:
    sampled_df = cfa_data_filtered.sample(frac=0.95, random_state=seed)
    cfa_data_filtered_list.append(sampled_df)

In [152]:
import semopy as sp
from semopy import Model


def cfa_py(fa_str, scaled_data):

    model = Model(fa_str, cov_diag=False)
    model.fit(scaled_data, solver="L-BFGS-B")  # , estimator="GLS")

    # Retrieve fit statistics (method may vary)
    stats = sp.calc_stats(model)

    try:
        fit_indices = model.fit_stats()
    except AttributeError:
        # If fit_stats() is not available, check other attributes
        fit_indices = (
            model.statistics_ if hasattr(model, "statistics_") else {}
        )
    cfa_fit_indices = pd.DataFrame(
        fit_indices.items(), columns=["fitmeasure", "value"]
    )
    cfa_fit_indices = cfa_fit_indices[
        cfa_fit_indices["fitmeasure"].isin(["cfi", "tli", "rmsea"])
    ]
    cfa_fit_indices["value"] = cfa_fit_indices["value"].astype(float)

    cfa_fit_indices_t = cfa_fit_indices.set_index("fitmeasure").T

    # Extract parameter estimates
    cfa_estimates = model.inspect()

    # CFA summary table
    cfa_summary = pd.concat([cfa_estimates, cfa_fit_indices_t], axis=1)

    # Store the results
    cfa_summary["factor_str"] = fa_str
    # cfa_summary["Seed"] = seed
    cfa_summary["cfi"] = stats.loc["Value"]["CFI"]
    cfa_summary["tli"] = stats.loc["Value"]["TLI"]
    cfa_summary["rmsea"] = stats.loc["Value"]["RMSEA"]
    cfa_summary = cfa_summary.rename(
        columns={
            "lval": "rhs",
            "rval": "lhs",
            "Estimate": "est.std",
            "Std. Err": "se",
            "z-value": "z",
            "p-value": "pvalue",
        }
    )
    cfa_summary.replace({"op": {"~": "=~"}}, inplace=True)
    columns_list = ["rhs", "op", "lhs", "est.std", "se", "z", "pvalue", "factor_str", "cfi", "tli", "rmsea"]

    cfa_summary = cfa_summary[columns_list]

    return cfa_summary

In [153]:
fit_cfa_df = cfa_py(factor_str, cfa_data_filtered).dropna(subset = ['op'])
for i, x in enumerate(config['data_prep_group_var']):
    fit_cfa_df[x] = group_value_tuple[i]

fit_cfa_df

  return np.sqrt((chi2 / dof - 1) / (model.n_samples - 1))


Unnamed: 0,rhs,op,lhs,est.std,se,z,pvalue,factor_str,cfi,tli,rmsea,brand,category
0,directions_awareness_total_awareness_net_mentions,=~,awareness_pillar,1.0,-,-,-,awareness_pillar =~ directions_awareness_total...,0.81151,,inf,BEGGIN',DOG FOOD
1,directions_awareness_unaided_awareness_net_men...,=~,awareness_pillar,1.00299,0.0208,48.219697,0.0,awareness_pillar =~ directions_awareness_total...,0.81151,,inf,BEGGIN',DOG FOOD
2,social_share_of_total_buzz_post,=~,awareness_pillar,0.08556,0.004339,19.716983,0.0,awareness_pillar =~ directions_awareness_total...,0.81151,,inf,BEGGIN',DOG FOOD
3,awareness_pillar,~~,awareness_pillar,0.018327,0.002651,6.914451,0.0,awareness_pillar =~ directions_awareness_total...,0.81151,,inf,BEGGIN',DOG FOOD
4,directions_awareness_total_awareness_net_mentions,~~,directions_awareness_total_awareness_net_mentions,0.0,0.000167,0.0,1.0,awareness_pillar =~ directions_awareness_total...,0.81151,,inf,BEGGIN',DOG FOOD
5,directions_awareness_unaided_awareness_net_men...,~~,directions_awareness_unaided_awareness_net_men...,0.000614,0.00019,3.235352,0.001215,awareness_pillar =~ directions_awareness_total...,0.81151,,inf,BEGGIN',DOG FOOD
6,social_share_of_total_buzz_post,~~,social_share_of_total_buzz_post,3.2e-05,0.000005,6.698613,0.0,awareness_pillar =~ directions_awareness_total...,0.81151,,inf,BEGGIN',DOG FOOD


In [154]:
def process_cfa_samples(cfa_py, factor_str, cfa_data_filtered, config, group_tuple):
    """
    Processes CFA samples, fits CFA to each sample, and concatenates the results.

    Args:
        cfa_py (function): Function to perform CFA.
        factor_str (str): The CFA factor string.
        cfa_data_filtered (DataFrame): The original filtered DataFrame.
        config (dict): Configuration dictionary with sampling seeds and group variables.
        group_tuple (tuple): Tuple containing values for `data_prep_group_var`.

    Returns:
        DataFrame: Concatenated results with additional columns.
    """
    cfa_data_filtered_list = []

    # Perform sampling and fit CFA for each sample
    for seed in config['cfa_sampling_seeding']:
        # Sample 95% of the data
        sampled_df = cfa_data_filtered.sample(frac=0.95, random_state=seed)

        # Fit CFA and drop rows with missing 'op' values
        fit_cfa_df = cfa_py(factor_str, sampled_df).dropna(subset=['op'])

        # Add group variables as columns
        for i, x in enumerate(config['data_prep_group_var']):
            fit_cfa_df[x] = group_tuple[i]

        fit_cfa_df['seed'] = seed

        # Append the processed DataFrame to the list
        cfa_data_filtered_list.append(fit_cfa_df)

    # Concatenate all results
    concatenated_results = pd.concat(cfa_data_filtered_list, ignore_index=True)

    return concatenated_results


In [155]:
def perform_cfa_analysis(scaled_data, idv_list, config, cfa_py):
    """
    Performs CFA analysis across multiple groups, equity pillars, and samples.
    """
    group_list = [config['date_column']] + config['data_prep_group_var']
    final_results = []  # To store results from all groups and equity pillars
    pillar_idv_dict = idv_list.groupby('equity_pillar')['idv'].apply(list).to_dict()
    for group, group_df in scaled_data.groupby(['brand', 'category']):

        # Pivot the data
        cfa_data = group_df.pivot(index=group_list, columns='metric', values='value').reset_index()
        group_value_tuple = group  # Capture group values for column assignment

        for pillar in pillar_idv_dict.keys():
            # Filter columns available for the current pillar
            col_available = [col for col in cfa_data if col in pillar_idv_dict[pillar]]

            if col_available:  # Proceed only if there are relevant columns
                # Filter data for CFA
                cfa_data_filtered = cfa_data[col_available]

                # Create factor string
                factor_str = f"{pillar} =~ " + " + ".join(col_available)

                # Process CFA samples and collect results
                results = process_cfa_samples(
                    cfa_py=cfa_py,
                    factor_str=factor_str,
                    cfa_data_filtered=cfa_data_filtered,
                    config=config,
                    group_tuple=group_value_tuple
                )
                final_results.append(results)

    # Concatenate all results from all groups and pillars
    return pd.concat(final_results, ignore_index=True)

In [156]:
perform_cfa_analysis(scaled_data, idv_list, config, cfa_py)

  return np.sqrt((chi2 / dof - 1) / (model.n_samples - 1))
  return np.sqrt((chi2 / dof - 1) / (model.n_samples - 1))
  return np.sqrt((chi2 / dof - 1) / (model.n_samples - 1))
  return np.sqrt((chi2 / dof - 1) / (model.n_samples - 1))
  return np.sqrt((chi2 / dof - 1) / (model.n_samples - 1))
  return np.sqrt((chi2 / dof - 1) / (model.n_samples - 1))
  return np.sqrt((chi2 / dof - 1) / (model.n_samples - 1))
  return np.sqrt((chi2 / dof - 1) / (model.n_samples - 1))
  return np.sqrt((chi2 / dof - 1) / (model.n_samples - 1))
  return np.sqrt((chi2 / dof - 1) / (model.n_samples - 1))
  return np.sqrt((chi2 / dof - 1) / (model.n_samples - 1))
  return np.sqrt((chi2 / dof - 1) / (model.n_samples - 1))
  return np.sqrt((chi2 / dof - 1) / (model.n_samples - 1))
  return np.sqrt((chi2 / dof - 1) / (model.n_samples - 1))
  return np.sqrt((chi2 / dof - 1) / (model.n_samples - 1))
  return np.sqrt((chi2 / dof - 1) / (model.n_samples - 1))
  return np.sqrt((chi2 / dof - 1) / (model.n_samples - 1

Unnamed: 0,rhs,op,lhs,est.std,se,z,pvalue,factor_str,cfi,tli,rmsea,brand,category,seed
0,directions_funnel_metrics_advocacy_t2b_buyers,=~,advocacy,1.000000,-,-,-,advocacy =~ directions_funnel_metrics_advocacy...,0.688131,,inf,BEGGIN',DOG FOOD,2
1,directions_strategic_measures_brand_love_index,=~,advocacy,0.227565,0.068392,3.327383,0.000877,advocacy =~ directions_funnel_metrics_advocacy...,0.688131,,inf,BEGGIN',DOG FOOD,2
2,social_percent_positive_neutral,=~,advocacy,-0.079943,0.028482,-2.806788,0.005004,advocacy =~ directions_funnel_metrics_advocacy...,0.688131,,inf,BEGGIN',DOG FOOD,2
3,advocacy,~~,advocacy,0.049732,0.015477,3.213364,0.001312,advocacy =~ directions_funnel_metrics_advocacy...,0.688131,,inf,BEGGIN',DOG FOOD,2
4,directions_funnel_metrics_advocacy_t2b_buyers,~~,directions_funnel_metrics_advocacy_t2b_buyers,0.000176,0.013594,0.012962,0.989658,advocacy =~ directions_funnel_metrics_advocacy...,0.688131,,inf,BEGGIN',DOG FOOD,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4939,ratings_reviews_review_sentiment_score_average,~~,ratings_reviews_review_sentiment_score_average,0.000381,0.000057,6.740209,0.0,product_feedback =~ ratings_reviews_good_exper...,0.954372,0.936121,0.226292,TEMPTATIONS,CAT FOOD,19
4940,ratings_reviews_review_sentiment_score_health_...,~~,ratings_reviews_review_sentiment_score_health_...,0.033746,0.005021,6.721637,0.0,product_feedback =~ ratings_reviews_good_exper...,0.954372,0.936121,0.226292,TEMPTATIONS,CAT FOOD,19
4941,ratings_reviews_review_sentiment_score_ingredi...,~~,ratings_reviews_review_sentiment_score_ingredi...,0.062984,0.009338,6.745307,0.0,product_feedback =~ ratings_reviews_good_exper...,0.954372,0.936121,0.226292,TEMPTATIONS,CAT FOOD,19
4942,ratings_reviews_review_sentiment_score_pet_enj...,~~,ratings_reviews_review_sentiment_score_pet_enj...,0.000398,0.00006,6.584319,0.0,product_feedback =~ ratings_reviews_good_exper...,0.954372,0.936121,0.226292,TEMPTATIONS,CAT FOOD,19


### RF Model

In [56]:
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFECV

from sklearn.metrics import (
    mean_absolute_percentage_error,

)

# Machine learning imports
from sklearn.model_selection import (
    GridSearchCV,

    cross_val_score,

    train_test_split,
)


import shap

  from .autonotebook import tqdm as notebook_tqdm


In [206]:
for group, group_df in scaled_data.groupby(config['data_prep_group_var']):
    print(group)
    if group == ("BEGGIN'", 'DOG FOOD'):
        model_data = group_df


("BEGGIN'", 'DOG FOOD')
('BLUE BUFFALO', 'CAT FOOD')
('BLUE BUFFALO', 'DOG FOOD')
('DENTALIFE', 'CAT FOOD')
('DENTALIFE', 'DOG FOOD')
('PEDIGREE', 'DOG FOOD')
('SHEBA', 'CAT FOOD')
('TEMPTATIONS', 'CAT FOOD')


In [207]:
model_data = model_data.pivot(index=group_list, columns='metric', values='value').reset_index()

In [229]:
def model_data_prep(model_idv_dv_df, col_available, config, test_size_ratio=0.1, shuffle=True):
    """
    Prepare model data by splitting into training and testing sets.

    Parameters:
    model_idv_dv_df (DataFrame): The dataset containing independent and dependent variables.
    col_available (list): List of column names to be used as independent variables.
    config (dict): Configuration dictionary containing the dependent variable column name.
    test_size_ratio (float): Proportion of the dataset to be used as the test set.
    shuffle (bool): Whether to shuffle the data before splitting.

    Returns:
    tuple: train_x, test_x, train_y, test_y
    """
    idvs = model_idv_dv_df[col_available]
    dv = model_idv_dv_df[config["dv_column"]]

    train_x, test_x, train_y, test_y = train_test_split(
        idvs,
        dv,
        test_size=int(test_size_ratio * idvs.shape[0]),
        shuffle=shuffle,
    )

    return train_x, test_x, train_y, test_y, idvs, dv

In [210]:
train_x, test_x, train_y, test_y, idvs, dv = model_data_prep(model_idv_dv_df, col_available, config)

In [230]:
def train_and_evaluate_model(config, train_x, train_y):
    """
    Train and evaluate a machine learning model based on the provided configuration.

    Parameters:
    config (dict): Configuration dictionary containing model details and hyperparameters.
    train_x (DataFrame): Training features.
    train_y (Series): Training target.

    Returns:
    regressor (object): Trained model.
    feat_importance (DataFrame): Feature importance values.
    feat_df (DataFrame): SHAP feature importance values.
    """
    # Select model class based on config
    if config['model_type'] == "RandomForest":
        model_class = RandomForestRegressor

    # Extract hyperparameters
        param_grid = {
            key: config["model_config"][config['model_type']]["grid_search"][key]
            for key in config["model_config"][config['model_type']]["grid_search"]
            if key != "eval_metrics"  # Exclude eval_metrics
        }

        # Initialize model with random state
        regressor = model_class(random_state=param_grid["random_state"])

        # Perform Grid Search
        search = GridSearchCV(
            regressor,
            param_grid,
            cv=config["cross_validation_number"],
            scoring=["r2", "neg_mean_absolute_percentage_error"],
            refit="neg_mean_absolute_percentage_error",
        )

    search.fit(train_x, train_y)

    print(f"The best hyperparameters for {config['model_type']} are {search.best_params_}")

    # Train model with best parameters
    regressor = model_class(**search.best_params_)
    regressor.fit(train_x, train_y)

    # Feature importance and SHAP values (only for RandomForest)
    feat_importance = None
    shap_df = None
    if config['model_type'] == "RandomForest":
        features = list(train_x.columns)
        f_i = list(zip(features, regressor.feature_importances_))
        f_i.sort(key=lambda x: x[1], reverse=True)

        rfe = RFECV(
            regressor,
            cv=config["cross_validation_number"],
            scoring="neg_mean_absolute_percentage_error",
        )
        rfe.fit(train_x, train_y)
        selected_features = list(np.array(features)[rfe.get_support()])
        print(selected_features)

        feat_importance = pd.DataFrame(f_i, columns=["metric", "Feature Importance"])
        feat_importance.set_index("metric", inplace=True)
        print(feat_importance)

        # Compute SHAP values
        explainer = shap.TreeExplainer(regressor)
        shap_values = explainer.shap_values(train_x)
        shap_df = pd.DataFrame(
            np.abs(pd.DataFrame(shap_values, columns=train_x.columns)).mean(),
            columns=["shap values"],
        )
        print(f"{config['model_type']} SHAP importance", shap_df.sort_values(by="shap values", ascending=False))

    return regressor, feat_importance, shap_df, search


In [212]:
regressor, feat_importance, shap_df, search = train_and_evaluate_model(config,train_x,train_y)

The best hyperparameters for RandomForest are {'max_depth': 2, 'max_features': 2, 'n_estimators': 500, 'random_state': 42}
[np.str_('directions_funnel_metrics_advocacy_t2b_buyers'), np.str_('directions_strategic_measures_brand_love_index'), np.str_('social_percent_positive_neutral')]
                                                Feature Importance
metric                                                            
social_percent_positive_neutral                           0.414599
directions_strategic_measures_brand_love_index            0.327918
directions_funnel_metrics_advocacy_t2b_buyers             0.257483
RandomForest SHAP importance                                                 shap values
metric                                                     
social_percent_positive_neutral                    0.000080
directions_strategic_measures_brand_love_index     0.000066
directions_funnel_metrics_advocacy_t2b_buyers      0.000032


In [241]:
def evaluate_model_performance(config, regressor, train_x, train_y, test_x, test_y, idvs, dv, feat_importance, shap_df, search, group):
    """
    Evaluate model performance and compute various metrics.

    Parameters:
    config (dict): Configuration dictionary containing model details and hyperparameters.
    regressor (object): Trained model.
    train_x (DataFrame): Training features.
    train_y (Series): Training target.
    test_x (DataFrame): Test features.
    test_y (Series): Test target.
    idvs (DataFrame): Independent variables used for the entire dataset.
    dv (Series): Dependent variable.
    feat_importance (DataFrame): Feature importance values.
    shap_df (DataFrame): SHAP feature importance values.
    search (GridSearchCV): Grid search object containing best parameters.

    Returns:
    results_all_model (DataFrame): Model evaluation results and feature importance.
    actual_vs_predicted (DataFrame): Actual vs predicted values for the full dataset.
    """
    # Generate Predictions
    y_pred_train = regressor.predict(train_x)
    y_pred_test = regressor.predict(test_x)
    y_pred_all = regressor.predict(idvs)

    # Compute Metrics
    mae_train = metrics.mean_absolute_error(train_y, y_pred_train)
    mse_train = metrics.mean_squared_error(train_y, y_pred_train)
    rmse_train = np.sqrt(mse_train)
    r2_train = metrics.r2_score(train_y, y_pred_train)
    mape_train = mean_absolute_percentage_error(train_y, y_pred_train)

    # Merge Feature Importance and SHAP values
    results_all_model = pd.concat([feat_importance, shap_df], axis=1).reset_index().rename(
        columns={"Feature Importance": "feature_importance", "index": "shap_features", "shap values": "shap_values"}
    )

    # Add model performance metrics
    results_all_model["model_type"] = config['model_type']
    results_all_model["latest_dv"] = dv.values[-1]
    results_all_model["r2_score_train"] = r2_train
    results_all_model["mape_train"] = mape_train

    # Cross-validation metrics
    results_all_model["r2_score_fold"] = cross_val_score(
        regressor, train_x, train_y, cv=config["cross_validation_number"], scoring="r2"
    ).mean()
    results_all_model["mape_fold"] = (
        cross_val_score(
            regressor, train_x, train_y, cv=config["cross_validation_number"], scoring="neg_mean_absolute_percentage_error"
        ).mean() * -1
    )

    # Hold-out test set metrics
    results_all_model["r2_score_hold_out"] = metrics.r2_score(test_y, y_pred_test)
    results_all_model["mape_hold_out"] = mean_absolute_percentage_error(test_y, y_pred_test)

    # Overall dataset metrics
    results_all_model["r2_score_all"] = metrics.r2_score(dv, y_pred_all)
    results_all_model["mape_all"] = mean_absolute_percentage_error(dv, y_pred_all)

    # Store best parameters from grid search
    results_all_model["best_params_gridsearchcv"] = str(search.best_params_)

    # Create Actual vs Predicted DataFrame
    actual_vs_predicted = pd.DataFrame({"actual": dv, "predicted": y_pred_all})
    actual_vs_predicted["model_type"] = config['model_type']

    for i, x in enumerate(config['data_prep_group_var']):
        results_all_model[x] = group[i]
        actual_vs_predicted[x] = group[i]
    results_all_model['pillar'] = pillar
    actual_vs_predicted['pillar'] = pillar


    return results_all_model, actual_vs_predicted


In [216]:
results_all_model, actual_vs_predicted = evaluate_model_performance(config, regressor, train_x, train_y, test_x, test_y, idvs, dv, feat_importance, shap_df, search)

In [218]:
results_all_model

Unnamed: 0,metric,feature_importance,shap_values,model_type,latest_dv,r2_score_train,mape_train,r2_score_fold,mape_fold,r2_score_hold_out,mape_hold_out,r2_score_all,mape_all,best_params_gridsearchcv
0,social_percent_positive_neutral,0.414599,8e-05,RandomForest,0.008493,0.327432,0.03206,-0.119596,0.038679,0.510156,0.023633,0.338788,0.03127,"{'max_depth': 2, 'max_features': 2, 'n_estimat..."
1,directions_strategic_measures_brand_love_index,0.327918,6.6e-05,RandomForest,0.008493,0.327432,0.03206,-0.119596,0.038679,0.510156,0.023633,0.338788,0.03127,"{'max_depth': 2, 'max_features': 2, 'n_estimat..."
2,directions_funnel_metrics_advocacy_t2b_buyers,0.257483,3.2e-05,RandomForest,0.008493,0.327432,0.03206,-0.119596,0.038679,0.510156,0.023633,0.338788,0.03127,"{'max_depth': 2, 'max_features': 2, 'n_estimat..."


In [239]:
idv_list

Unnamed: 0,idv,equity_pillar,min,max,negation_flag
0,directions_awareness_total_awareness_net_mentions,awareness,0.0,100.0,P
1,directions_awareness_unaided_awareness_net_men...,awareness,0.0,100.0,P
2,directions_brand_attributes_always_seems_to_be...,brand_perceptions,0.0,100.0,P
3,directions_brand_attributes_creates_excitement...,brand_perceptions,0.0,100.0,P
4,directions_brand_attributes_creates_playful_mo...,brand_perceptions,0.0,100.0,P
5,directions_brand_attributes_has_a_taste_my_cat...,brand_perceptions,0.0,100.0,P
6,directions_brand_attributes_has_a_taste_my_dog...,brand_perceptions,0.0,100.0,P
7,directions_brand_attributes_has_craveable_trea...,brand_perceptions,0.0,100.0,P
8,directions_brand_attributes_has_the_most_satis...,brand_perceptions,0.0,100.0,P
9,directions_brand_attributes_has_treats_i_could...,brand_perceptions,0.0,100.0,P


In [244]:
group_list = [config['date_column']] + config['data_prep_group_var']
final_rf_results = []  # To store results from all groups and equity pillars
final_act_pred_results = []
pillar_idv_dict = idv_list.groupby('equity_pillar')['idv'].apply(list).to_dict()
for group, group_df in scaled_data.groupby(config['data_prep_group_var']):
    print(group)
    model_data = group_df.copy()
    model_data = model_data.pivot(index=group_list, columns='metric', values='value').reset_index()
    for pillar in pillar_idv_dict.keys():
        print(pillar)
        # Filter columns available for the current pillar
        col_available = [
            col for col in model_data if col in pillar_idv_dict[pillar]
        ]
        columns_model = group_list + col_available + [config["dv_column"]]

        if col_available:  # Proceed only if there are relevant columns
        # Filter data for CFA
            print("training")
            model_idv_dv_df = model_data[columns_model]
            train_x, test_x, train_y, test_y, idvs, dv = model_data_prep(model_idv_dv_df, col_available, config)
            regressor, feat_importance, shap_df, search = train_and_evaluate_model(config,train_x,train_y)
            results_all_model, actual_vs_predicted = evaluate_model_performance(config, regressor, train_x, train_y, test_x, test_y, idvs, dv, feat_importance, shap_df, search, group)
            final_rf_results.append(results_all_model)
            final_act_pred_results.append(actual_vs_predicted)



("BEGGIN'", 'DOG FOOD')
advocacy
training
The best hyperparameters for RandomForest are {'max_depth': 3, 'max_features': 4, 'n_estimators': 50, 'random_state': 42}
[np.str_('directions_funnel_metrics_advocacy_t2b_buyers'), np.str_('directions_strategic_measures_brand_love_index'), np.str_('social_percent_positive_neutral')]
                                                Feature Importance
metric                                                            
social_percent_positive_neutral                           0.389802
directions_funnel_metrics_advocacy_t2b_buyers             0.309841
directions_strategic_measures_brand_love_index            0.300357
RandomForest SHAP importance                                                 shap values
metric                                                     
social_percent_positive_neutral                    0.000087
directions_strategic_measures_brand_love_index     0.000080
directions_funnel_metrics_advocacy_t2b_buyers      0.000071
awareness


In [None]:
import pandas as pd

def train_and_evaluate_group_models(config, scaled_data, idv_list):
    """
    Function to train and evaluate models for each group in the scaled data,
    then return concatenated results.

    Args:
        config (dict): Configuration dictionary with necessary columns and parameters.
        scaled_data (pd.DataFrame): The input data to train and evaluate the models on.
        idv_list (pd.DataFrame): The individual variable data for different equity pillars.

    Returns:
        pd.DataFrame: Concatenated results for both models and actual vs predicted values.
    """
    # Prepare the group list and initialize result containers
    group_list = [config['date_column']] + config['data_prep_group_var']
    final_rf_results = []  # To store results from all groups and equity pillars
    final_act_pred_results = []

    # Create the pillar to IDV dictionary
    pillar_idv_dict = idv_list.groupby('equity_pillar')['idv'].apply(list).to_dict()

    # Loop through each group in the scaled data
    for group, group_df in scaled_data.groupby(config['data_prep_group_var']):
        print(f"Processing group: {group}")
        model_data = group_df.copy()
        model_data = model_data.pivot(index=group_list, columns='metric', values='value').reset_index()

        # Loop through each pillar
        for pillar in pillar_idv_dict.keys():
            print(f"Processing pillar: {pillar}")
            # Filter columns available for the current pillar
            col_available = [col for col in model_data if col in pillar_idv_dict[pillar]]
            columns_model = group_list + col_available + [config["dv_column"]]

            if col_available:  # Proceed only if there are relevant columns
                # Filter data for CFA
                print("Training model...")
                model_idv_dv_df = model_data[columns_model]
                train_x, test_x, train_y, test_y, idvs, dv = model_data_prep(model_idv_dv_df, col_available, config)

                # Train and evaluate the model
                regressor, feat_importance, shap_df, search = train_and_evaluate_model(config, train_x, train_y)
                results_all_model, actual_vs_predicted = evaluate_model_performance(
                    config, regressor, train_x, train_y, test_x, test_y, idvs, dv,
                    feat_importance, shap_df, search, group
                )

                # Append results to the final lists
                final_rf_results.append(results_all_model)
                final_act_pred_results.append(actual_vs_predicted)

    # Concatenate the results
    final_rf_results_df = pd.concat(final_rf_results, axis=0, ignore_index=True)
    final_act_pred_results_df = pd.concat(final_act_pred_results, axis=0, ignore_index=True)

    return final_rf_results_df, final_act_pred_results_df


In [245]:
final_rf_df = pd.concat(final_rf_results, axis=0, ignore_index=True)
final_act_pred_df = pd.concat(final_act_pred_results, axis=0, ignore_index=True)


In [247]:

final_rf_df.to_csv(r"D:\BRAND_HUB_PROJECT\brandhub-capability\src\BHC_Capability\output\rf_fit_data.csv", index= False)
final_act_pred_df.to_csv(r"D:\BRAND_HUB_PROJECT\brandhub-capability\src\BHC_Capability\output\rf_act_pred_data.csv", index= False)

In [248]:
results_all_model

Unnamed: 0,metric,feature_importance,shap_values,model_type,latest_dv,r2_score_train,mape_train,r2_score_fold,mape_fold,r2_score_hold_out,mape_hold_out,r2_score_all,mape_all,best_params_gridsearchcv,brand,category,pillar
0,ratings_reviews_review_rating_average,0.23713,0.000409,RandomForest,0.060086,0.641974,0.021128,-0.127677,0.03257,-0.094905,0.033734,0.591666,0.02231,"{'max_depth': 4, 'max_features': 4, 'n_estimat...",TEMPTATIONS,CAT FOOD,product_feedback
1,ratings_reviews_review_sentiment_score_health_...,0.186356,0.000454,RandomForest,0.060086,0.641974,0.021128,-0.127677,0.03257,-0.094905,0.033734,0.591666,0.02231,"{'max_depth': 4, 'max_features': 4, 'n_estimat...",TEMPTATIONS,CAT FOOD,product_feedback
2,ratings_reviews_positive_ratings_percentage,0.165127,0.000403,RandomForest,0.060086,0.641974,0.021128,-0.127677,0.03257,-0.094905,0.033734,0.591666,0.02231,"{'max_depth': 4, 'max_features': 4, 'n_estimat...",TEMPTATIONS,CAT FOOD,product_feedback
3,ratings_reviews_review_sentiment_score_value_f...,0.104672,0.000193,RandomForest,0.060086,0.641974,0.021128,-0.127677,0.03257,-0.094905,0.033734,0.591666,0.02231,"{'max_depth': 4, 'max_features': 4, 'n_estimat...",TEMPTATIONS,CAT FOOD,product_feedback
4,ratings_reviews_review_sentiment_score_pet_enj...,0.102816,0.000135,RandomForest,0.060086,0.641974,0.021128,-0.127677,0.03257,-0.094905,0.033734,0.591666,0.02231,"{'max_depth': 4, 'max_features': 4, 'n_estimat...",TEMPTATIONS,CAT FOOD,product_feedback
5,ratings_reviews_review_sentiment_score_ingredi...,0.101828,0.000211,RandomForest,0.060086,0.641974,0.021128,-0.127677,0.03257,-0.094905,0.033734,0.591666,0.02231,"{'max_depth': 4, 'max_features': 4, 'n_estimat...",TEMPTATIONS,CAT FOOD,product_feedback
6,ratings_reviews_review_sentiment_score_average,0.09044,0.000102,RandomForest,0.060086,0.641974,0.021128,-0.127677,0.03257,-0.094905,0.033734,0.591666,0.02231,"{'max_depth': 4, 'max_features': 4, 'n_estimat...",TEMPTATIONS,CAT FOOD,product_feedback
7,ratings_reviews_good_experience_percentage,0.011632,1.6e-05,RandomForest,0.060086,0.641974,0.021128,-0.127677,0.03257,-0.094905,0.033734,0.591666,0.02231,"{'max_depth': 4, 'max_features': 4, 'n_estimat...",TEMPTATIONS,CAT FOOD,product_feedback


In [9]:
#Parallel processing
from joblib import Parallel, delayed
import pandas as pd
import gc

def train_and_evaluate_group_models_(config, scaled_data, idv_list):
    """Optimized function for parallel group processing and efficient model training."""
    # Prepare data once upfront
    group_list = [config["date_column"]] + config["data_prep_group_var"]
    pillar_idv_dict = (
        idv_list.groupby("equity_pillar")["idv"].apply(list).to_dict()
    )

    # Pre-pivot entire dataset (critical optimization)
    pivoted_data = scaled_data.pivot(
        index=group_list, columns="metric", values="value"
    ).reset_index()

    # Parallel processing of groups
    results = Parallel(n_jobs=-1, prefer="processes")(
        delayed(process_single_group)(
            group, group_df, config, pillar_idv_dict, group_list
        )
        for group, group_df in pivoted_data.groupby(
            config["data_prep_group_var"]
        )
    )

    # Combine results from all workers
    final_rf_results, final_act_pred_results = zip(*results)
    final_rf_results_df = pd.concat(final_rf_results, ignore_index=True)
    final_act_pred_results_df = pd.concat(
        final_act_pred_results, ignore_index=True
    )

    # Save results
    # final_rf_results_df.to_csv(paths["rf_fit_data_path"], index=False)
    # final_act_pred_results_df.to_csv(
    #     paths["rf_act_pred_data_path"], index=False
    # )

    return final_rf_results_df, final_act_pred_results_df


def process_single_group(
    group, model_data, config, pillar_idv_dict, group_list
):
    """Process a single group in parallel."""
    print(f"Processing group: {group}")
    group_results = []
    group_act_pred = []
    model_data = model_data.dropna(axis=1, how="all")
    for pillar in pillar_idv_dict.keys():
        print(f"Processing pillar: {pillar}")
        col_available = [
            col for col in model_data if col in pillar_idv_dict[pillar]
        ]

        if not col_available:
            continue

        try:
            # Efficient data preparation
            model_idv_dv_df = model_data[
                group_list + col_available + [config["dv_column"]]
            ]
            train_x, test_x, train_y, test_y, idvs, dv = model_data_prep(
                model_idv_dv_df, col_available, config
            )

            # Optimized model training
            regressor, feat_importance, shap_df, search = (
                train_and_evaluate_model(config, train_x, train_y)
            )

            # Memory-efficient evaluation
            results, actual_pred = evaluate_model_performance(
                config,
                regressor,
                train_x,
                train_y,
                test_x,
                test_y,
                idvs,
                dv,
                feat_importance,
                shap_df,
                search,
                group,
                pillar,
            )

            group_results.append(results)
            group_act_pred.append(actual_pred)

        except Exception as e:
            print(f"Error processing {group}-{pillar}: {str(e)}")

        # Clean up memory
        del train_x, test_x, train_y, test_y
        gc.collect()

    return (
        (
            pd.concat(group_results, ignore_index=True)
            if group_results
            else pd.DataFrame()
        ),
        (
            pd.concat(group_act_pred, ignore_index=True)
            if group_act_pred
            else pd.DataFrame()
        ),
    )

### Post Modeling

In [5]:
import numpy as np
import pandas as pd



In [6]:
cfa_df = pd.read_csv(r'D:\BRAND_HUB_PROJECT\brandhub-capability\src\BHC_Capability\output\cfa_fit_data.csv')
rf_df = pd.read_csv(r'D:\BRAND_HUB_PROJECT\brandhub-capability\src\BHC_Capability\output\rf_fit_data.csv')

In [7]:
cfa_df.head()

Unnamed: 0,brand,category,seed,lhs,op,rhs,est.std,se,z,pvalue,factor_str,cfi,tli,rmsea
0,BEGGIN',DOG FOOD,2,advocacy,=~,directions_funnel_metrics_advocacy_t2b_buyers,1.0,-,-,-,advocacy =~ directions_funnel_metrics_advocacy...,0.688131,1.0,0.0
1,BEGGIN',DOG FOOD,2,advocacy,=~,directions_strategic_measures_brand_love_index,0.227565,0.0683915765868133,3.3273832645852384,0.000876657103834555,advocacy =~ directions_funnel_metrics_advocacy...,0.688131,1.0,0.0
2,BEGGIN',DOG FOOD,2,advocacy,=~,social_percent_positive_neutral,-0.079943,0.028481887927253726,-2.8067878567429756,0.0050038183051075436,advocacy =~ directions_funnel_metrics_advocacy...,0.688131,1.0,0.0
3,BEGGIN',DOG FOOD,2,advocacy,~~,advocacy,0.049732,0.015476752682421916,3.213364023521866,0.0013118990063356595,advocacy =~ directions_funnel_metrics_advocacy...,0.688131,1.0,0.0
4,BEGGIN',DOG FOOD,2,directions_funnel_metrics_advocacy_t2b_buyers,~~,directions_funnel_metrics_advocacy_t2b_buyers,0.000176,0.013593630889584138,0.012961605802385168,0.9896584244175497,advocacy =~ directions_funnel_metrics_advocacy...,0.688131,1.0,0.0


In [8]:
rf_df.head()

Unnamed: 0,brand,category,pillar,metric,feature_importance,shap_values,model_type,latest_dv,r2_score_train,mape_train,r2_score_fold,mape_fold,r2_score_hold_out,mape_hold_out,r2_score_all,mape_all,best_params_gridsearchcv
0,BEGGIN',DOG FOOD,advocacy,directions_funnel_metrics_advocacy_t2b_buyers,0.34569,6.7e-05,RandomForest,0.008493,0.483842,0.027938,0.098157,0.0368,-0.331477,0.031424,0.447602,0.028265,"{'max_depth': 3, 'max_features': 2, 'n_estimat..."
1,BEGGIN',DOG FOOD,advocacy,directions_strategic_measures_brand_love_index,0.339414,8.4e-05,RandomForest,0.008493,0.483842,0.027938,0.098157,0.0368,-0.331477,0.031424,0.447602,0.028265,"{'max_depth': 3, 'max_features': 2, 'n_estimat..."
2,BEGGIN',DOG FOOD,advocacy,social_percent_positive_neutral,0.314895,7.5e-05,RandomForest,0.008493,0.483842,0.027938,0.098157,0.0368,-0.331477,0.031424,0.447602,0.028265,"{'max_depth': 3, 'max_features': 2, 'n_estimat..."
3,BEGGIN',DOG FOOD,awareness,social_share_of_total_buzz_post,0.395026,6.8e-05,RandomForest,0.008493,0.607507,0.023787,0.08049,0.03388,0.019645,0.041715,0.552282,0.025468,"{'max_depth': 4, 'max_features': 2, 'n_estimat..."
4,BEGGIN',DOG FOOD,awareness,directions_awareness_total_awareness_net_mentions,0.3146,8.1e-05,RandomForest,0.008493,0.607507,0.023787,0.08049,0.03388,0.019645,0.041715,0.552282,0.025468,"{'max_depth': 4, 'max_features': 2, 'n_estimat..."


In [9]:
def data_merge(cfa_df, rf_df, idv_list, config):
    data_group = config['data_prep_group_var']
    cfa_filtered = cfa_df[cfa_df['lhs'].isin(idv_list['equity_pillar'].unique()) & (cfa_df['op'] == '=~') & (cfa_df['seed'] == config['cfa_seed'])]
    cfa_filtered.rename(columns ={'lhs':'pillar', 'rhs':'metric'}, inplace = True)
    cfa_filtered[config['cfa_target_col']] = cfa_filtered[config['cfa_target_col']].abs()
    cfa_filtered = cfa_filtered[data_group + ['pillar', 'metric'] + [config['cfa_target_col']]]
    rf_df['feature_importance'] = rf_df['feature_importance'].abs()
    rf_filtered = rf_df[data_group + ['pillar', 'metric']+ [config['rf_target_col']]]
    merged_df = cfa_filtered.merge(rf_filtered, on = data_group + ["pillar", 'metric'], how = 'inner')

    return merged_df



In [10]:
def data_normalize_weight(scores_merged, config):
    # Normalize cfa_value and shap_values within each group
    scores_merged[config['cfa_target_col']] = scores_merged.groupby( config['data_prep_group_var'] + ['pillar'])[config['cfa_target_col']].transform(
        lambda x: x / x.sum()
    )
    scores_merged[config['rf_target_col']] = scores_merged.groupby(config['data_prep_group_var'] + ['pillar'])[config['rf_target_col']].transform(
        lambda x: x / x.sum()
    )
    scores_merged['weight'] = (scores_merged[config['cfa_target_col']] + scores_merged[config['rf_target_col']]) / 2

    return scores_merged

In [11]:
merged_df = data_merge(cfa_df, rf_df, idv_list, config)
scores = data_normalize_weight(merged_df, config)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cfa_filtered.rename(columns ={'lhs':'pillar', 'rhs':'metric'}, inplace = True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cfa_filtered[config['cfa_target_col']] = cfa_filtered[config['cfa_target_col']].abs()


In [12]:
def pillar_creation(scores, scaled_data, config):
    merged_df = pd.merge(
    scores,
    scaled_data,
    on=['brand', 'category', 'metric'],
    how='inner'  # Use 'inner' to drop metrics without weights
    )

    merged_df['score'] = merged_df['value'] * merged_df['weight']
    pillar_scores = merged_df.groupby([config['date_column']]+ config['data_prep_group_var']+ ['pillar'])['score'].sum().reset_index()
    return pillar_scores



In [16]:
pillar_data = pillar_creation(scores, scaled_data, config)

In [17]:

def trend_past_creation(pillar_data, config):
    trend_past = pillar_data.sort_values(by=config['data_prep_group_var'] + ['pillar', config['date_column']]).copy()

    # Define window size (e.g., 2-day rolling average)
    window_size = config['trend_past_rolling_window']  # Adjust this value as needed

    # Calculate rolling average within groups
    trend_past['trend_past'] = (
        trend_past
        .groupby(['brand', 'category', 'pillar'])['score']
        .transform(lambda x: x.rolling(window=window_size, min_periods=1).mean())
    )

    return trend_past


In [18]:
trend_data = trend_past_creation(pillar_data, config)

In [19]:
def scaled_score_creation(pillar_data, config):
    df = pillar_data.copy()
    df['group_mean'] = df.groupby([config['scale_level'], config['date_column']])['score'].transform('mean')

    # Scale scores such that group mean = 100
    df['scaled_score'] = (df['score'] / df['group_mean']) * 100

    # Drop intermediate column (optional)
    df = df.drop(columns=['group_mean'])

    return df

In [124]:
scaled_pillar_data = scaled_score_creation(pillar_data, config)

### Importance Model

In [46]:
dv_data = scaled_data[scaled_data['metric']== config['dv_column']].pivot(index = [config['date_column']] + config['data_prep_group_var'], columns="metric", values="value"
).reset_index()


In [47]:
pillar_pivot = trend_data.pivot(index = [config['date_column']] + config['data_prep_group_var'], columns="pillar", values="trend_past"
).reset_index()


In [48]:
model_dv_idv_df = pd.merge(pillar_pivot, dv_data, on=[config['date_column']] + config['data_prep_group_var'], how = 'inner')

In [54]:
importance_col_available = idv_list['equity_pillar'].unique()
importance_col_available

array(['awareness', 'brand_perceptions', 'consideration', 'loyalty',
       'advocacy', 'product_feedback'], dtype=object)

In [50]:
def importance_model_data_prep(model_idv_dv_df, col_available, config, test_size_ratio=0.1, shuffle=True):
    """
    Prepare model data by splitting into training and testing sets.

    Parameters:
    model_idv_dv_df (DataFrame): The dataset containing independent and dependent variables.
    col_available (list): List of column names to be used as independent variables.
    config (dict): Configuration dictionary containing the dependent variable column name.
    test_size_ratio (float): Proportion of the dataset to be used as the test set.
    shuffle (bool): Whether to shuffle the data before splitting.

    Returns:
    tuple: train_x, test_x, train_y, test_y
    """
    idvs = model_idv_dv_df[col_available]
    dv = model_idv_dv_df[config["dv_column"]]

    train_x, test_x, train_y, test_y = train_test_split(
        idvs,
        dv,
        test_size=int(test_size_ratio * idvs.shape[0]),
        shuffle=shuffle,
    )

    return train_x, test_x, train_y, test_y, idvs, dv

In [77]:
def importance_train_and_evaluate_model(config, train_x, train_y):
    """
    Train and evaluate a machine learning model based on the provided configuration.

    Parameters:
    config (dict): Configuration dictionary containing model details and hyperparameters.
    train_x (DataFrame): Training features.
    train_y (Series): Training target.

    Returns:
    regressor (object): Trained model.
    feat_importance (DataFrame): Feature importance values.
    feat_df (DataFrame): SHAP feature importance values.
    """
    # Select model class based on config
    if config['importance_model_type'] == "RandomForest":
        model_class = RandomForestRegressor
        params = {
            "max_depth": config["importance_model_config"]["RandomForest"]["grid_search"]["max_depth"],
            "n_estimators": config["importance_model_config"]["RandomForest"]["grid_search"]["n_estimators"],
            "max_features": config["importance_model_config"]["RandomForest"]["grid_search"]["max_features"],
            "random_state": [config["importance_model_config"]["RandomForest"]["grid_search"]["random_state"]],
        }
        model = RandomForestRegressor(
            random_state=[config["importance_model_config"]["RandomForest"]["random_state"]]
        )
        grid_search = GridSearchCV(
            model,
            params,
            cv=config['cross_validation_number'],
            scoring=["r2", "neg_mean_absolute_percentage_error"],
            refit="neg_mean_absolute_percentage_error",
        )

    grid_search.fit(train_x, train_y)

    print(f"The best hyperparameters for {config['importance_model_type']} are {grid_search.best_params_}")

    # Train model with best parameters
    regressor = model_class(**grid_search.best_params_)
    regressor.fit(train_x, train_y)

    # Feature importance and SHAP values (only for RandomForest)
    feat_importance = None
    shap_df = None
    if config['importance_model_type'] == "RandomForest":
        features = list(train_x.columns)
        f_i = list(zip(features, regressor.feature_importances_))
        f_i.sort(key=lambda x: x[1], reverse=True)

        rfe = RFECV(
            regressor,
            cv=config["cross_validation_number"],
            scoring="neg_mean_absolute_percentage_error",
        )
        rfe.fit(train_x, train_y)
        selected_features = list(np.array(features)[rfe.get_support()])
        print(selected_features)

        feat_importance = pd.DataFrame(f_i, columns=["metric", "Feature Importance"])
        feat_importance.set_index("metric", inplace=True)
        print(feat_importance)

        # Compute SHAP values
        explainer = shap.TreeExplainer(regressor)
        shap_values = explainer.shap_values(train_x)
        shap_df = pd.DataFrame(
            np.abs(pd.DataFrame(shap_values, columns=train_x.columns)).mean(),
            columns=["shap values"],
        )
        print(f"{config['model_type']} SHAP importance", shap_df.sort_values(by="shap values", ascending=False))

    return regressor, feat_importance, shap_df, grid_search


In [95]:
def importance_evaluate_model_performance(config, regressor, train_x, train_y, test_x, test_y, idvs, dv, feat_importance, shap_df, search, group):
    """
    Evaluate model performance and compute various metrics.

    Parameters:
    config (dict): Configuration dictionary containing model details and hyperparameters.
    regressor (object): Trained model.
    train_x (DataFrame): Training features.
    train_y (Series): Training target.
    test_x (DataFrame): Test features.
    test_y (Series): Test target.
    idvs (DataFrame): Independent variables used for the entire dataset.
    dv (Series): Dependent variable.
    feat_importance (DataFrame): Feature importance values.
    shap_df (DataFrame): SHAP feature importance values.
    search (GridSearchCV): Grid search object containing best parameters.

    Returns:
    results_all_model (DataFrame): Model evaluation results and feature importance.
    actual_vs_predicted (DataFrame): Actual vs predicted values for the full dataset.
    """
    # Generate Predictions
    y_pred_train = regressor.predict(train_x)
    y_pred_test = regressor.predict(test_x)
    y_pred_all = regressor.predict(idvs)

    # Compute Metrics
    mae_train = metrics.mean_absolute_error(train_y, y_pred_train)
    mse_train = metrics.mean_squared_error(train_y, y_pred_train)
    rmse_train = np.sqrt(mse_train)
    r2_train = metrics.r2_score(train_y, y_pred_train)
    mape_train = mean_absolute_percentage_error(train_y, y_pred_train)

    # Merge Feature Importance and SHAP values
    results_all_model = pd.concat([feat_importance, shap_df], axis=1).reset_index().rename(
        columns={"Feature Importance": "feature_importance", "index": "shap_features", "shap values": "shap_values"}
    )

    # Add model performance metrics
    results_all_model["model_type"] = config['model_type']
    results_all_model["latest_dv"] = dv.values[-1]
    results_all_model["r2_score_train"] = r2_train
    results_all_model["mape_train"] = mape_train

    # Cross-validation metrics
    results_all_model["r2_score_fold"] = cross_val_score(
        regressor, train_x, train_y, cv=config["cross_validation_number"], scoring="r2"
    ).mean()
    results_all_model["mape_fold"] = (
        cross_val_score(
            regressor, train_x, train_y, cv=config["cross_validation_number"], scoring="neg_mean_absolute_percentage_error"
        ).mean() * -1
    )

    # Hold-out test set metrics
    results_all_model["r2_score_hold_out"] = metrics.r2_score(test_y, y_pred_test)
    results_all_model["mape_hold_out"] = mean_absolute_percentage_error(test_y, y_pred_test)

    # Overall dataset metrics
    results_all_model["r2_score_all"] = metrics.r2_score(dv, y_pred_all)
    results_all_model["mape_all"] = mean_absolute_percentage_error(dv, y_pred_all)

    # Store best parameters from grid search
    results_all_model["best_params_gridsearchcv"] = str(search.best_params_)

    # Create Actual vs Predicted DataFrame
    actual_vs_predicted = pd.DataFrame({"actual": dv, "predicted": y_pred_all})
    actual_vs_predicted["model_type"] = config['model_type']

    for i, x in enumerate(config['data_prep_group_var']):
        results_all_model[x] = group[i]
        actual_vs_predicted[x] = group[i]


    return results_all_model, actual_vs_predicted


In [None]:
def importance_train_and_evaluate_models(scaled_data, trend_data, idv_list, config):
    """Train and evaluate models sequentially and store results in DataFrames."""

    # Prepare dependent variable data
    dv_data = scaled_data[scaled_data['metric'] == config['dv_column']].pivot(
        index=[config['date_column']] + config['data_prep_group_var'],
        columns="metric",
        values="value"
    ).reset_index()

    # Prepare pillar trend data
    pillar_pivot = trend_data.pivot(
        index=[config['date_column']] + config['data_prep_group_var'],
        columns="pillar",
        values="trend_past"
    ).reset_index()

    # Merge datasets
    model_dv_idv_df = pd.merge(
        pillar_pivot,
        dv_data,
        on=[config['date_column']] + config['data_prep_group_var'],
        how='inner'
    )

    # Identify available importance columns
    importance_col_available = idv_list['equity_pillar'].unique()

    final_rf_results = []
    final_act_pred_results = []

    for group, group_df in model_dv_idv_df.groupby(config['data_prep_group_var']):
        train_x, test_x, train_y, test_y, idvs, dv = importance_model_data_prep(group_df, importance_col_available, config)
        regressor, feat_importance, shap_df, search = importance_train_and_evaluate_model(config, train_x, train_y)
        results_all_model, actual_vs_predicted = importance_evaluate_model_performance(
            config, regressor, train_x, train_y, test_x, test_y, idvs, dv, feat_importance, shap_df, search, group
        )

        final_rf_results.append(results_all_model)
        final_act_pred_results.append(actual_vs_predicted)

    # Concatenating all results into single DataFrames
    final_rf_results_df = pd.concat(final_rf_results, ignore_index=True)
    final_act_pred_results_df = pd.concat(final_act_pred_results, ignore_index=True)

    return final_rf_results_df, final_act_pred_results_df


In [84]:
final_rf_results[0]

Unnamed: 0,shap_features,feature_importance,shap_values,model_type,latest_dv,r2_score_train,mape_train,r2_score_fold,mape_fold,r2_score_hold_out,mape_hold_out,r2_score_all,mape_all,best_params_gridsearchcv,brand,category
0,advocacy,0.203978,4.4e-05,RandomForest,0.008493,0.822621,0.01685,0.123899,0.034081,-0.142614,0.040098,0.746919,0.01903,"{'max_depth': 6, 'max_features': 2, 'n_estimat...",BEGGIN',DOG FOOD
1,product_feedback,0.192039,7.8e-05,RandomForest,0.008493,0.822621,0.01685,0.123899,0.034081,-0.142614,0.040098,0.746919,0.01903,"{'max_depth': 6, 'max_features': 2, 'n_estimat...",BEGGIN',DOG FOOD
2,loyalty,0.178607,3.6e-05,RandomForest,0.008493,0.822621,0.01685,0.123899,0.034081,-0.142614,0.040098,0.746919,0.01903,"{'max_depth': 6, 'max_features': 2, 'n_estimat...",BEGGIN',DOG FOOD
3,awareness,0.172683,5.6e-05,RandomForest,0.008493,0.822621,0.01685,0.123899,0.034081,-0.142614,0.040098,0.746919,0.01903,"{'max_depth': 6, 'max_features': 2, 'n_estimat...",BEGGIN',DOG FOOD
4,brand_perceptions,0.157299,5.5e-05,RandomForest,0.008493,0.822621,0.01685,0.123899,0.034081,-0.142614,0.040098,0.746919,0.01903,"{'max_depth': 6, 'max_features': 2, 'n_estimat...",BEGGIN',DOG FOOD
5,consideration,0.095394,3.1e-05,RandomForest,0.008493,0.822621,0.01685,0.123899,0.034081,-0.142614,0.040098,0.746919,0.01903,"{'max_depth': 6, 'max_features': 2, 'n_estimat...",BEGGIN',DOG FOOD


In [99]:
import pandas as pd
from joblib import Parallel, delayed

def importance_process_group(group_df, group, importance_col_available, config):
    """Process a single group to train and evaluate the model."""
    # Prepare data
    train_x, test_x, train_y, test_y, idvs, dv = importance_model_data_prep(
        group_df, importance_col_available, config
    )
    # Train model
    regressor, feat_importance, shap_df, search = importance_train_and_evaluate_model(
        config, train_x, train_y
    )
    # Evaluate performance
    results_all_model, actual_vs_predicted = importance_evaluate_model_performance(
        config, regressor, train_x, train_y, test_x, test_y,
        idvs, dv, feat_importance, shap_df, search, group
    )
    return results_all_model, actual_vs_predicted

def importance_run_parallel_processing(scaled_data, trend_data, idv_list, config):
    """
    Execute the data preparation and model training/evaluation
    with parallel processing across groups.
    """
    # Prepare dependent variable data
    dv_data = scaled_data[scaled_data['metric'] == config['dv_column']].pivot(
        index=[config['date_column']] + config['data_prep_group_var'],
        columns="metric",
        values="value"
    ).reset_index()

    # Prepare pillar trend data
    pillar_pivot = trend_data.pivot(
        index=[config['date_column']] + config['data_prep_group_var'],
        columns="pillar",
        values="trend_past"
    ).reset_index()

    # Merge datasets
    model_dv_idv_df = pd.merge(
        pillar_pivot,
        dv_data,
        on=[config['date_column']] + config['data_prep_group_var'],
        how='inner'
    )

    # Identify available importance columns
    importance_col_available = idv_list['equity_pillar'].unique()


    # Parallel execution across groups
    results = Parallel(n_jobs=-1, verbose=10)(
        delayed(importance_process_group)(
            group_df, group, importance_col_available, config
        ) for group, group_df in model_dv_idv_df.groupby(config['data_prep_group_var'])
    )

    # Unpack results
    final_rf_results, final_act_pred_results = zip(*results)

    final_rf_results_df = pd.concat(final_rf_results, ignore_index=True)
    final_act_pred_results_df = pd.concat(
        final_act_pred_results, ignore_index=True
    )

    # Save results
    # final_rf_results_df.to_csv(paths["rf_fit_data_path"], index=False)
    # final_act_pred_results_df.to_csv(
    #     paths["rf_act_pred_data_path"], index=False
    # )

    return final_rf_results_df, final_act_pred_results_df

In [105]:
imp_rf_df, imp_rf_act_pred_df = importance_run_parallel_processing(
    scaled_data, trend_data, idv_list, config
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 18 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   8 | elapsed:  1.5min remaining:  4.5min
[Parallel(n_jobs=-1)]: Done   3 out of   8 | elapsed:  1.5min remaining:  2.5min
[Parallel(n_jobs=-1)]: Done   4 out of   8 | elapsed:  1.5min remaining:  1.5min
[Parallel(n_jobs=-1)]: Done   5 out of   8 | elapsed:  1.5min remaining:   54.7s
[Parallel(n_jobs=-1)]: Done   6 out of   8 | elapsed:  1.5min remaining:   30.4s
[Parallel(n_jobs=-1)]: Done   8 out of   8 | elapsed:  1.8min finished


### Score Card

In [114]:
scaled_data

Unnamed: 0,date,brand,category,metric,value
0,2022-08-13,BEGGIN',DOG FOOD,market_share,0.008066
1,2022-08-13,PEDIGREE,DOG FOOD,market_share,0.070409
2,2022-08-13,TEMPTATIONS,CAT FOOD,market_share,0.055228
3,2022-08-20,BEGGIN',DOG FOOD,market_share,0.007958
4,2022-08-20,BLUE BUFFALO,DOG FOOD,market_share,0.074323
...,...,...,...,...,...
38395,2024-02-24,BLUE BUFFALO,CAT FOOD,directions_brand_attributes_is_too_cheap_to_be...,0.267546
38396,2024-02-24,TEMPTATIONS,CAT FOOD,directions_brand_attributes_is_too_cheap_to_be...,0.305562
38397,2024-03-02,PEDIGREE,DOG FOOD,directions_brand_attributes_is_too_cheap_to_be...,0.496763
38398,2024-03-09,BEGGIN',DOG FOOD,directions_brand_attributes_is_too_cheap_to_be...,0.463400


In [112]:
pillar_weights = scores.drop(['est.std', 'shap_values'], axis= 1)
pillar_weights

Unnamed: 0,brand,category,pillar,metric,weight
0,BEGGIN',DOG FOOD,advocacy,directions_funnel_metrics_advocacy_t2b_buyers,0.530139
1,BEGGIN',DOG FOOD,advocacy,directions_strategic_measures_brand_love_index,0.273145
2,BEGGIN',DOG FOOD,advocacy,social_percent_positive_neutral,0.196716
3,BEGGIN',DOG FOOD,awareness,directions_awareness_total_awareness_net_mentions,0.425634
4,BEGGIN',DOG FOOD,awareness,directions_awareness_unaided_awareness_net_men...,0.397216
...,...,...,...,...,...
280,TEMPTATIONS,CAT FOOD,product_feedback,ratings_reviews_review_sentiment_score_average,0.153933
281,TEMPTATIONS,CAT FOOD,product_feedback,ratings_reviews_review_sentiment_score_health_...,0.205776
282,TEMPTATIONS,CAT FOOD,product_feedback,ratings_reviews_review_sentiment_score_ingredi...,0.047832
283,TEMPTATIONS,CAT FOOD,product_feedback,ratings_reviews_review_sentiment_score_pet_enj...,0.095967


In [None]:
score_card_df  = pd.merge(
    pillar_weights,
    scaled_data,
    on=config['data_prep_group_var']+['metric'],
    how='inner'  # Use 'inner' to drop metrics without weights
    )

score_card_df["metric_contribution"] = score_card_df["value"] * score_card_df["weight"]

score_card_df[config['date_column']] = pd.to_datetime(score_card_df["date"], format="%Y-%m-%d")
score_card_df["year"] = score_card_df["date"].dt.year
score_card_df["month"] = score_card_df["date"].dt.month

score_card_final_df = pd.merge(score_card_df,
         scaled_pillar_data)

In [130]:
score_card_final_df

Unnamed: 0,brand,category,pillar,metric,weight,date,value,metric_contribution,year,month,score,scaled_score
0,BEGGIN',DOG FOOD,advocacy,directions_funnel_metrics_advocacy_t2b_buyers,0.530139,2022-08-13,0.823293,0.436460,2022,8,0.795621,129.442711
1,BEGGIN',DOG FOOD,advocacy,directions_funnel_metrics_advocacy_t2b_buyers,0.530139,2022-08-20,0.823293,0.436460,2022,8,0.809719,131.265328
2,BEGGIN',DOG FOOD,advocacy,directions_funnel_metrics_advocacy_t2b_buyers,0.530139,2022-08-27,0.823293,0.436460,2022,8,0.809071,135.427512
3,BEGGIN',DOG FOOD,advocacy,directions_funnel_metrics_advocacy_t2b_buyers,0.530139,2022-12-10,0.840237,0.445442,2022,12,0.801900,138.897480
4,BEGGIN',DOG FOOD,advocacy,directions_funnel_metrics_advocacy_t2b_buyers,0.530139,2023-02-04,0.796484,0.422247,2023,2,0.789022,135.157636
...,...,...,...,...,...,...,...,...,...,...,...,...
27355,TEMPTATIONS,CAT FOOD,product_feedback,ratings_reviews_review_sentiment_score_value_f...,0.106400,2023-07-22,0.776515,0.082621,2023,7,0.708105,113.009929
27356,TEMPTATIONS,CAT FOOD,product_feedback,ratings_reviews_review_sentiment_score_value_f...,0.106400,2023-07-29,0.813725,0.086580,2023,7,0.669401,103.665444
27357,TEMPTATIONS,CAT FOOD,product_feedback,ratings_reviews_review_sentiment_score_value_f...,0.106400,2024-04-27,0.726190,0.077267,2024,4,0.695157,100.045987
27358,TEMPTATIONS,CAT FOOD,product_feedback,ratings_reviews_review_sentiment_score_value_f...,0.106400,2023-04-15,0.934211,0.099400,2023,4,0.660509,100.829252


In [None]:
    filtered_imp_model_results = imp_rf_df[
        imp_rf_df["model_type"] == config['importance_model_type']
    ][["brand", "category", "shap_features", "shap_values"]].copy()

In [138]:
# Calculate the sum of SHAP values and relative importance
sum_shap_values = filtered_imp_model_results.groupby(
    config['data_prep_group_var']
)["shap_values"].transform("sum")
filtered_imp_model_results["relative_importance"] = (
    filtered_imp_model_results["shap_values"] / sum_shap_values
)

In [139]:
filtered_imp_model_results

Unnamed: 0,brand,category,shap_features,shap_values,relative_importance
0,BEGGIN',DOG FOOD,product_feedback,7.3e-05,0.306847
1,BEGGIN',DOG FOOD,awareness,6e-05,0.252861
2,BEGGIN',DOG FOOD,advocacy,3.6e-05,0.151224
3,BEGGIN',DOG FOOD,brand_perceptions,3.5e-05,0.147795
4,BEGGIN',DOG FOOD,loyalty,1.4e-05,0.059549
5,BEGGIN',DOG FOOD,consideration,1.9e-05,0.081724
6,BLUE BUFFALO,CAT FOOD,product_feedback,0.001246,0.483153
7,BLUE BUFFALO,CAT FOOD,awareness,0.000734,0.284629
8,BLUE BUFFALO,CAT FOOD,consideration,0.000203,0.078598
9,BLUE BUFFALO,CAT FOOD,loyalty,0.00027,0.104567


In [None]:
def calculate_score_and_importance(config, pillar_weights, scaled_data, scaled_pillar_data, imp_rf_df):
    """
    Computes the score card and importance model results.

    Args:
        config (dict): Configuration settings.
        weight_scores (pd.DataFrame): DataFrame containing weights for metrics.
        scaled_data (pd.DataFrame): DataFrame with scaled values.
        scaled_pillar_data (pd.DataFrame): DataFrame containing pillar data.
        imp_rf_df (pd.DataFrame): DataFrame containing model importance results.

    Returns:
        score_card_final_df (pd.DataFrame): Final score card with calculated metric contributions.
        filtered_imp_model_results (pd.DataFrame): Processed importance model results with relative importance.
    """

    # Merge weight scores with scaled data
    score_card_df = pd.merge(
        pillar_weights,
        scaled_data,
        on=config['data_prep_group_var'] + ['metric'],
        how='inner'  # Use 'inner' to drop metrics without weights
    )

    # Compute metric contribution
    score_card_df["metric_contribution"] = score_card_df["value"] * score_card_df["weight"]

    # Convert date column to datetime format and extract year/month
    score_card_df[config['date_column']] = pd.to_datetime(score_card_df["date"], format="%Y-%m-%d")
    score_card_df["year"] = score_card_df["date"].dt.year
    score_card_df["month"] = score_card_df["date"].dt.month

    # Merge with scaled pillar data
    score_card_final_df = pd.merge(score_card_df, scaled_pillar_data, how="inner")

    # Filter model importance results
    filtered_imp_model_results = imp_rf_df[
        imp_rf_df["model_type"] == config['importance_model_type']
    ][["brand", "category", "shap_features", "shap_values"]].copy()

    # Calculate the sum of SHAP values and relative importance
    sum_shap_values = filtered_imp_model_results.groupby(config['data_prep_group_var'])["shap_values"].transform("sum")
    filtered_imp_model_results["relative_importance"] = (
        filtered_imp_model_results["shap_values"] / sum_shap_values
    )

    return score_card_final_df, filtered_imp_model_results
