<a href="https://colab.research.google.com/github/vivek6311/Artificial-Intelligence-with-Python/blob/master/Vape%2BCallibration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

In [4]:
df = pd.read_excel("/content/Vape_Plus_Wave1_2024W2_UK_Fresh+Recontact_Excel 2.xlsx")

In [5]:
df_Disp_Brands = pd.read_excel("/content/UK-Disp_Brands 1.xlsx")

In [6]:
df_RCS_Brands = pd.read_excel("/content/UK-RCS_Brands 1.xlsx")

In [7]:
df_OS_Brands = pd.read_excel("/content/UK-OS_Brands 1.xlsx")

In [8]:
df = df.iloc[1:]

In [9]:
# extract and rename columns based on a pattern
def extract_and_rename(df, brand_pattern, awc_pattern, pc_pattern=None, brand_prefix='brand', awc_prefix='awc', pc_prefix='pc'):
    # Extract Unique_serial column
    unique_serial_col = df[['Unique_serial', 'Weight']]

    # Extract brand columns based on the brand pattern and rename them
    brand_cols = df.filter(like=brand_pattern)
    brand_cols.columns = [f"{brand_prefix}_{i+1}" for i in range(brand_cols.shape[1])]

    # Extract AWC columns based on the awc pattern and rename them
    awc_cols = df.filter(like=awc_pattern)
    awc_cols.columns = [f"{awc_prefix}_{i+1}" for i in range(awc_cols.shape[1])]

    # If pc_pattern is provided, extract PC columns and rename them
    if pc_pattern:
        pc_cols = df.filter(like=pc_pattern)
        pc_cols.columns = [f"{pc_prefix}_{i+1}" for i in range(pc_cols.shape[1])]
        # Concatenate Unique_serial with brand, awc, and pc columns
        return pd.concat([unique_serial_col, brand_cols, awc_cols, pc_cols], axis=1)

    # If no PC columns are present, concatenate Unique_serial with brand and awc columns only
    return pd.concat([unique_serial_col, brand_cols, awc_cols], axis=1)


disp_data = extract_and_rename(df, 'AllDisposbale_Brands_', 'Q40_', 'Q62_', 'disp_brand', 'disp_awc', 'disp_pc')


rcs_data = extract_and_rename(df,'AllClosedSystem_Brands_' , 'Q44_', 'Q64_', 'rcs_brand', 'rcs_awc', 'rcs_pc')


os_data = extract_and_rename(df, 'AllOpenSystem_Brands_', 'Q46_', brand_prefix='os_brand', awc_prefix='os_awc')

In [10]:
disp_brands = df_Disp_Brands["Brand_Variant"].dropna().tolist()

rcs_brands =  df_RCS_Brands["Brand_Variant"].dropna().tolist()

os_brands = df_OS_Brands["Brand_Variant"].dropna().tolist()

In [11]:
def filter_misplaced_brands(df, valid_brands):
    """
    Filters misplaced brands in a DataFrame based on a single list of valid brands.

    Parameters:
    - df (pd.DataFrame): DataFrame containing brand, AWC, and PC columns.
    - valid_brands (list): List of valid brands for the specified category.

    Returns:
    - clean_data (pd.DataFrame): DataFrame with correctly placed brands, AWC, and PC.
    - misplaced_data (pd.DataFrame): DataFrame with misplaced brands not found in the valid brand list,
                                         retaining the same shape and filling missing values with 0.
    """
    # Convert all valid brands to lowercase for case-insensitive matching
    valid_brands = [brand.strip().lower() for brand in valid_brands if isinstance(brand, str)]

    # Initialize lists to store clean and misplaced rows
    clean_rows = []
    misplaced_rows = []

    # Identify all relevant columns
    brand_cols = [col for col in df.columns if 'brand' in col.lower()]
    awc_cols = [col for col in df.columns if 'awc' in col.lower()]
    pc_cols = [col for col in df.columns if 'pc' in col.lower()]  # Identify PC columns

    # Loop through each row in the DataFrame to filter based on valid brands
    for _, row in df.iterrows():
        # Initialize clean and misplaced row structures with 'Unique_serial' column
        clean_row = {'Unique_serial': row['Unique_serial'], 'Weight': row['Weight']}
        misplaced_row = {'Unique_serial': row['Unique_serial'], 'Weight': row['Weight']}

        # Process each brand, AWC, and PC column
        for brand_col, awc_col in zip(brand_cols, awc_cols):
            brand = row[brand_col]
            awc = row[awc_col]

            # Get the corresponding PC column if it exists
            pc_col = brand_col.replace('brand', 'pc')  # Assuming a naming convention
            pc = row[pc_col] if pc_col in pc_cols else 0  # Use 0 if PC column does not exist

            # Check if brand is a string and not NA before processing
            if pd.notna(brand) and isinstance(brand, str):
                brand = brand.strip().lower()

                # Check if the brand is in the valid brand list
                if brand in valid_brands:
                    clean_row[brand_col] = row[brand_col]  # original case
                    clean_row[awc_col] = awc
                    clean_row[pc_col] = pc  # Add PC value to clean row if it exists
                    misplaced_row[brand_col] = 0
                    misplaced_row[awc_col] = 0
                    misplaced_row[pc_col] = 0  # Set PC to 0 in misplaced row
                else:
                    clean_row[brand_col] = 0
                    clean_row[awc_col] = 0
                    clean_row[pc_col] = 0  # Set PC to 0 in clean row
                    misplaced_row[brand_col] = row[brand_col]  # original case
                    misplaced_row[awc_col] = awc
                    misplaced_row[pc_col] = pc  # Keep PC value in misplaced row
            else:
                # Fill empty positions with 0 for both rows
                clean_row[brand_col] = 0
                clean_row[awc_col] = 0
                clean_row[pc_col] = 0
                misplaced_row[brand_col] = 0
                misplaced_row[awc_col] = 0
                misplaced_row[pc_col] = 0

        # Append rows to respective lists
        clean_rows.append(clean_row)
        misplaced_rows.append(misplaced_row)

    # Convert lists of dictionaries to DataFrames and retain the original column order
    ordered_columns = ['Unique_serial'] + ['Weight'] + brand_cols + awc_cols + pc_cols
    clean_data = pd.DataFrame(clean_rows).reindex(columns=ordered_columns, fill_value=0)
    misplaced_data = pd.DataFrame(misplaced_rows).reindex(columns=ordered_columns, fill_value=0)

    return clean_data, misplaced_data



clean_data_disp, misplaced_data_disp = filter_misplaced_brands(disp_data,disp_brands )

clean_data_os, misplaced_data_os = filter_misplaced_brands(os_data,os_brands )

clean_data_rcs, misplaced_data_rcs = filter_misplaced_brands(rcs_data,rcs_brands )


In [12]:
def filter_misplaced_by_categories(misplaced_df, other_category1_brands, other_category2_brands):
    """
    Filters misplaced data into multiple categories based on brand lists for two other categories.

    Parameters:
    - misplaced_df (pd.DataFrame): DataFrame containing misplaced brands, AWC, and PC columns.
    - other_category1_brands (list): List of valid brands for the first other category.
    - other_category2_brands (list): List of valid brands for the second other category.

    Returns:
    - misplaced_in_other1 (pd.DataFrame): DataFrame for misplaced brands in the first other category.
    - misplaced_in_other2 (pd.DataFrame): DataFrame for misplaced brands in the second other category.
    - truly_misplaced (pd.DataFrame): DataFrame for brands that don't match any provided brand list.
    """
    # Convert brand lists to lowercase for case-insensitive matching
    other_category1_brands = [brand.strip().lower() for brand in other_category1_brands if isinstance(brand, str)]
    other_category2_brands = [brand.strip().lower() for brand in other_category2_brands if isinstance(brand, str)]

    # Identify all brand and AWC columns based on pattern
    brand_cols = [col for col in misplaced_df.columns if 'brand' in col.lower()]
    awc_cols = [col.replace('brand', 'awc') for col in brand_cols]

    # Identify PC columns
    pc_cols = [col for col in misplaced_df.columns if 'pc' in col.lower()]

    # Initialize lists to store rows for each category
    other1_rows, other2_rows, truly_misplaced_rows = [], [], []

    # Loop through each row in misplaced DataFrame
    for _, row in misplaced_df.iterrows():
        # Initialize rows for each output category with 'Unique_serial' preserved
        other1_row = {'Unique_serial': row['Unique_serial'], 'Weight': row['Weight']}
        other2_row = {'Unique_serial': row['Unique_serial'], 'Weight': row['Weight']}
        truly_misplaced_row = {'Unique_serial': row['Unique_serial'], 'Weight': row['Weight']}

        # Process each brand, AWC, and PC column
        for brand_col, awc_col in zip(brand_cols, awc_cols):
            brand = row[brand_col]
            awc = row[awc_col]

            # Check if brand is a string and not NA before processing
            if pd.notna(brand) and isinstance(brand, str):
                brand_lower = brand.strip().lower()

                # Check membership in each of the other categories
                if brand_lower in other_category1_brands:
                    # Assign to first other category
                    other1_row[brand_col] = row[brand_col]
                    other1_row[awc_col] = awc

                    # Set zero for other categories
                    other2_row[brand_col] = 0
                    other2_row[awc_col] = 0
                    truly_misplaced_row[brand_col] = 0
                    truly_misplaced_row[awc_col] = 0

                elif brand_lower in other_category2_brands:
                    # Assign to second other category
                    other2_row[brand_col] = row[brand_col]
                    other2_row[awc_col] = awc

                    # Set zero for other categories
                    other1_row[brand_col] = 0
                    other1_row[awc_col] = 0
                    truly_misplaced_row[brand_col] = 0
                    truly_misplaced_row[awc_col] = 0

                else:
                    # Assign to Truly Misplaced row
                    truly_misplaced_row[brand_col] = row[brand_col]
                    truly_misplaced_row[awc_col] = awc

                    # Set zero for other categories
                    other1_row[brand_col] = 0
                    other1_row[awc_col] = 0
                    other2_row[brand_col] = 0
                    other2_row[awc_col] = 0

            else:
                # If brand or AWC is missing, fill with 0 across all categories
                other1_row[brand_col] = 0
                other1_row[awc_col] = 0
                other2_row[brand_col] = 0
                other2_row[awc_col] = 0
                truly_misplaced_row[brand_col] = 0
                truly_misplaced_row[awc_col] = 0

        # Handle PC Columns: Preserve Original String Values in Relevant Rows
        for pc_col in pc_cols:
            pc_value = row[pc_col]
            if any(other1_row[col] for col in brand_cols):  # If assigned to other1_row
                other1_row[pc_col] = pc_value
                other2_row[pc_col] = 0
                truly_misplaced_row[pc_col] = 0
            elif any(other2_row[col] for col in brand_cols):  # If assigned to other2_row
                other2_row[pc_col] = pc_value
                other1_row[pc_col] = 0
                truly_misplaced_row[pc_col] = 0
            elif any(truly_misplaced_row[col] for col in brand_cols):  # If assigned to truly_misplaced_row
                truly_misplaced_row[pc_col] = pc_value
                other1_row[pc_col] = 0
                other2_row[pc_col] = 0
            else:
                # If no relevant brand, set PC columns to 0 across all rows
                other1_row[pc_col] = 0
                other2_row[pc_col] = 0
                truly_misplaced_row[pc_col] = 0

        # Append each row to its respective list
        other1_rows.append(other1_row)
        other2_rows.append(other2_row)
        truly_misplaced_rows.append(truly_misplaced_row)

    # Convert lists of dictionaries to DataFrames, ensuring they retain the original column order
    ordered_columns = ['Unique_serial'] + ['Weight'] + brand_cols + awc_cols + pc_cols
    misplaced_in_other1 = pd.DataFrame(other1_rows).reindex(columns=ordered_columns, fill_value=0)
    misplaced_in_other2 = pd.DataFrame(other2_rows).reindex(columns=ordered_columns, fill_value=0)
    truly_misplaced = pd.DataFrame(truly_misplaced_rows).reindex(columns=ordered_columns, fill_value=0)

    return misplaced_in_other1, misplaced_in_other2, truly_misplaced


misplaced_rcs_in_disp,misplaced_os_in_disp,truly_misplaced_disp = filter_misplaced_by_categories(misplaced_data_disp,rcs_brands,os_brands)

misplaced_disp_in_rcs,misplaced_os_in_rcs,truly_misplaced_rcs = filter_misplaced_by_categories(misplaced_data_rcs,disp_brands,os_brands)

misplaced_disp_in_os,misplaced_rcs_in_os,truly_misplaced_os = filter_misplaced_by_categories(misplaced_data_os,disp_brands,rcs_brands)

In [13]:
###############################################################

def normalize_and_consolidate(clean_data, misplaced_in_other_cat_1, misplaced_in_other_cat_2):
    # Ensure Unique_serial and Weight exist in all DataFrames
    for df in [clean_data, misplaced_in_other_cat_1, misplaced_in_other_cat_2]:
        if 'Unique_serial' not in df.columns or 'Weight' not in df.columns:
            raise ValueError("Unique_serial or Weight column is missing from one of the DataFrames.")

    # Store and reset the Unique_serial and Weight columns from each DataFrame
    unique_serial_clean = clean_data['Unique_serial'].reset_index(drop=True)
    weight_clean = clean_data['Weight'].reset_index(drop=True)

    unique_serial_misplaced_1 = misplaced_in_other_cat_1['Unique_serial'].reset_index(drop=True)
    weight_misplaced_1 = misplaced_in_other_cat_1['Weight'].reset_index(drop=True)

    unique_serial_misplaced_2 = misplaced_in_other_cat_2['Unique_serial'].reset_index(drop=True)
    weight_misplaced_2 = misplaced_in_other_cat_2['Weight'].reset_index(drop=True)

    # Drop the Unique_serial and Weight columns before merging
    clean_data_dropped = clean_data.drop(columns=['Unique_serial', 'Weight']).reset_index(drop=True)
    misplaced_in_other_cat_1_dropped = misplaced_in_other_cat_1.drop(columns=['Unique_serial', 'Weight']).reset_index(drop=True)
    misplaced_in_other_cat_2_dropped = misplaced_in_other_cat_2.drop(columns=['Unique_serial', 'Weight']).reset_index(drop=True)

    # Concatenate the DataFrames side by side (aligning by index)
    combined_data = pd.concat([clean_data_dropped,
                               misplaced_in_other_cat_1_dropped,
                               misplaced_in_other_cat_2_dropped], axis=1)

    # Combine the Unique_serial and Weight columns
    unique_serial_combined = unique_serial_clean.combine_first(unique_serial_misplaced_1).combine_first(unique_serial_misplaced_2)
    weight_combined = weight_clean.combine_first(weight_misplaced_1).combine_first(weight_misplaced_2)

    # Add Unique_serial and Weight as the first two columns in the combined DataFrame
    combined_data = pd.concat([unique_serial_combined.reset_index(drop=True),
                               weight_combined.reset_index(drop=True),
                               combined_data], axis=1)

    # Extract brand and AWC columns dynamically
    brand_columns = [col for col in combined_data.columns if 'brand' in col.lower()]
    awc_columns = [col for col in combined_data.columns if 'awc' in col.lower()]

    # Check for PC columns in the clean and misplaced DataFrames
    pc_columns_clean = [col for col in clean_data.columns if 'pc' in col.lower()]
    pc_columns_misplaced_1 = [col for col in misplaced_in_other_cat_1.columns if 'pc' in col.lower()]
    pc_columns_misplaced_2 = [col for col in misplaced_in_other_cat_2.columns if 'pc' in col.lower()]

    # Create a combined list of PC columns for those that have them
    pc_columns_combined = pc_columns_clean + pc_columns_misplaced_1 + pc_columns_misplaced_2

    # Order the columns
    ordered_columns = brand_columns + awc_columns + pc_columns_combined
    combined_data = combined_data[['Unique_serial', 'Weight'] + ordered_columns]

    # Create new column names
    num_brands = len(brand_columns)
    new_column_names = [f'Brand_{i + 1}' for i in range(num_brands)] + [f'AWC_{i + 1}' for i in range(num_brands)]

    # Add PC column names if they exist
    if pc_columns_combined:
        new_column_names += [f'PC_{i + 1}' for i in range(len(pc_columns_combined))]

    # Rename columns
    combined_data.columns = ['Unique_serial', 'Weight'] + new_column_names

    return combined_data


combined_disp = normalize_and_consolidate(clean_data_disp, misplaced_disp_in_rcs, misplaced_disp_in_os)
combined_rcs = normalize_and_consolidate(clean_data_rcs, misplaced_rcs_in_disp, misplaced_rcs_in_os)
combined_os = normalize_and_consolidate(clean_data_os, misplaced_os_in_disp, misplaced_os_in_rcs)

In [14]:
###########################################################

def create_au_awc_pc_columns(df, brands_list):
    # Initialize lists for AU, AWC, and PC columns for each brand
    au_data = {brand: [] for brand in brands_list}
    awc_data = {brand: [] for brand in brands_list}
    pc_data = {brand: [] for brand in brands_list}

    # Process each respondent row
    for _, row in df.iterrows():
        # Temporary storage for each row's AU, AWC, and PC values
        au_row = {brand: 0 for brand in brands_list}
        awc_row = {brand: None for brand in brands_list}
        pc_row = {brand: None for brand in brands_list}

        # Iterate through the brand, AWC, and PC columns
        for i in range(1, 109):  # Adjust if your data has a different range
            brand_name = row.get(f"Brand_{i}")
            awc_value = row.get(f"AWC_{i}")
            pc_value = row.get(f"PC_{i}")

            # Check if the brand is in brands_list
            if brand_name in brands_list:
                au_row[brand_name] = 1  # Mark as active user
                awc_row[brand_name] = awc_value  # Assign AWC value
                pc_row[brand_name] = pc_value  # Assign PC value

        # Append data to the AU, AWC, and PC columns
        for brand in brands_list:
            au_data[brand].append(au_row[brand])
            awc_data[brand].append(awc_row[brand])
            pc_data[brand].append(pc_row[brand])

    # Convert AU, AWC, and PC dictionaries to DataFrames
    au_df = pd.DataFrame(au_data).rename(columns=lambda x: f"{x}_AU")
    awc_df = pd.DataFrame(awc_data).rename(columns=lambda x: f"{x}_AWC").fillna(0)
    pc_df = pd.DataFrame(pc_data).rename(columns=lambda x: f"{x}_PC").fillna(0)

    # Combine Unique_serial with AU, AWC, and PC columns
    final_df = pd.concat([df[['Unique_serial', 'Weight']], au_df, awc_df, pc_df], axis=1)

    return final_df

# Apply the function to your data
AU_AWC_Disp = create_au_awc_pc_columns(combined_disp, disp_brands)
AU_AWC_RCS = create_au_awc_pc_columns(combined_rcs, rcs_brands)
AU_AWC_OS = create_au_awc_pc_columns(combined_os, os_brands)

  awc_df = pd.DataFrame(awc_data).rename(columns=lambda x: f"{x}_AWC").fillna(0)
  pc_df = pd.DataFrame(pc_data).rename(columns=lambda x: f"{x}_PC").fillna(0)
  awc_df = pd.DataFrame(awc_data).rename(columns=lambda x: f"{x}_AWC").fillna(0)
  pc_df = pd.DataFrame(pc_data).rename(columns=lambda x: f"{x}_PC").fillna(0)
  awc_df = pd.DataFrame(awc_data).rename(columns=lambda x: f"{x}_AWC").fillna(0)
  pc_df = pd.DataFrame(pc_data).rename(columns=lambda x: f"{x}_PC").fillna(0)


In [15]:
################################################################

def add_summary_columns(data):
    # Ensure Unique_serial exists
    if 'Unique_serial' not in data.columns:
        raise ValueError("The column 'Unique_serial' is missing from the dataset.")

    # Ensure Weight exists
    if 'Weight' not in data.columns:
        raise ValueError("The column 'Weight' is missing from the dataset.")

    # Identify AU and AWC columns dynamically
    au_cols = [col for col in data.columns if col.endswith('_AU')]
    awc_cols = [col for col in data.columns if col.endswith('_AWC')]

    # Ensure AU and AWC columns are numeric
    data[au_cols] = data[au_cols].apply(pd.to_numeric, errors='coerce').fillna(0)
    data[awc_cols] = data[awc_cols].apply(pd.to_numeric, errors='coerce').fillna(0)

    # Add "Brand AU" column: 1 if any AU column has value > 0, otherwise 0
    data['Brand AU'] = (data[au_cols].sum(axis=1) > 0).astype(int)

    # Add "Brand AWC" column: Sum of all AWC columns for each respondent
    data['Brand AWC'] = data[awc_cols].sum(axis=1)

    # Define the new column order: Unique_serial → Weight → Brand AU → Brand AWC → Rest
    cols = ['Unique_serial', 'Weight', 'Brand AU', 'Brand AWC'] + [col for col in data.columns if col not in ['Unique_serial', 'Weight', 'Brand AU', 'Brand AWC']]

    # Reorder DataFrame
    return data[cols]


# Apply the function to your DataFrame
AU_AWC_Disp = add_summary_columns(AU_AWC_Disp)
AU_AWC_RCS =  add_summary_columns(AU_AWC_RCS)
AU_AWC_OS =  add_summary_columns(AU_AWC_OS)

  data['Brand AU'] = (data[au_cols].sum(axis=1) > 0).astype(int)
  data['Brand AWC'] = data[awc_cols].sum(axis=1)
  data['Brand AU'] = (data[au_cols].sum(axis=1) > 0).astype(int)
  data['Brand AWC'] = data[awc_cols].sum(axis=1)
  data['Brand AU'] = (data[au_cols].sum(axis=1) > 0).astype(int)
  data['Brand AWC'] = data[awc_cols].sum(axis=1)


In [16]:
#################################################################
def add_format_columns(clean_df, df, category):
    """
    Adds 'Format AU' and 'Format AWC' columns to the cleaned dataset based on the category.

    Parameters:
        clean_df (pd.DataFrame): The cleaned dataset.
        df (pd.DataFrame): The original dataset containing Q18 and Q23/Q24/Q25 columns.
        category (str): The category of data ('disp', 'rcs', or 'os').

    Returns:
        pd.DataFrame: The cleaned dataset with 'Format AU' and 'Format AWC' inserted as columns 3 and 4.
    """

    # Extract AU column based on category
    au_mapping = {'disp': 'Q18_1', 'rcs': 'Q18_2', 'os': 'Q18_3'}
    au_col_name = au_mapping.get(category)

    if au_col_name in df.columns:
        format_au_col = df[[au_col_name]].copy().rename(columns={au_col_name: 'Format AU'})
        format_au_col.replace({'Mentioned': 1, 'Not mentioned': 0}, inplace=True)
    else:
        format_au_col = pd.DataFrame({'Format AU': [0] * len(df)})

    # Extract Format AWC column based on category
    awc_mapping = {
        'disp': ('Q23', 'DispConsump'),
        'rcs': ('Q24', 'CloseConsump'),
        'os': ('Q25', 'OpenConsump')
    }

    awc_col_name, consump_col = awc_mapping.get(category)

    if awc_col_name in df.columns and consump_col in df.columns:
        format_awc_col = df[[awc_col_name]].copy().rename(columns={awc_col_name: 'Format AWC'})
        format_awc_col['Format AWC'] = format_awc_col['Format AWC'].fillna(df[consump_col])
    else:
        format_awc_col = pd.DataFrame({'Format AWC': [0] * len(df)})

    # Replace specific values
    format_awc_col.replace({'7+': 7.5, 'Less than 0.5': 0.4}, inplace=True)

    # Reset index to ensure alignment
    format_au_col.reset_index(drop=True, inplace=True)
    format_awc_col.reset_index(drop=True, inplace=True)

    # Combine with clean data
    final_df = pd.concat([clean_df.iloc[:, :2], format_au_col, format_awc_col, clean_df.iloc[:, 2:]], axis=1)

    return final_df

# Example usage:
AU_AWC_Disp_Final = add_format_columns(AU_AWC_Disp, df, 'disp')
AU_AWC_RCS_Final = add_format_columns(AU_AWC_RCS, df, 'rcs')
AU_AWC_OS_Final = add_format_columns(AU_AWC_OS, df, 'os')

  format_au_col.replace({'Mentioned': 1, 'Not mentioned': 0}, inplace=True)
  format_au_col.replace({'Mentioned': 1, 'Not mentioned': 0}, inplace=True)
  format_au_col.replace({'Mentioned': 1, 'Not mentioned': 0}, inplace=True)


In [17]:
###############################################################
def calculate_volume(data, weight_col,au_suffix='_AU', awc_suffix='_AWC', vol_suffix='_VOL'):
    """
    Calculate volume (VOL) columns by multiplying AU, AWC, and respondent weight columns.

    Parameters:
        data (pd.DataFrame): Input DataFrame containing AU, AWC, and weight columns.
        weight_col (str): Name of the weight column.
        resp_weight_col (str): Name of the respondent weight column.
        au_suffix (str): Suffix for Active User (AU) columns.
        awc_suffix (str): Suffix for Average Weekly Consumption (AWC) columns.
        vol_suffix (str): Suffix for Volume (VOL) columns.
        multiplier (int, optional): Multiplier value. Default is 13.

    Returns:
        pd.DataFrame: DataFrame with added VOL columns.
    """
    data = data.copy()

    # Ensure relevant columns are treated as numeric
    data[weight_col] = data[weight_col].astype(float)

    # Extract AU and AWC columns
    au_columns = [col for col in data.columns if col.endswith(au_suffix)]
    awc_columns = [col.replace(au_suffix, awc_suffix) for col in au_columns]
    vol_columns = [col.replace(au_suffix, vol_suffix) for col in au_columns]

    # Calculate volumes and add them as new columns
    for au_col, awc_col, vol_col in zip(au_columns, awc_columns, vol_columns):
        if awc_col in data.columns:
            data[vol_col] = (
                 13*5878757.19
                * data[weight_col]
                * data[au_col].astype(float)
                * data[awc_col].astype(float)
            )

    return data

AU_AWC_Vol_Disp_final = calculate_volume(AU_AWC_Disp_Final,weight_col='Weight')
AU_AWC_Vol_RCS_final = calculate_volume(AU_AWC_RCS_Final,weight_col='Weight')
AU_AWC_Vol_OS_final = calculate_volume(AU_AWC_OS_Final,weight_col='Weight')

  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (
  data[vol_col] = (


In [18]:
def calculate_au_awc_metrics(df, brand_columns, format_columns):
    weight_column = 'Weight'
    brand_au_column, brand_awc_column = brand_columns
    format_au_column, format_awc_column = format_columns

    # Calculate Brand AU Percentage
    weighted_au = (df[weight_column] * df[brand_au_column]).sum()
    total_weight = df[weight_column].sum()
    current_Brand_AU_percentage = (weighted_au / total_weight) * 100


In [20]:
 # Calculate Format AU Percentage
    weighted_au = (df[weight_column] * df[format_au_column]).sum()
    Format_AU_percentage = (weighted_au / total_weight) * 100

    # Calculate AWC for Brand
    brand_au_filter = (df[brand_au_column] == 1)
    initial_awc_numerator = df.loc[brand_au_filter, brand_awc_column].sum()
    total_weight_brand = df.loc[brand_au_filter, weight_column].sum()
    Current_Brand_awc = initial_awc_numerator / total_weight_brand

    # Calculate AWC for Format
    format_au_filter = (df[format_au_column] == 1)
    initial_Format_awc_numerator = df.loc[format_au_filter, format_awc_column].sum()
    total_weight_format = df.loc[format_au_filter, weight_column].sum()
    Current_Format_awc = initial_Format_awc_numerator / total_weight_format

IndentationError: unexpected indent (<ipython-input-20-10f4de4c33e4>, line 2)

In [None]:

    # Create summary DataFrame
    df_summary = pd.DataFrame({
        'Metric': ['Brand AU Percentage', 'Format AU Percentage', 'Current Brand AWC', 'Current Format AWC'],
        'Value': [current_Brand_AU_percentage, Format_AU_percentage, Current_Brand_awc, Current_Format_awc]
    })

    return df_summary

brand_columns = ('Brand AU', 'Brand AWC')
format_columns = ('Format AU', 'Format AWC')

df_summary_Disp = calculate_au_awc_metrics(AU_AWC_Disp_Final, brand_columns, format_columns)
df_summary_RCS = calculate_au_awc_metrics(AU_AWC_RCS_Final, brand_columns, format_columns)
df_summary_OS = calculate_au_awc_metrics(AU_AWC_OS_Final, brand_columns, format_columns)

# Dictionary of DataFrames
dfs = {
    "Disp": df_summary_Disp,
    "RCS": df_summary_RCS,
    "OS": df_summary_OS
}

# Rename "Value" column to include format name
for name, df in dfs.items():
    df.rename(columns={"Value": f"{name}_Value"}, inplace=True)

# Merge all DataFrames on "Metric"
final_df = df_summary_Disp
for name, df in list(dfs.items())[1:]:  # Skip the first one as it's already assigned
    final_df = final_df.merge(df, on="Metric", how="outer")