<a href="https://colab.research.google.com/github/yonabell/Report/blob/main/Reporting_final_python_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import os
import re

In [3]:
from google.colab import files
uploaded = files.upload()

Saving 2023_sample_data1.xlsx to 2023_sample_data1.xlsx


In [4]:
file_path_previous_year = '2023_sample_data1.xlsx'

In [None]:
# file_path_current_year = 'path_to_input_folder/2024_sample_data.xlsx'
# file_path_previous_year = 'path_to_input_folder/2023_sample_data.xlsx'
# output_folder_path = 'path_to_output_folder/'

In [5]:
# Defining a function for the following data cleaning steps
def clean_data(file_path):
    # Load the data with no predefined header
    cpt_df = pd.read_excel(file_path, engine='openpyxl', header=4)

    # Filtering rows and extracting only 'A' followed by nine digits
    filtered_cpt_df = cpt_df[cpt_df['Unnamed: 0'].str.extract(r'(A\d{9})', expand=False).notna()].copy()

    # Extracting the ID
    filtered_cpt_df['ID'] = filtered_cpt_df['Unnamed: 0'].str.extract(r'(A\d{9})', expand=False)

    # Reorder columns to place 'ID' next to 'Unnamed: 0'
    cols = filtered_cpt_df.columns.tolist()
    id_index = cols.index('ID')
    cols = cols[:1] + [cols[id_index]] + cols[1:id_index] + cols[id_index + 1:]
    filtered_cpt_df = filtered_cpt_df[cols]

    # Drop 'Unnamed: 0' column
    filtered_cpt_df = filtered_cpt_df.drop('Unnamed: 0', axis=1)

    # Drop specific columns (1 to 7)
    filtered_cpt_df = filtered_cpt_df.drop(filtered_cpt_df.columns[1:8], axis=1)

    # Select every 7th column starting from the 8th column (index 7)
    # Unnecessary column for the analysis with no data
    column_names = filtered_cpt_df.columns.tolist()
    columns_to_drop = column_names[7::7]

    # Drop the selected columns from the DataFrame
    filtered_cpt_df.drop(columns=columns_to_drop, inplace=True)

    return filtered_cpt_df

In [6]:
# Call the clean_data function
filtered_cpt_df1 = clean_data(file_path_previous_year) # file_path_current_year

In [7]:
def replace_suffix_with_months(df):
    """
    This function replaces numerical suffixes in column names with corresponding month names
    and replaces dots with spaces.

    Args:
    df (pd.DataFrame): The input DataFrame whose column names need to be updated.

    Returns:
    pd.DataFrame: The DataFrame with updated column names.
    """
    # Mapping of number suffixes to month names
    month_map = {
        '.1': ' Jan',
        '.2': ' Feb',
        '.3': ' Mar',
        '.4': ' Apr',
        '.5': ' May',
        '.6': ' Jun',
        '.7': ' Jul',
        '.8': ' Aug',
        '.9': ' Sep',
        '.10': ' Oct',
        '.11': ' Nov',
        '.12': ' Dec'
    }

    # Replace dots with spaces and map suffixes at the end of column names to respective months
    new_column_names = []
    for column_name in df.columns:
        new_column_name = column_name.replace('.', ' ')  # Replace dots with spaces
        for suffix, month_name in month_map.items():
            # Ensure we replace only if the suffix appears at the end of the column name
            if new_column_name.endswith(suffix.replace('.', ' ')):
                new_column_name = new_column_name.replace(suffix.replace('.', ' '), month_name)
        new_column_names.append(new_column_name)

    # Assign the new column names to the dataframe
    df.columns = new_column_names

    return df

In [8]:
filtered_cpt_df1 = replace_suffix_with_months(filtered_cpt_df1)

In [9]:
def rename_columns(filtered_cpt_df):
    # Define the new column name mappings for replacements at the start of column names
    new_column_names = {
        'ID': 'BPID',
        'Purch Volume': 'PurchVolume',
        'S-Revenues': 'Revenue',
        'Profit Margin I*': 'PMIStar',  # Handle the specific case for PMIStar first
        'Profit Margin I': 'PMI',
        'PM I % of Purchased Volume': 'PMIRate',
        'PM I* % of Purchased Volume': 'PMIStarRate'
    }

    # Clean column names by collapsing multiple spaces into a single space
    filtered_cpt_df.columns = [re.sub(r'\s+', ' ', col) for col in filtered_cpt_df.columns]

    # Iterate through the columns and apply the renaming based on the starting part of the name
    updated_columns = []
    for column in filtered_cpt_df.columns:
        # Check if the column starts with any of the keys in new_column_names
        for old_name, new_name in new_column_names.items():
            if column.startswith(old_name):
                # Replace the old part of the column name with the new one and keep the suffix (e.g., "Jan", "Feb")
                new_column = column.replace(old_name, new_name, 1)
                updated_columns.append(new_column)
                break
        else:
            # If no match, keep the column name unchanged
            updated_columns.append(column)

    # Rename the columns in the dataframe
    filtered_cpt_df.columns = updated_columns

    return filtered_cpt_df

In [10]:
filtered_cpt_df1 = rename_columns(filtered_cpt_df1)

In [11]:
# Define the function
def add_blank_columns(input_df):
    # Create a new DataFrame with the first column (assumed to be "BPID") as is
    output_df = pd.DataFrame(input_df.iloc[:, 0])  # Keep the first column (e.g., "BPID") as is

    # Counter to track insertion of new blank columns
    counter = 0

    # Loop through the columns of input_df starting from the second column
    for i in range(1, len(input_df.columns)):
        # Add the current column from input_df to output_df
        output_df[input_df.columns[i]] = input_df.iloc[:, i]
        counter += 1

        # After every 6 columns, add a new blank column for readability
        if counter % 6 == 0:
            new_column_name = f"Spacer_{i // 6}"  # Temporary name for the blank column
            output_df[new_column_name] = ""  # Add a blank column with a temporary header
            counter = 0  # Reset the counter after adding the blank column

    # Replace temporary spacer column names with five spaces as headers
    output_df.columns = ["     " if "Spacer" in col else col for col in output_df.columns]

    return output_df




In [12]:
# Add blanck columns for readability
final_cpt_df1 = add_blank_columns(filtered_cpt_df1)

In [None]:
final_cpt_df1.columns

Index(['BPID', 'PurchVolume Jan', 'Revenue Jan', 'PMI Jan', 'PMIRate Jan',
       'PMIStar Jan', 'PMIStarRate Jan', '     ', 'PurchVolume Feb',
       'Revenue Feb', 'PMI Feb', 'PMIRate Feb', 'PMIStar Feb',
       'PMIStarRate Feb', '     ', 'PurchVolume Mar', 'Revenue Mar', 'PMI Mar',
       'PMIRate Mar', 'PMIStar Mar', 'PMIStarRate Mar', '     ',
       'PurchVolume Apr', 'Revenue Apr', 'PMI Apr', 'PMIRate Apr',
       'PMIStar Apr', 'PMIStarRate Apr', '     ', 'PurchVolume May',
       'Revenue May', 'PMI May', 'PMIRate May', 'PMIStar May',
       'PMIStarRate May', '     ', 'PurchVolume Jun', 'Revenue Jun', 'PMI Jun',
       'PMIRate Jun', 'PMIStar Jun', 'PMIStarRate Jun', '     ',
       'PurchVolume Jul', 'Revenue Jul', 'PMI Jul', 'PMIRate Jul',
       'PMIStar Jul', 'PMIStarRate Jul', '     ', 'PurchVolume Aug',
       'Revenue Aug', 'PMI Aug', 'PMIRate Aug', 'PMIStar Aug',
       'PMIStarRate Aug', '     ', 'PurchVolume Sep', 'Revenue Sep', 'PMI Sep',
       'PMIRate Sep', 'PM

In [13]:
# Save the DataFrame to an Excel file first
final_cpt_df1.to_excel('2023_cpt_monthly_profitability.xlsx', index=False)  # Save DataFrame to Excel file

# Download the Excel file
files.download('2023_cpt_monthly_profitability.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Save 2023_cpt_monthly_profitability as an excel file
final_cpt_df1.to_excel(f"{output_folder_path}2023_cpt_monthly_profitability.xlsx", index=False)
print("Successfully saved 2023_cpt_monthly_profitability.xlsx")

In [14]:
from google.colab import files
uploaded = files.upload()

Saving 2024_sample_data1.xlsx to 2024_sample_data1.xlsx


In [15]:
file_path_current_year = '2024_sample_data1.xlsx'

In [16]:
# Call the clean_data function
filtered_cpt_df2 = clean_data(file_path_current_year)

In [17]:
# Assuming 'filtered_cpt_df2' is your DataFrame
filtered_cpt_df2 = replace_suffix_with_months(filtered_cpt_df2)

In [18]:
filtered_cpt_df2 = rename_columns(filtered_cpt_df2)

In [19]:
# Add blanck columns for readability
final_cpt_df2 = add_blank_columns(filtered_cpt_df2)

In [None]:
# Save the DataFrame to an Excel file first
final_cpt_df2.to_excel('2024_cpt_monthly_profitability.xlsx', index=False)  # Save DataFrame to Excel file

# Download the Excel file
files.download('2024_cpt_monthly_profitability.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Save 2023_cpt_monthly_profitability as an excel file
final_cpt_df2.to_excel(f"{output_folder_path}2024_cpt_monthly_profitability.xlsx", index=False)
print("Successfully saved 2024_cpt_monthly_profitability.xlsx")

In [20]:
# Function to map month columns to their respective month-year combinations
def map_month_columns(df, year):
    month_mapping = {
        'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
        'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08',
        'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'
    }

    new_columns = []
    for col in df.columns:
        # Check if the column name contains one of the months
        for month, num in month_mapping.items():
            if month in col:
                # If the column matches a month, append the respective year and month
                # The first part of the column before the month is the metric name (e.g., 'PurchVolume')
                metric = col.replace(f' {month}', '')  # Remove the month from the name
                new_columns.append(f"{metric} {num}.{year}")
                break
        else:
            # Keep non-month columns (e.g., 'BPID') unchanged
            new_columns.append(col)

    df.columns = new_columns
    return df


# Mapping columns for 2023
filtered_cpt_df1_mapped = map_month_columns(filtered_cpt_df1, '2023')
# Mapping columns for 2024
filtered_cpt_df2_mapped = map_month_columns(filtered_cpt_df2, '2024')

In [21]:
# Combine the two dataframes (filtered_cpt_df1_mapped and filtered_cpt_df2_mapped)
# Skip the 'BPID' column in the second dataframe to avoid duplication
combined_df = pd.concat([filtered_cpt_df1_mapped, filtered_cpt_df2_mapped.iloc[:, 1:]], axis=1)

In [22]:
# Function to filter columns based on start and end months
def filter_columns_by_period(df, start_month, end_month):
    # Get the list of columns
    columns = df.columns.tolist()

    # Find all columns matching the start and end months
    start_idx = None
    end_idx = None

    for i, col in enumerate(columns):
        # Check for the first full set of columns that start with start_month
        if start_month in col and 'PurchVolume' in col:
            start_idx = i
        # Check for the last full set of columns that include the end_month
        if end_month in col and 'PMIStarRate' in col:
            end_idx = i + 1  # Add 1 to include the end month columns in the slice

    # Slice the dataframe columns within the start and end index
    if start_idx is not None and end_idx is not None:
        filtered_df = df.iloc[:, :1]  # Include the 'BPID' column
        filtered_df = pd.concat([filtered_df, df.iloc[:, start_idx:end_idx]], axis=1)
    else:
        raise ValueError("Invalid start or end month. Please check the month-year format.")

    return filtered_df

# Example usage:
start_month = '05.2023'  # Specify the start month (MM.YYYY format)
end_month = '03.2024'    # Specify the end month (MM.YYYY format)

# Filter the combined dataframe based on the given period
filtered_combined_df = filter_columns_by_period(combined_df, start_month, end_month)

In [24]:
# Example usage:
start_month = '10.2023'  # Specify the start month (MM.YYYY format)
end_month = '09.2024'    # Specify the end month (MM.YYYY format)

# Filter the combined dataframe based on the given period
filtered_combined_df = filter_columns_by_period(combined_df, start_month, end_month)

In [26]:
adjusted_factor = 0.0018

In [28]:
final_combined_cpt_df = add_blank_columns(filtered_combined_df)

In [34]:
# Create a new DataFrame with modified column names
output_cpt_df = final_combined_cpt_df.copy()
output_cpt_df.columns = [re.sub(r'\s\d{2}\.\d{4}$', '', col) for col in output_cpt_df.columns]
#output_cpt_df.columns = [re.sub(r'\.\d{2}\.\d{4}$', '', col) if col != 'BPID' else col for col in output_cpt_df.columns]

# Display the result to verify
print(output_cpt_df.head())

         BPID  PurchVolume Revenue    PMI  PMIRate  PMIStar  PMIStarRate  \
5  A987654321         65.0    34.0   34.0   9.8384     45.0        2.033   
8  A123456789        543.0   725.0  643.0  15.7322    542.0        2.519   

          PurchVolume Revenue  ...  PMIStar  PMIStarRate         PurchVolume  \
5                65.0    34.0  ...     45.0        2.033                65.0   
8               543.0   725.0  ...    542.0        2.519               543.0   

  Revenue    PMI  PMIRate  PMIStar  PMIStarRate         
5    34.0   34.0   9.8384     45.0        2.033         
8   725.0  643.0  15.7322    542.0        2.519         

[2 rows x 85 columns]


In [37]:
# Create DataFrame
df = pd.DataFrame(output_cpt_df)

# Extract the name of the first column to group by it (e.g., 'BPID')
group_column = df.columns[0]

# Define metrics and their patterns to sum across columns
metrics = ['PurchVolume', 'Revenue', 'PMI', 'PMIRate', 'PMIStar', 'PMIStarRate']

# Initialize a dictionary to store aggregated results
aggregated_results = {group_column: df[group_column].unique()}

# For each metric, find matching columns, sum them, and add to results
for metric in metrics:
    metric_columns = [col for col in df.columns if re.match(fr'^{metric}', col)]
    aggregated_results[metric] = df[metric_columns].sum(axis=1)

# Create the final aggregated DataFrame
aggregated_df = pd.DataFrame(aggregated_results)

# Group by BPID and take the sum to get a single row per BPID with aggregated metrics
aggregated_df = aggregated_df.groupby(group_column).sum().reset_index()

print(aggregated_df)

         BPID  PurchVolume    Revenue            PMI   PMIRate        PMIStar  \
0  A123456789     187812.0   130188.0  207575.956796  2747.544  118104.412796   
1  A987654321      68196.0  1054164.0  115384.911875  1718.448   17690.463875   

   PMIStarRate  
0   372.412796  
1   290.463875  


In [39]:
adjusted_factor = 0.0018

In [40]:
# Adjust the PMIStarRate column
aggregated_df['PMIStarRate'] = aggregated_df['PMIStarRate'] - (adjusted_factor * aggregated_df['PurchVolume'])

# Display the modified DataFrame
print(aggregated_df)


         BPID  PurchVolume    Revenue            PMI   PMIRate        PMIStar  \
0  A123456789     187812.0   130188.0  207575.956796  2747.544  118104.412796   
1  A987654321      68196.0  1054164.0  115384.911875  1718.448   17690.463875   

   PMIStarRate  
0    34.351196  
1   167.711075  


In [None]:
# Save the final combined (2023 and 2024) output
output_cpt_df.to_excel(f"{output_folder_path}Combined_cpt_monthly_profitability.xlsx", index=False)
print("Successfully saved Combined_cpt_monthly_profitability.xlsx")

In [None]:
# Get the current and previous year dynamically
current_year = datetime.now().year
previous_year = current_year - 1

# Define input folder and output folder paths
input_folder_path = 'path_to_your_input/'
output_folder_path = 'path_to_output_folder/'

# Assign file paths dynamically
current_year_file_path = os.path.join(input_folder_path, f'{current_year}_sample_data.xlsx')
previous_year_file_path = os.path.join(input_folder_path, f'{previous_year}_sample_data.xlsx')

# Saving aggregated data for the current year
aggregated_df1.to_excel(os.path.join(output_folder_path, f'{current_year}_aggregated.xlsx'), index=False)

# Saving aggregated data for the previous year
aggregated_df2.to_excel(os.path.join(output_folder_path, f'{previous_year}_aggregated.xlsx'), index=False)
