In [66]:
import os
from pathlib import Path

from dotenv import load_dotenv
import sys
load_dotenv()
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
print(sys.path)

['/opt/anaconda3/envs/project-nova/lib/python311.zip', '/opt/anaconda3/envs/project-nova/lib/python3.11', '/opt/anaconda3/envs/project-nova/lib/python3.11/lib-dynload', '', '/opt/anaconda3/envs/project-nova/lib/python3.11/site-packages', '/Users/vamsisaigarapati/Documents/github/project-nova', '/Users/vamsisaigarapati/Documents/github/project-nova', '/Users/vamsisaigarapati/Documents/github/project-nova']


In [67]:
# Standard library imports
import os
import sys
from pathlib import Path

# Third-party imports
import pandas as pd
from dotenv import load_dotenv

load_dotenv()
# print(sys.path)
# sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
# print(sys.path)

# Local application imports
from src.config import HEARST_DIR, HEARST_RAW_DIR, HEARST_PROCESSED, HEASRT_FILE, HEASRT_FILE_SISENSE,MSP_AGENNT_LOOKUP_FILE,HEARST_LOOKUP_DIR
from src.configs.hearst_configs import raw_column_types,sisense_columns
from src.utils.excel_file_operations import load_excel_file, write_df_to_excel
from src.utils.dataframe_utils import rearrange_columns

In [68]:




def calculate_revenue(raw_df):
    """
    Reads the 'Hearst Pub Market List' sheet from 'Heast Files.xlsx',
    joins with raw_df on 'Pub', creates 'Job Number +',
    and returns a DataFrame grouped by 'Job Number +' with summed Revenue.
    Output columns: ['Old Job Number', 'Job Number', 'Job Number +', 'Revenue'].
    """

    market_list = load_excel_file(
        path=HEARST_RAW_DIR,               
        file_name=HEASRT_FILE,
        sheet_name="Hearst Pub Market List",                
    )
    
    merged_df = raw_df.copy()
    market_list = market_list.copy()
    merged_df["Pub_key"] = merged_df["Pub"].astype(str).str.strip().str.lower()
    market_list["Pub_key"] = market_list["Pub"].astype(str).str.strip().str.lower()

    # Merge Market info (inner join for exact matches)
    merged_df = merged_df.merge(
        market_list[["Pub_key", "Market"]],
        on="Pub_key",
        how="inner"
    )

    # Normalize columns for concatenation
    merged_df["Job Number"] = merged_df["Job Number"].astype(str).str.strip()
    merged_df["Market"] = merged_df["Market"].astype(str).str.strip()

    # Create "Job Number +" = Market + Job Number
    merged_df["Job Number +"] = merged_df.apply(
        lambda r: f"{r['Market']}{r['Job Number']}"
        if r["Market"] not in ["", "nan", "None"]
        else r["Job Number"],
        axis=1
    )

    # Convert Revenue safely to numeric
    merged_df["Sum of 'Revenue'"] = pd.to_numeric(merged_df["Revenue"], errors="coerce").fillna(0)

    # ---- Aggregate ----
    # Keep first record for every column except the revenue column
    agg_dict = {col: "first" for col in merged_df.columns if col not in ["Sum of 'Revenue'"]}
    agg_dict["Sum of 'Revenue'"] = "sum"

    result_df = merged_df.groupby("Job Number +", as_index=False).agg(agg_dict)

    # Add the count of records per group
    counts = merged_df.groupby("Job Number +").size().reset_index(name="Count of Matches")
    result_df = result_df.merge(counts, on="Job Number +", how="left")
    print(result_df.columns)

    # Reorder columns — keep Job Number +, Sum of Revenue, Count of Matches at the end
    cols = [c for c in result_df.columns if c not in ["Job Number +", "Sum of 'Revenue'", "Count of Matches"]]
    result_df = result_df[cols + ["Job Number +", "Sum of 'Revenue'", "Count of Matches"]]
    result_df['Job Number +'], result_df['Job Number'] = result_df['Job Number'].copy(), result_df['Job Number +'].copy()

    result_df = result_df[result_df["Sum of 'Revenue'"] != 0.0]
    return result_df


In [69]:
raw_df = load_excel_file(
    path=HEARST_RAW_DIR,                 # or "/full/path/to/dir"
    file_name=HEASRT_FILE,
    column_types=raw_column_types,
    sheet_name="Raw",                    # or omit to read the first sheet
)
# write_df_to_excel(raw_df, HEARST_PROCESSED, "checking.xlsx", sheet_name="Sisense")
processed_df=calculate_revenue(raw_df)

Index(['Year', 'Period #', 'Job Number', 'Child Acct #', 'Inches', 'Ad Type',
       'Section', 'Class Code', 'WoRev Bill Cycle', 'Child Acct Name',
       'First Issue Date', 'Full Name LF', 'Business Unit GL', 'GL_LOB_L1',
       'Pub', 'Revenue', 'Pub_key', 'Market', 'Job Number +',
       'Sum of 'Revenue'', 'Count of Matches'],
      dtype='object')


In [70]:
rep_list = load_excel_file(
    path=HEARST_LOOKUP_DIR,               
    file_name=MSP_AGENNT_LOOKUP_FILE,
    sheet_name="All Rep Names",                  
)

In [71]:
rep_list

Unnamed: 0,Agent Names,System(s),Full Name,Name in Labor Report,Business Unit
0,30,TGAM,!!Check Rep Name!!,,MSP All
1,774,Postmedia Adjustments,Non-Commissioned Agent,,MSP All
2,776,Postmedia Adjustments,Non-Commissioned Agent,,MSP All
3,776,Postmedia Sams2,Sarah Piazza,"Piazza, Sarah",MSP All
4,782,Postmedia Sams2,Jakai Harrison,,MSP All
...,...,...,...,...,...
1590,sandra.faraj@hearst.com,Hearst(Corporate),Sandra Faraj,,MSP All
1591,wsmith@hearstmediact.com,Hearst(Corporate),Wendy Smith,,MSP All
1592,"Hewson, Nora",Hearst,Nora Hewson,,MSP All
1593,Tyra Abrams,LV Review Journal,Tyra Abrams,,MSP All


In [72]:
rep_filtered = rep_list[rep_list["System(s)"].str.contains("hearst", na=False)].copy()

In [73]:
rep=rep_filtered["Agent Names"].value_counts().reset_index()
rep[rep['count']>1]

Unnamed: 0,Agent Names,count


In [74]:
rep_list1

Unnamed: 0,Agent Names,System(s),Full Name,Name in Labor Report,Business Unit
0,30,tgam,!!check rep name!!,,MSP All
1,774,postmedia adjustments,non-commissioned agent,,MSP All
2,776,postmedia adjustments,non-commissioned agent,,MSP All
3,776,postmedia sams2,sarah piazza,"Piazza, Sarah",MSP All
4,782,postmedia sams2,jakai harrison,,MSP All
...,...,...,...,...,...
1590,sandra.faraj@hearst.com,hearst(corporate),sandra faraj,,MSP All
1591,wsmith@hearstmediact.com,hearst(corporate),wendy smith,,MSP All
1592,"hewson, nora",hearst,nora hewson,,MSP All
1593,tyra abrams,lv review journal,tyra abrams,,MSP All


In [75]:
import pandas as pd

def tag_msp_from_rep(processed_df: pd.DataFrame) -> pd.DataFrame:
    """
    Adds an 'MSP/non-MSP' flag to processed_df by joining with rep_list.

    Logic:
    1. Load rep_list (MSP agent lookup sheet).
    2. Create lowercase helper columns for matching.
    3. Filter rep_list where 'System(s)' contains 'hearst'
       and exclude agent 'wave2, wave2'.
    4. Deduplicate by Agent Names (first occurrence only).
    5. Left join processed_df.Full Name LF with rep_list.Agent Names (lowercase helper columns).
    6. Add 'MSP/non-MSP' = 'MSP' if match found, else 'Non-MSP'.
    7. Drop helper columns to return original unmodified columns.

    Returns:
        pd.DataFrame: processed_df with a new column 'MSP/non-MSP'
    """

    # --- Step 1: Load the rep_list from Excel lookup ---
    rep_list = load_excel_file(
        path=HEARST_LOOKUP_DIR,
        file_name=MSP_AGENNT_LOOKUP_FILE,
        sheet_name="All Rep Names",
    )

    # --- Step 2: Create lowercase helper columns (preserve originals) ---
    rep_list["_system_lower"] = rep_list["System(s)"].astype(str).str.lower().str.strip()
    rep_list["_agent_lower"] = rep_list["Agent Names"].astype(str).str.lower().str.strip()
    rep_list["_fullname_lower"] = rep_list["Full Name"].astype(str).str.lower().str.strip()

    processed_df["_fullname_lf_lower"] = processed_df["Full Name LF"].astype(str).str.lower().str.strip()

    # --- Step 3: Filter rep_list for 'hearst' but exclude agent 'wave2, wave2' ---
    rep_filtered = rep_list[
        rep_list["_system_lower"].str.contains("hearst", na=False)
        & (rep_list["_agent_lower"] != "wave2, wave2")
    ].copy()

    # --- Step 4: Keep only relevant columns & deduplicate by Agent Name ---
    rep_filtered = (
        rep_filtered[["Agent Names", "System(s)", "Full Name", "_agent_lower"]]
        .drop_duplicates(subset=["_agent_lower"], keep="first")
        .reset_index(drop=True)
    )

    print(f"Filtered rep_list to {len(rep_filtered)} records for 'hearst' system (excluding 'wave2, wave2').")

    # --- Step 5: Perform left join on lowercase helper columns ---
    merged = processed_df.merge(
        rep_filtered[["_agent_lower"]].rename(columns={"_agent_lower": "_join_key"}),
        how="left",
        left_on="_fullname_lf_lower",
        right_on="_join_key",
        indicator=True
    )

    print(f"Merged DataFrame has {len(merged)} records after join.")

    # --- Step 6: Add MSP flag ---
    merged["MSP/non-MSP"] = merged["_merge"].map({
        "both": "MSP",
        "left_only": "Non-MSP"
    })

    # --- Step 7: Cleanup helper columns ---
    merged = merged.drop(columns=["_fullname_lf_lower", "_join_key", "_merge"], errors="ignore")

    return merged


In [76]:
len(processed_df)

2444

In [77]:
processed_df1 = tag_msp_from_rep( processed_df)


Filtered rep_list to 175 records for 'hearst' system (excluding 'wave2, wave2').
Merged DataFrame has 2444 records after join.


In [78]:
processed_df1[processed_df1['MSP/non-MSP']=='Non-MSP']

Unnamed: 0,Year,Period #,Job Number,Child Acct #,Inches,Ad Type,Section,Class Code,WoRev Bill Cycle,Child Acct Name,...,Business Unit GL,GL_LOB_L1,Pub,Revenue,Pub_key,Market,Job Number +,Sum of 'Revenue',Count of Matches,MSP/non-MSP
5,2025,8,FF2923170,339085,6.2,CLS Display,Wave2 Death Notices,13000,Classified Commercial,HOLLY,...,3004,Newspapers Digital,Connpost.com,107.37,connpost.com,FF,2923170,310.67,3,Non-MSP
106,2025,8,FF2938786,110053,17.6,CLS Display,Wave2 Death Notices,13000,Classified Commercial,COGNETTA FUNERAL,...,3004,Newspapers Digital,Connpost.com,240.14,connpost.com,FF,2938786,752.21,3,Non-MSP
107,2025,8,FF2938788,376122,19.2,CLS Display,Wave2 Death Notices,13000,Classified Commercial,STEVEN,...,3004,Newspapers Digital,Connpost.com,355.18,connpost.com,FF,2938788,1134.79,4,Non-MSP
108,2025,8,FF2938790,376123,15.0,CLS Display,Wave2 Death Notices,13000,Classified Commercial,TRACY,...,3004,Newspapers Digital,Connpost.com,211.63,connpost.com,FF,2938790,657.40,4,Non-MSP
109,2025,8,FF2938793,125486,11.4,CLS Display,Wave2 Death Notices,13000,Classified Commercial,WIILLIAM MCDONALD F.H.,...,3004,Newspapers Digital,Connpost.com,163.00,connpost.com,FF,2938793,495.67,3,Non-MSP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2438,2025,8,WA2944149,327078,33.2,CLS Display,Wave2 Death Notices,Life Tributes,Classified Commercial,JONATHAN,...,3004,Newspapers Digital,Rep-Am.com,155.51,rep-am.com,WA,2944149,1105.28,3,Non-MSP
2439,2025,8,WA2944179,366534,10.8,CLS Display,Wave2 Death Notices,Life Tributes,Classified Commercial,CASEY FUNERAL HOME,...,3004,Newspapers Digital,Rep-Am.com,35.62,rep-am.com,WA,2944179,280.14,3,Non-MSP
2441,2025,8,WA2944181,326509,16.8,CLS Display,Wave2 Death Notices,Life Tributes,Classified Commercial,PANAGIOTA,...,3004,Newspapers Digital,Rep-Am.com,67.24,rep-am.com,WA,2944181,497.76,3,Non-MSP
2442,2025,8,WA2944189,370930,14.4,CLS Display,Wave2 Death Notices,Life Tributes,Classified Commercial,STEPHANIE,...,3004,Newspapers Digital,Rep-Am.com,54.06,rep-am.com,WA,2944189,407.08,3,Non-MSP


In [81]:
sisense_provided=load_excel_file(
    path=HEARST_RAW_DIR,               
    file_name=HEASRT_FILE,
    sheet_name="Sisense",                  
)

In [83]:
sisense_provided

Unnamed: 0,Job Number,Sum of 'Revenue',Year,Period #,Job Number +,Child Acct #,Inches,Ad Type,Section,Class Code,...,Business Unit GL,GL_LOB_L1,Pub,Revenue,Count of matches,Verified Strategic,Welcome Back,Renewal,Revenue Date,Wave2 Prior Bill
0,FF2939642,145.00,2025,8,2939642,227873,10.65,CLS Liner,Merchandise,2900,...,3005,Newspapers Digital,stamfordadvocate.com,20.00,11,0,0,0,2025-07-28,
1,FF2940753,211.76,2025,8,2940753,227873,16.50,CLS Liner,Merchandise,2900,...,3005,Newspapers Digital,stamfordadvocate.com,5.00,10,0,0,0,2025-07-28,
2,FF2938757,3800.00,2025,8,2938757,147346,30.00,CLS Display,Employment,200,...,3004,Newspapers Traditional,Connecticut Post,1146.64,8,0,0,0,2025-07-28,
3,FF2939124,50.00,2025,8,2939124,347795,5.22,CLS Liner,Auto-Truck Sale-Serv,7200,...,3004,Newspapers Digital,Connpost.com,2.50,8,0,0,1,2025-07-28,
4,FF2939510,137.95,2025,8,2939510,182356,6.86,CLS Liner,Employment,272,...,3005,Newspapers Digital,stamfordadvocate.com,20.00,8,0,0,0,2025-07-28,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2406,WA2943559,27.78,2025,8,2943559,372352,0.78,CLS Liner,Merchandise,3000,...,3004,Newspapers Traditional,Republican-American,27.78,1,0,0,0,2025-07-28,
2407,WA2943636,32.21,2025,8,2943636,376416,1.17,CLS Liner,Merchandise,3000,...,3004,Newspapers Traditional,Republican-American,32.21,1,0,0,0,2025-07-28,
2408,WA2943664,1350.00,2025,8,2943664,376417,45.00,Legal Display,Public Notices,11030,...,3004,Newspapers Traditional,Republican-American,1350.00,1,0,0,0,2025-07-28,
2409,WA2943667,27.10,2025,8,2943667,373224,1.16,CLS Liner,Auto-Truck Sale-Serv,7207,...,3004,Newspapers Traditional,Republican-American,27.10,1,0,0,0,2025-07-28,


In [82]:
processed_df1

Unnamed: 0,Year,Period #,Job Number,Child Acct #,Inches,Ad Type,Section,Class Code,WoRev Bill Cycle,Child Acct Name,...,Business Unit GL,GL_LOB_L1,Pub,Revenue,Pub_key,Market,Job Number +,Sum of 'Revenue',Count of Matches,MSP/non-MSP
0,2025,8,FF2888723,156557,3.44,CLS Liner,Rentals,05600,Classified Commercial,"WESTPORT, CT BUILDING LOTS",...,3004,Newspapers Traditional,Fairfield Citizen Ne,15.40,fairfield citizen ne,FF,2888723,30.80,2,MSP
1,2025,8,FF2892888,142168,1.56,Legal Liners,Public Notices,11030,Classified Commercial,JP MORGAN CHASE BANK,...,3006,Newspapers Digital,newstimes.com,10.00,newstimes.com,FF,2892888,105.20,2,MSP
2,2025,8,FF2917124,373610,0.01,Online Only,LocalEdge Social Med,OL Retail,Classified Commercial,TOWN OF WALLINGFORD,...,3004,Newspapers Digital,Connpost.com,500.00,connpost.com,FF,2917124,500.00,1,MSP
3,2025,8,FF2918533,146439,0.50,CLS Liner,Service Directory,07346,Retail,TC-PCS,...,3005,Newspapers Digital,stamfordadvocate.com,0.00,stamfordadvocate.com,FF,2918533,20.00,6,MSP
4,2025,8,FF2920793,129375,0.00,Legal Display,Public Notices,11030,Classified Commercial,DANBURY LEGISLATIVE ASSISTANT,...,3006,Newspapers Traditional,Danbury News-Times,-1032.20,danbury news-times,FF,2920793,-1032.20,1,MSP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2439,2025,8,WA2944179,366534,10.80,CLS Display,Wave2 Death Notices,Life Tributes,Classified Commercial,CASEY FUNERAL HOME,...,3004,Newspapers Digital,Rep-Am.com,35.62,rep-am.com,WA,2944179,280.14,3,Non-MSP
2440,2025,8,WA2944180,125602,13.20,CLS Display,Wave2 Death Notices,Life Tributes,Classified Commercial,WOODTICK MEMORIAL,...,3004,Newspapers Digital,Rep-Am.com,48.79,rep-am.com,WA,2944180,370.81,3,MSP
2441,2025,8,WA2944181,326509,16.80,CLS Display,Wave2 Death Notices,Life Tributes,Classified Commercial,PANAGIOTA,...,3004,Newspapers Digital,Rep-Am.com,67.24,rep-am.com,WA,2944181,497.76,3,Non-MSP
2442,2025,8,WA2944189,370930,14.40,CLS Display,Wave2 Death Notices,Life Tributes,Classified Commercial,STEPHANIE,...,3004,Newspapers Digital,Rep-Am.com,54.06,rep-am.com,WA,2944189,407.08,3,Non-MSP


In [None]:
sisense_provided["_agent_lower"] = rep_list["Agent Names"].astype(str)

In [86]:
sisense_provided.dtypes

Job Number                    object
Sum of 'Revenue'             float64
Year                           int64
Period #                       int64
Job Number +                   int64
Child Acct #                   int64
Inches                       float64
Ad Type                       object
Section                       object
Class Code                    object
WoRev Bill Cycle              object
Child Acct Name               object
First Issue Date      datetime64[ns]
Full Name LF                  object
Commission Rep                object
MSP/non-MSP                   object
Business Unit GL               int64
GL_LOB_L1                     object
Pub                           object
Revenue                      float64
Count of matches               int64
Verified Strategic             int64
Welcome Back                   int64
Renewal                        int64
Revenue Date          datetime64[ns]
Wave2 Prior Bill             float64
dtype: object

In [None]:
sis_mer=sisense_provided.merge(
    processed_df1,
    on="Job Number",
    how="left",
)

In [90]:
sis_mer['Job Number + ']== None

KeyError: 'Job Number + '