In [3]:
import pandas as pd
import re
from datetime import datetime

def calculate_age_of_concept(pubdate_str, reference_date=datetime(1994, 9, 1)):
    """Return the number of calendar months between a reference date and a publication date.

    Only the year and month take part in the calculation; the day of the
    month is ignored. The result is negative when the publication date
    falls in a month before the reference month.

    Args:
        pubdate_str: Publication date formatted like ``"Oct 15, 1995"``
            (``"%b %d, %Y"``). Leading/trailing whitespace is ignored.
            A ``datetime`` instance is also accepted and used directly.
        reference_date: Date that counts as month zero. Defaults to
            September 1, 1994.

    Returns:
        The month difference as an ``int``, or ``None`` when the input is
        missing, is not a string/datetime, or does not match the expected
        format.
    """
    if isinstance(pubdate_str, datetime):
        publication_date = pubdate_str
        # Guard against "not a time" placeholders (e.g. pandas' NaT, which
        # is expected to pass the isinstance check): they compare unequal
        # to themselves, real datetimes never do.
        if publication_date != publication_date:
            return None
    elif isinstance(pubdate_str, str):
        try:
            publication_date = datetime.strptime(pubdate_str.strip(), "%b %d, %Y")
        except ValueError:
            return None
    else:
        # None, NaN and other non-date values are treated as "no date"
        # instead of raising TypeError from strptime.
        return None

    year_delta = publication_date.year - reference_date.year
    month_delta = publication_date.month - reference_date.month
    return year_delta * 12 + month_delta

# Read the scored paragraphs (CSV) and the gold standard (Excel), casting the
# DocID column of each to str as part of the load so that all later ID
# handling works on text.
highest_scoring_df = pd.read_csv("highest_scoring_paragraphs.csv").astype({"DocID": str})
gold_standard_df = pd.read_excel("gold_standard.xlsx").astype({"DocID": str})

# Patterns are compiled once, outside the loop. Paragraphs are lowercased
# before matching, so every pattern is written in lowercase.
institute_re = re.compile(r'\bharvard\b')
authors_re = re.compile(r'\b(?:herrnstein|murray)\b')
concept_re = re.compile(r'\bbell curve\b')
# Both spellings need a leading word boundary; without the group the boundary
# applied only to "i.q." and a bare "iq" matched the tail of any longer word.
# No trailing \b after "i.q." because a boundary cannot follow the final dot
# when whitespace or punctuation comes next.
iq_re = re.compile(r'\b(?:i\.q\.|iq\b)')
intelligence_re = re.compile(r'\bintelligence\b')
race_re = re.compile(r'\brace\b')

# Per-document counters, keyed by DocID. Key order inside each counter dict
# determines the column order of the final output.
aggregated_data = {}

# Accumulate the counts of every paragraph into its document's entry.
for doc_id, raw_paragraph in zip(highest_scoring_df['DocID'], highest_scoring_df['Paragraph']):
    # A missing paragraph is read as NaN (a float); count it as empty text so
    # the document still gets an entry instead of crashing on .lower().
    paragraph = raw_paragraph.lower() if isinstance(raw_paragraph, str) else ""

    if doc_id not in aggregated_data:
        aggregated_data[doc_id] = {
            "Institute Mentions": 0,
            "Authors Mentions": 0,
            "Concept Mentions (Total)": 0,
            "Concept Mentions (Number Of Paragraphs)": 0,
            "Length of the Document": 0,
            "Keyword 1 mentions": 0,
            "Keyword 2 mentions": 0,
            "Keyword 3 mentions": 0,
        }

    data = aggregated_data[doc_id]
    data["Institute Mentions"] += len(institute_re.findall(paragraph))
    data["Authors Mentions"] += len(authors_re.findall(paragraph))

    concept_mentions = len(concept_re.findall(paragraph))
    data["Concept Mentions (Total)"] += concept_mentions
    if concept_mentions > 0:
        # Counts paragraphs containing the concept, not individual mentions.
        data["Concept Mentions (Number Of Paragraphs)"] += 1

    # Document length is the whitespace-delimited word count of its paragraphs.
    data["Length of the Document"] += len(paragraph.split())
    data["Keyword 1 mentions"] += len(iq_re.findall(paragraph))
    data["Keyword 2 mentions"] += len(intelligence_re.findall(paragraph))
    data["Keyword 3 mentions"] += len(race_re.findall(paragraph))

def _split_and_pad_ids(raw_ids):
    """Split a comma-separated DocID string into stripped, 7-character zero-padded IDs."""
    fragments = (fragment.strip() for fragment in str(raw_ids).split(','))
    # Empty fragments (e.g. from a trailing comma) are dropped so they do not
    # turn into the bogus ID "0000000".
    return [fragment.zfill(7) for fragment in fragments if fragment]


# Zero-pad DocIDs to 7 characters in the paragraph DataFrame.
highest_scoring_df["DocID"] = highest_scoring_df["DocID"].astype(str).apply(lambda x: x.zfill(7))

# In the gold standard, pad every ID of a comma-separated cell individually.
# Padding the cell as a whole would leave the IDs inside a multi-ID cell
# unpadded, and the padded IDs searched for below could never match them.
gold_standard_df["DocID"] = gold_standard_df["DocID"].astype(str).apply(
    lambda raw: ",".join(_split_and_pad_ids(raw))
)

# Attach gold-standard metadata to every aggregated document.
for doc_id, data in aggregated_data.items():
    # Values stay None when no gold-standard row supplies them.
    idea_career_3 = None
    year = None
    age_of_concept = None

    # A DocID key may itself hold several comma-separated IDs.
    for single_doc_id in _split_and_pad_ids(doc_id):
        # Word boundaries keep the ID from matching inside a longer ID.
        id_pattern = r'\b' + re.escape(single_doc_id) + r'\b'
        matching_rows = gold_standard_df[gold_standard_df["DocID"].str.contains(id_pattern)]

        # Later matching rows overwrite earlier ones; absent columns and
        # missing (NaN) cells never overwrite a value.
        for _, row in matching_rows.iterrows():
            career = row.get('IDEA CAREER 3')
            if pd.notna(career):
                idea_career_3 = career

            row_year = row.get('year')
            if pd.notna(row_year):
                year = row_year

            pubdate = row.get('pubdate')
            if pd.notna(pubdate):
                # pd.Timestamp is a datetime subclass, so this branch covers
                # both Timestamp cells and plain datetime cells.
                if isinstance(pubdate, datetime):
                    pubdate_str = pubdate.strftime("%b %d, %Y")
                else:
                    pubdate_str = str(pubdate)
                age_of_concept = calculate_age_of_concept(pubdate_str)

    # Store the extracted values alongside the mention counts.
    data["IDEA CAREER 3"] = idea_career_3
    data["year"] = year
    data["Age Of The Concept In Months"] = age_of_concept

# Build the output table: one row per document, with the dictionary key
# exposed as a regular "Doc ID" column and reader-friendly column names.
column_labels = {
    'index': 'Doc ID',
    'IDEA CAREER 3': 'Career Stage',
    'year': 'Year',
}
output_df = (
    pd.DataFrame.from_dict(aggregated_data, orient='index')
    .reset_index()
    .rename(columns=column_labels)
)

# Write the table to disk without the positional index.
output_df.to_csv("raw_data_highest_scoring_paragraphs.csv", index=False)