In [3]:
import pandas as pd
import re
from datetime import datetime

# Load the data from the CSV and Excel files
highest_scoring_df = pd.read_csv("highest_scoring_paragraphs.csv")
gold_standard_df = pd.read_excel("gold_standard.xlsx")

# Ensure DocID in gold_standard_df is a string
gold_standard_df["DocID"] = gold_standard_df["DocID"].astype(str)

# Function to parse the publication date and calculate the age of the concept in months
def calculate_age_of_concept(pubdate_str):
    publication_date = datetime.strptime(pubdate_str, "%b %d, %Y")
    reference_date = datetime(1994, 9, 1)
    return (publication_date.year - reference_date.year) * 12 + (publication_date.month - reference_date.month)

# Initialize a dictionary to store aggregated data
aggregated_data = {}

# Iterate over each row in the highest_scoring_df
for index, row in highest_scoring_df.iterrows():
    doc_id = row['DocID']
    paragraph = row['Paragraph'].lower()

    # Initialize dictionary for new DocID
    if doc_id not in aggregated_data:
        aggregated_data[doc_id] = {
            "Institute Mentions": 0,
            "Authors Mentions": 0,
            "Concept Mentions (Total)": 0,
            "Concept Mentions (Number Of Paragraphs)": 0,
            "Length of the Document": 0,
            "Keyword 1 mentions": 0,
            "Keyword 2 mentions": 0,
            "Keyword 3 mentions": 0,
        }

    # Update counts
    aggregated_data[doc_id]["Institute Mentions"] += len(re.findall(r'\bharvard\b', paragraph))
    authors_mentions = len(re.findall(r'\bherrnstein\b', paragraph)) + len(re.findall(r'\bmurray\b', paragraph))
    aggregated_data[doc_id]["Authors Mentions"] += authors_mentions
    concept_mentions = len(re.findall(r'\bbell curve\b', paragraph))
    aggregated_data[doc_id]["Concept Mentions (Total)"] += concept_mentions
    if concept_mentions > 0:
        aggregated_data[doc_id]["Concept Mentions (Number Of Paragraphs)"] += 1
    aggregated_data[doc_id]["Length of the Document"] += len(paragraph.split())
    aggregated_data[doc_id]["Keyword 1 mentions"] += len(re.findall(r'\bi\.q\.|iq\b', paragraph))
    aggregated_data[doc_id]["Keyword 2 mentions"] += len(re.findall(r'\bintelligence\b', paragraph))
    aggregated_data[doc_id]["Keyword 3 mentions"] += len(re.findall(r'\brace\b', paragraph))

# Merge the gold standard data for additional features
for doc_id, data in aggregated_data.items():
    # Ensure doc_id is a string
    str_doc_id = str(doc_id)
    matching_row = gold_standard_df[gold_standard_df["DocID"].str.contains(str_doc_id)]
    
    if not matching_row.empty:
        publication_year = matching_row["year"].values[0]
        publication_month = matching_row["publication_month"].values[0]  # Assuming 'publication_month' column exists
        data["Age Of The Concept In Months"] = calculate_age_of_concept(publication_year, publication_month)
        data["Document Type"] = matching_row["documentType"].values[0]
        data["Career Stage"] = matching_row["careerStage"].values[0]  # Assuming 'careerStage' column exists
        data["Year"] = publication_year
    else:
        # Default values if no matching row is found
        data["Age Of The Concept In Months"] = None
        data["Document Type"] = None
        data["Career Stage"] = None
        data["Year"] = None

# Create a DataFrame from the aggregated data
output_df = pd.DataFrame.from_dict(aggregated_data, orient='index')
output_df.index.name = "Doc ID"
output_df.reset_index(inplace=True)

# Save the DataFrame to a CSV file
output_df.to_csv("raw_data_highest_scoring_paragraphs.csv", index=False)


KeyError: 'publication_month'