# In [3]:
import csv
import os
import re
import xml.etree.ElementTree as ET
from datetime import datetime
import pandas as pd
import nltk
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag
import textstat
from transformers import BertTokenizer

# Input/output locations and the pretrained BERT tokenizer
folder_path = "gold_standard_files"
output_file = "raw_data.csv"
xlsx_file = "gold_standard.xlsx"
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load the gold standard spreadsheet, keeping only the rows whose "DocID"
# cell is filled in
gold_standard_df = pd.read_excel(xlsx_file).dropna(subset=["DocID"])

# Names of the XML files found in the folder, in sorted order
xml_files = sorted(
    entry for entry in os.listdir(folder_path) if entry.endswith(".xml")
)

# Function to parse the publication date and calculate the age of the concept in months
def calculate_age_of_concept(pubdate_str, reference_date=datetime(1994, 9, 1)):
    """Return the number of calendar months from a reference month to a publication date.

    Only the year and month of each date are compared; the day of the month
    does not influence the result.

    Args:
        pubdate_str: Publication date formatted like ``"Sep 1, 1994"``
            (``"%b %d, %Y"``). Leading and trailing whitespace is ignored.
        reference_date: The ``datetime`` whose month counts as month 0.
            Defaults to September 1, 1994.

    Returns:
        The month difference as an int; negative when the publication date
        falls in a month before the reference month.

    Raises:
        ValueError: If ``pubdate_str`` does not match the expected format.
    """
    # strptime rejects surrounding whitespace, so strip it before parsing.
    publication_date = datetime.strptime(pubdate_str.strip(), "%b %d, %Y")
    year_gap = publication_date.year - reference_date.year
    month_gap = publication_date.month - reference_date.month
    return year_gap * 12 + month_gap

# Build the feature CSV: one row per XML file listed in ``xml_files``.
#
# For each document the loop parses the XML, counts keyword/author/institute
# mentions, derives simple text statistics (sentences, sentiment, POS counts,
# readability) and joins in the document type, career stage and year from
# ``gold_standard_df``. All text statistics are computed on the lower-cased
# serialization of the whole XML tree, so markup is part of that text.


def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0 when the denominator is 0."""
    return numerator / denominator if denominator else 0


# Patterns are compiled once; each is applied to lower-cased text per file.
_BELL_CURVE_RE = re.compile(r'\bbell curve\b')
_MURRAY_RE = re.compile(r'\bmurray\b')
_HERRNSTEIN_RE = re.compile(r'\bherrnstein\b')
_HARVARD_RE = re.compile(r'\bharvard\b')
# The group makes the leading \b apply to both spellings; without it the
# bare "iq" alternative also matched inside longer words ending in "iq".
_IQ_RE = re.compile(r'\b(?:i\.q\.|iq\b)')
_INTELLIGENCE_RE = re.compile(r'\bintelligence\b')
_RACE_RE = re.compile(r'\brace\b')

# DocID column as strings, converted once instead of once per document.
_gold_doc_ids = gold_standard_df["DocID"].astype(str)

# Month 0 for the "Age Of The Concept In Months" column.
reference_date = datetime(1994, 9, 1)

contextual_phrases = ["in this study", "in this research", "the purpose of this study", "this research"]
interpretative_words = ["because", "cause", "effect", "impact", "result", "consequence", "reason", "rationale"]

# Create the CSV file and write the header. The encoding is explicit so the
# output does not depend on the platform's default locale encoding.
with open(output_file, "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["Doc ID", "Institute Mentions", "Authors Mentions",
                     "Concept Mentions (Total)", "Concept Mentions (Abstract)",
                     "Concept Mentions (Number Of Paragraphs)",
                     "Age Of The Concept In Months",
                     "Average Mentions per Paragraph", "Percentage of Mentions in Abstract",
                     "Length of the Document", "Keyword 1 mentions", "Keyword 2 mentions",
                     "Keyword 3 mentions", "Document Type", "Career Stage",
                     "Number of Unique Words", "Number of Sentences",
                     "Average Sentence Length", "Sentiment Score", "Exclamation Marks Count",
                     "Question Marks Count", "Position of First Theory Mention",
                     "Contextual Phrase Count", "Interpretative Word Count",
                     "Number of Adjectives", "Number of Nouns",
                     "Number of Verbs", "Number of Adverbs", "Lexical Density", "Flesch Reading Ease", "Year"])

    # Iterate over each XML file in the sorted order
    for filename in xml_files:
        xml_file = os.path.join(folder_path, filename)

        # Parse the XML file
        tree = ET.parse(xml_file)
        root = tree.getroot()

        # Extract the document ID
        doc_id = root.find(".//doc-id").attrib["id-string"]

        # Add leading 0 to the doc ID if it is only six digits long
        if len(doc_id) == 6:
            doc_id = "0" + doc_id

        # Lower-cased serialization of the whole document (markup included)
        text_content = ET.tostring(root, encoding="unicode").lower()

        # Author mentions are reported as a single combined count
        author1_mentions = len(_MURRAY_RE.findall(text_content))
        author2_mentions = len(_HERRNSTEIN_RE.findall(text_content))
        combined_mentions = author1_mentions + author2_mentions

        concept_total_mentions = len(_BELL_CURVE_RE.findall(text_content))

        # Mentions inside the lead paragraph block, when the document has one
        abstract_text = root.find(".//block[@class='lead_paragraph']")
        if abstract_text is not None:
            abstract_content = ET.tostring(abstract_text, encoding="unicode").lower()
            concept_abstract_mentions = len(_BELL_CURVE_RE.findall(abstract_content))
        else:
            concept_abstract_mentions = 0

        # Count the mentions of "bell curve" and "harvard" in paragraphs
        paragraph_texts = root.findall(".//block[@class='full_text']/p")
        num_paragraphs = len(paragraph_texts)
        paragraph_mentions = 0
        institute_mentions = 0
        for paragraph_text in paragraph_texts:
            # .text is None for an empty <p> or one that starts with a child
            # element; treat that as empty text instead of crashing.
            paragraph_text_lower = (paragraph_text.text or "").lower()
            paragraph_mentions += len(_BELL_CURVE_RE.findall(paragraph_text_lower))
            institute_mentions += len(_HARVARD_RE.findall(paragraph_text_lower))

        # Calculate the month distance from September 1994. When either meta
        # tag is missing there is no valid date, so the cell is left empty
        # (csv writes None as an empty string) rather than raising.
        publication_month_elem = root.find(".//meta[@name='publication_month']")
        publication_year_elem = root.find(".//meta[@name='publication_year']")
        if publication_month_elem is None or publication_year_elem is None:
            month_distance = None
        else:
            publication_month = int(publication_month_elem.attrib["content"])
            publication_year = int(publication_year_elem.attrib["content"])
            target_date = datetime(publication_year, publication_month, 1)
            month_distance = ((target_date.year - reference_date.year) * 12
                              + (target_date.month - reference_date.month))

        # Find the gold standard rows whose DocID contains this doc ID.
        # regex=False keeps the ID a literal substring; the lookup is done
        # once and reused for document type, career stage and year.
        matching_rows = gold_standard_df[_gold_doc_ids.str.contains(doc_id, regex=False)]
        has_match = not matching_rows.empty

        # Document type: 1 for List/Review/Event, otherwise 0
        document_type_label = 0
        if has_match and matching_rows["documentType"].values[0] in ['List', 'Review', 'Event']:
            document_type_label = 1

        # Combine the "IDEA CAREER 3" values of all matching rows
        idea_career_3_values = matching_rows["IDEA CAREER 3"].astype(str).values.tolist()
        idea_career_3_combined = ", ".join(idea_career_3_values)

        # Get the 'year' value from the first matching row, if there is one
        if has_match and "year" in matching_rows:
            year = matching_rows["year"].values[0]
        else:
            year = None

        # Ratios fall back to 0 when the denominator is 0
        avg_mentions_per_paragraph = _safe_ratio(paragraph_mentions, num_paragraphs)
        percent_mentions_in_abstract = _safe_ratio(concept_abstract_mentions, concept_total_mentions)

        # Length of the document in characters of the serialized text
        doc_length = len(text_content)

        # Keyword counts: "i.q."/"iq", "intelligence", "race"
        keyword1_mentions = len(_IQ_RE.findall(text_content))
        keyword2_mentions = len(_INTELLIGENCE_RE.findall(text_content))
        keyword3_mentions = len(_RACE_RE.findall(text_content))

        # Whitespace-separated tokens and the number of distinct ones
        words = text_content.split()
        num_unique_words = len(set(words))

        # Sentence count and average sentence length in tokens
        sentences = nltk.sent_tokenize(text_content)
        num_sentences = len(sentences)
        avg_sentence_length = _safe_ratio(sum(len(s.split()) for s in sentences), num_sentences)

        # Sentiment polarity of the whole text
        sentiment_score = TextBlob(text_content).sentiment.polarity

        # Count number of exclamation and question marks
        exclamation_marks_count = text_content.count('!')
        question_marks_count = text_content.count('?')

        # Position of the first concept mention, or -1 when absent. The text
        # is lower-cased, so the search term has to be lower-case as well.
        position_first_theory_mention = text_content.find("bell curve")

        # NOTE(review): both counts below are substring counts, so e.g.
        # "because" also contributes to the "cause" count.
        contextual_phrase_count = sum(text_content.count(phrase) for phrase in contextual_phrases)
        interpretative_word_count = sum(text_content.count(word) for word in interpretative_words)

        # Count adjectives, nouns, verbs, and adverbs.
        # NOTE(review): only the base tags JJ/NN/VB/RB are counted; inflected
        # tags such as NNS or VBD are not included. Confirm this is intended.
        tagged = pos_tag(word_tokenize(text_content))
        counts = nltk.FreqDist(tag for (word, tag) in tagged)
        num_adjectives = counts['JJ']
        num_nouns = counts['NN']
        num_verbs = counts['VB']
        num_adverbs = counts['RB']

        # Share of distinct tokens among all tokens
        lexical_density = _safe_ratio(num_unique_words, len(words))

        # Calculate Flesch Reading Ease
        flesch_reading_ease = textstat.flesch_reading_ease(text_content)

        # NOTE(review): paragraph_mentions (a mention count) is written under
        # the "Concept Mentions (Number Of Paragraphs)" header.
        writer.writerow([doc_id, institute_mentions, combined_mentions, concept_total_mentions,
                         concept_abstract_mentions, paragraph_mentions, month_distance,
                         avg_mentions_per_paragraph, percent_mentions_in_abstract, doc_length,
                         keyword1_mentions, keyword2_mentions, keyword3_mentions,
                         document_type_label, idea_career_3_combined, num_unique_words,
                         num_sentences, avg_sentence_length, sentiment_score,
                         exclamation_marks_count, question_marks_count, position_first_theory_mention,
                         contextual_phrase_count, interpretative_word_count, num_adjectives, num_nouns,
                         num_verbs, num_adverbs, lexical_density, flesch_reading_ease, year])
