In [None]:
import json
# Load the clause annotations. JSON text is UTF-8 by specification, so decode it
# explicitly instead of relying on the platform's default encoding.
with open("data/dependency_parsing_data/results/annotate_clauses_parents_final.json", "r", encoding="utf-8") as f:
    clauses_parents = json.load(f)

# for analyzing 

In [None]:
import json
import stanza
from tqdm import tqdm

# Fetch the English Stanza models, then build the pipeline object `nlp`
# that extract_subjects_and_related_words() calls to parse text.
stanza.download(lang='en')
nlp = stanza.Pipeline(lang='en')

# Functions to find ancestors and descendants
def find_ancestors(word, sentence_words):
    """Return the texts of all heads above ``word``, nearest head first.

    Args:
        word: Starting word; needs a ``head`` attribute holding the 1-based
            position of its governor in ``sentence_words`` (0 for the root).
        sentence_words: Sequence of all words of the sentence; each needs
            ``head`` and ``text`` attributes.

    Returns:
        list[str]: Text of the direct head, then of that head's head, and so
        on up to and including the root. Empty when ``word`` is the root.
    """
    chain = []
    head_position = word.head
    # A head position of 0 marks the root, which has no governor to climb to.
    while head_position > 0:
        governor = sentence_words[head_position - 1]  # positions are 1-based
        chain.append(governor.text)
        head_position = governor.head
    return chain

def find_descendants(word, sentence_words):
    """Return the texts of every word in the dependency subtree below ``word``.

    Args:
        word: Starting word; needs an ``id`` attribute.
        sentence_words: Iterable of all words of the sentence; each needs
            ``id``, ``head`` (the ``id`` of its governor) and ``text``.

    Returns:
        list[str]: Texts of all direct and indirect dependents of ``word``.
        Direct dependents come first in sentence order; afterwards the most
        recently discovered word is expanded next (stack order), and each
        expansion again lists dependents in sentence order. Empty when
        ``word`` has no dependents.
    """
    # Index dependents by their head once, so expanding a node is a dictionary
    # lookup instead of another scan over the whole sentence.
    children_by_head = {}
    for w in sentence_words:
        children_by_head.setdefault(w.head, []).append(w)

    descendants = []
    stack = [word]
    while stack:
        current = stack.pop()
        # Lists in the index keep sentence order, so the output order is the
        # same as scanning the sentence for words whose head is `current`.
        for child in children_by_head.get(current.id, ()):
            descendants.append(child.text)
            stack.append(child)
    return descendants

# Function to extract subjects and their related words from a clause
def extract_subjects_and_related_words(clause_text):
    """Parse ``clause_text`` and describe every nominal subject found in it.

    Runs the module-level Stanza pipeline ``nlp`` on the text and, for each
    word whose dependency relation is a nominal-subject label, records the
    subject together with its ancestors and descendants in the parse tree.

    Args:
        clause_text: Raw text to parse.

    Returns:
        list[dict]: One dict per matching word, in the order the words occur,
        with keys ``'subject'`` (the word's text),
        ``'words_pointing_to_subject'`` (result of ``find_ancestors``) and
        ``'words_subject_points_to'`` (result of ``find_descendants``).
        Empty when no subject is found.
    """
    # Stanza's models use Universal Dependencies v2 labels, where a passive
    # subject is 'nsubj:pass'. The previous filter only listed the older
    # 'nsubjpass' spelling, so passive subjects were silently skipped. The old
    # label is kept so parses that use it are still recognised.
    subject_relations = ('nsubj', 'nsubj:pass', 'nsubjpass')

    doc = nlp(clause_text)
    results = []

    for sentence in doc.sentences:
        for word in sentence.words:
            if word.deprel in subject_relations:
                results.append({
                    'subject': word.text,
                    'words_pointing_to_subject': find_ancestors(word, sentence.words),
                    'words_subject_points_to': find_descendants(word, sentence.words)
                })
    return results

# Function to process a single clause
def process_clause(clause):
    """Attach subject information to one clause record.

    Args:
        clause: Dict with a ``'sentence'`` key holding the text to parse.

    Returns:
        dict: The same ``clause`` object, modified in place with a new
        ``'subjects'`` key holding the output of
        ``extract_subjects_and_related_words`` for its sentence.
    """
    subjects = extract_subjects_and_related_words(clause['sentence'])
    clause['subjects'] = subjects
    return clause

# Function to process clauses
def process_clauses(clauses):
    """Run ``process_clause`` over every clause, showing a tqdm progress bar.

    Args:
        clauses: Iterable of clause dicts accepted by ``process_clause``.

    Returns:
        list: The processed clauses in input order. These are the same dict
        objects that were passed in, since ``process_clause`` updates each one
        in place.
    """
    progress = tqdm(clauses, desc="Processing clauses")
    return [process_clause(item) for item in progress]


# Process the clauses.
# `annotated_clauses` is not defined in any visible cell, so running the notebook
# from a fresh kernel raised NameError here. The clause list loaded in the first
# cell is `clauses_parents`, so that is what gets parsed.
processed_clauses = process_clauses(clauses_parents)

# Save the processed clauses to a JSON file in the current working directory
with open('dependency_parsing_parents.json', 'w', encoding='utf-8') as f:
    json.dump(processed_clauses, f, indent=4)




In [None]:
# map clause tags back to the annotated json with object category labels
import json

# Load the data from both JSON files. JSON text is UTF-8 by specification, so it is
# decoded explicitly instead of relying on the platform's default encoding.
with open('/Users/sunnyyu/Desktop/research/children_speech_books/code/python/generics/dependency_parsing.json', 'r', encoding='utf-8') as file:
    generics_data = json.load(file)

with open('/Users/sunnyyu/Desktop/research/children_speech_books/code/python/analysis/annotated/annotated_books_clean.json', 'r', encoding='utf-8') as file:
    books_data = json.load(file)

# generics_data is iterated as a flat list of dicts; books_data is iterated as a list
# of books, each of which is a list of sentence dicts.
# Create a dictionary for quick lookup by sentence text. If the same sentence text
# occurs more than once in generics_data, the last entry wins.
generics_dict = {item['sentence']: item for item in generics_data}

# Merge the data: copy 'category' and 'subjects' onto every book sentence whose text
# has a parsed counterpart. Sentences without a match are left unchanged.
for book in books_data:
    for sentence in book:
        sentence_str = sentence['sentence']
        parsed = generics_dict.get(sentence_str)  # one lookup instead of three
        if parsed is not None:
            sentence.update({
                'category': parsed['category'],
                'subjects': parsed['subjects']
            })

# Save the merged data to a new JSON file in the current working directory
with open('merged_data.json', 'w', encoding='utf-8') as file:
    json.dump(books_data, file, indent=4)

print("Data merged and saved successfully.")

In [None]:
# merge parents speech BIO tags with dependency parsing and situation entity type data

# attach 'tags', 'age_min' and 'age_max' from the parents data to each parsed clause, matching on sentence text
import json

# Load the data from both JSON files. JSON text is UTF-8 by specification, so it is
# decoded explicitly instead of relying on the platform's default encoding.
# NOTE(review): the first input has the same file name as the output written by the
# "add category to dependency parsing" cell further down. If it is that file, that
# cell has to run (and its output be moved to this path) before this one. Confirm.
with open('/Users/sunnyyu/Desktop/research/children_speech_books/code/python/generics/dependency_parsing_parents_final_updated.json', 'r', encoding='utf-8') as file:
    generics_data = json.load(file)

with open('/Users/sunnyyu/Desktop/research/children_speech_books/code/python/generics/childes_parent_clean.json', 'r', encoding='utf-8') as file:
    parents_data = json.load(file)

# Both generics_data and parents_data are iterated as flat lists of dicts.
# Create a dictionary for quick lookup by sentence text. Despite its name,
# generics_dict is built from parents_data. If the same sentence text occurs more
# than once, the last entry wins.
generics_dict = {item['sentence']: item for item in parents_data}

# Merge the data: copy 'tags', 'age_min' and 'age_max' onto every parsed clause whose
# sentence text appears in parents_data. Clauses without a match are left unchanged.
for sentence in generics_data:
    sentence_str = sentence['sentence']
    parent_entry = generics_dict.get(sentence_str)  # one lookup instead of four
    if parent_entry is not None:
        sentence.update({
                'tags': parent_entry['tags'],
                'age_min': parent_entry['age_min'],
                'age_max': parent_entry['age_max']
            })

# Save the merged data to a new JSON file in the current working directory
with open('merged_data_parents_speech.json', 'w', encoding='utf-8') as file:
    json.dump(generics_data, file, indent=4)

print("Data merged and saved successfully.")

In [None]:
# merge parents speech data

# add category to dependency parsing

import json

# Load JSON data. JSON text is UTF-8 by specification, so it is decoded explicitly
# instead of relying on the platform's default encoding.
with open('/Users/sunnyyu/Desktop/research/children_speech_books/code/python/generics/parents_generics/results/annotate_clauses_parents_final.json', 'r', encoding='utf-8') as file:
    clauses_data = json.load(file)

with open('/Users/sunnyyu/Desktop/research/children_speech_books/code/python/generics/parents_generics/dependency_parsing/dependency_parsing_parents_final.json', 'r', encoding='utf-8') as file:
    parsing_data = json.load(file)

# Create a mapping from sentence text to category. If the same sentence text occurs
# more than once in clauses_data, the last category wins.
sentence_to_category = {item['sentence']: item['category'] for item in clauses_data}

# Add category to each entry in the dependency parsing data
for entry in parsing_data:
    sentence = entry['sentence']
    # dict.get returns None for sentences that are missing from the mapping, which is
    # the same default the explicit if/else assigned before.
    entry['category'] = sentence_to_category.get(sentence)

# Save the updated data to a new JSON file in the current working directory
with open('dependency_parsing_parents_final_updated.json', 'w', encoding='utf-8') as file:
    json.dump(parsing_data, file, indent=4)

print("Categories added successfully.")