## 1. Entity Extraction

In [None]:
import json

# Input file for the entity-extraction step.
file_path = "outputs/normalized_output.json"

# Read the whole document into memory and parse it as `json_data`.
with open(file_path, encoding="utf-8") as f:
    json_data = json.loads(f.read())

In [None]:
import spacy

# Load the spaCy pipeline once at module level; extract_entities_from_value
# reuses this shared `nlp` object for every string it processes.
nlp = spacy.load("en_core_web_sm")

def extract_entities_from_value(value, field_name):
    """
    Collect named entities from an arbitrary JSON value.

    Dicts and lists are walked recursively; every other non-None value is
    converted with ``str()`` and passed through the module-level ``nlp``
    pipeline.

    Args:
        value: Any decoded JSON value (dict, list, scalar or None).
        field_name: Dotted path of the field the value came from. Dict keys
            are appended as ``"<field_name>.<key>"``; list items keep the
            path of the list itself.

    Returns:
        A list of ``{"entity", "label", "source_field"}`` dicts, empty when
        the value is None, blank, or contains no entities.
    """
    if value is None:
        return []

    # Nested object: descend into each member, extending the dotted path.
    if isinstance(value, dict):
        found = []
        for key, child in value.items():
            found += extract_entities_from_value(child, f"{field_name}.{key}")
        return found

    # Array: every element is attributed to the same field path.
    if isinstance(value, list):
        found = []
        for element in value:
            found += extract_entities_from_value(element, field_name)
        return found

    # Scalar: stringify and ignore values that are blank after trimming.
    text = str(value).strip()
    if not text:
        return []

    # Run spaCy NER and report each detected span.
    return [
        {
            "entity": span.text,
            "label": span.label_,
            "source_field": field_name
        }
        for span in nlp(text).ents
    ]


def extract_entities_from_json_record(record):
    """
    Extract entities from every top-level field of one record.

    Args:
        record: Mapping of field name to JSON value.

    Returns:
        A flat list of the entity dicts produced by
        ``extract_entities_from_value`` for each field, in field order.
    """
    return [
        entity
        for field, value in record.items()
        for entity in extract_entities_from_value(value, field)
    ]


def process_json_in_chunks(json_data, chunk_size=1000):
    """
    Yield consecutive slices of ``json_data`` of at most ``chunk_size`` items.

    Args:
        json_data: A sliceable sequence with a length.
        chunk_size: Maximum number of items per yielded slice.

    Yields:
        ``json_data[start:start + chunk_size]`` for each successive start
        offset; the final slice may be shorter.
    """
    total = len(json_data)
    starts = range(0, total, chunk_size)
    yield from (json_data[start:start + chunk_size] for start in starts)

In [None]:
final_output = []

# Walk the loaded records batch by batch and keep, for each record, its
# identifying fields together with the entities extracted from it.
for chunk in process_json_in_chunks(json_data):
    for record in chunk:
        entities = extract_entities_from_json_record(record)

        final_output.append(
            {
                # Missing keys come through as None via dict.get.
                **{key: record.get(key) for key in ("id", "source_name", "timestamp")},
                "entities": entities,
            }
        )

In [None]:
output_file = "outputs/entity_extracted.jsonl"

# Write JSON Lines: exactly one record object per line. The relationship
# extraction step reads this file line by line and calls json.loads on each
# line, so dumping the whole list as a single JSON array would hand it a list
# instead of a record dict. json.dumps escapes newlines inside strings, so
# every record is guaranteed to occupy a single physical line.
with open(output_file, "w", encoding="utf-8") as f:
    f.writelines(json.dumps(entry) + "\n" for entry in final_output)

## 2 & 3. Relationship Extraction and Triple Creation

In [None]:
import gc

# Entity file consumed by process_record_streaming, which reads it line by line.
INPUT_FILE = "outputs/entity_extracted.jsonl"
# Destination for the triples; one JSON line is written per record that yields any.
OUTPUT_FILE = "outputs/relationship_triples.jsonl"

def normalize(text):
    """
    Return ``text`` trimmed of surrounding whitespace and title-cased.

    Note that ``str.title()`` lowercases the remainder of every word, so an
    acronym such as ``"IBM"`` comes back as ``"Ibm"``.
    """
    trimmed = text.strip()
    return trimmed.title()

def _build_triples(entities):
    """
    Build the de-duplicated relationship triples for one record.

    Args:
        entities: Iterable of entity dicts carrying ``entity`` (text),
            ``label`` and ``source_field``. Items that are not dicts or have
            no string ``entity`` value are ignored.

    Returns:
        A list of ``(subject, predicate, object)`` tuples in first-seen order.
    """
    person_org = {"PERSON", "ORG"}
    places = ("GPE", "LOC")

    # Dict keys serve as an insertion-ordered set: duplicates collapse and the
    # output order is reproducible (plain set order of strings varies between
    # interpreter runs because of hash randomization).
    triples = {}

    norm_ents = []
    for e in entities:
        raw = e.get("entity") if isinstance(e, dict) else None
        if not isinstance(raw, str):
            # No usable text to normalize; skip instead of raising.
            continue
        n_name = normalize(raw)
        norm_ents.append((n_name, e.get("label", "")))

        # Entity -> source field relationship.
        field = e.get("source_field", "")
        if field:
            triples[(n_name, "MENTIONED_IN", field)] = None

    # Pairwise relationships; each unordered index pair is visited once.
    ent_len = len(norm_ents)
    for i, (subj, label1) in enumerate(norm_ents):
        for j in range(i + 1, ent_len):
            obj, label2 = norm_ents[j]

            # The same normalized name never relates to itself.
            if subj == obj:
                continue

            if {label1, label2} <= person_org:
                triples[(subj, "ASSOCIATED_WITH", obj)] = None
            elif label2 in places and label1 not in places:
                triples[(subj, "LOCATED_IN", obj)] = None
            elif label1 in places and label2 not in places:
                # Flip the order so it is always "thing LOCATED_IN place".
                triples[(obj, "LOCATED_IN", subj)] = None

    return list(triples)


def process_record_streaming(fin, fout):
    """
    Stream entity records from ``fin`` and write relationship triples to ``fout``.

    Each non-blank line of ``fin`` is parsed as JSON. A line may hold a single
    record object or an array of record objects. A record is expected to carry
    ``id``, ``source_name`` and ``entities``. For every record that yields at
    least one triple, one compact JSON line
    ``{"i": id, "s": source_name, "t": [[subject, predicate, object], ...]}``
    is written to ``fout``.

    Lines that are not valid JSON and values that are not objects are skipped;
    the number skipped is printed once at the end so the loss is visible.

    Args:
        fin: Iterable of text lines (for example an open text file).
        fout: Object with a ``write(str)`` method.

    Returns:
        The number of record objects processed (skipped items are not counted).
    """
    record_count = 0
    skipped = 0

    for line in fin:
        if not line.strip():
            # Blank lines (e.g. a trailing newline) are not an error.
            continue

        try:
            payload = json.loads(line)
        except json.JSONDecodeError:
            skipped += 1
            continue

        # Accept both one-object-per-line input and a whole JSON array on a line.
        records = payload if isinstance(payload, list) else [payload]

        for record in records:
            if not isinstance(record, dict):
                skipped += 1
                continue

            # `or []` also covers an explicit `"entities": null`.
            triples = _build_triples(record.get("entities") or [])

            if triples:
                fout.write(json.dumps({
                    "i": record.get("id"),
                    "s": record.get("source_name"),
                    "t": triples
                }, separators=(',', ':')) + "\n")

            record_count += 1
            if record_count % 10000 == 0:
                print(f"Processed {record_count} records...")
                # Periodic manual collection between batches of records.
                gc.collect()

    if skipped:
        print(f"Skipped {skipped} lines/items that were not valid JSON objects.")

    return record_count

# Automatic garbage collection is switched off for the duration of the run
# (process_record_streaming calls gc.collect() itself at intervals) and is
# always switched back on afterwards, even if processing fails.
gc.disable()
try:
    with open(INPUT_FILE, "r", encoding="utf-8") as fin:
        with open(OUTPUT_FILE, "w", encoding="utf-8") as fout:
            total = process_record_streaming(fin, fout)
            print(f"Success! Total records: {total}")
finally:
    gc.enable()
    gc.collect()