In [23]:
import json
import re

def extract_year(date_str):
    """Return the first standalone 4-digit number found in ``date_str``.

    "Standalone" means a run of exactly four digits delimited by word
    boundaries, so ``"12345"`` and ``"a2021"`` do not match.  No range check
    is applied: any four digits (even ``"0000"``) are accepted as a year.

    Args:
        date_str: Text that may contain a year, e.g. ``"March 2021"``.

    Returns:
        The matched digits as a string, or ``"Unknown"`` when no such number
        is present or ``date_str`` is not a string (e.g. ``None``).
    """
    # re.search raises TypeError on non-string input; treat that case the
    # same as "no year found" so callers get the documented fallback.
    if not isinstance(date_str, str):
        return "Unknown"
    match = re.search(r'\b\d{4}\b', date_str)
    return match.group(0) if match else "Unknown"

def load_data(file_path):
    """Read a JSON Lines file and tag every DATE mention with its year.

    Each non-blank line is parsed as one JSON value.  When that value is an
    object with a ``mentions`` list, every mention whose ``ne_type`` is
    ``'DATE'`` gets a ``formatted_date`` key holding the year extracted from
    its ``ne_span`` (``"Unknown"`` when the span is missing, not a string, or
    contains no year).  All other content is passed through untouched.

    Args:
        file_path: Path of the JSON Lines file to read (decoded as UTF-8).

    Returns:
        A list of JSON strings, one per non-blank input line in input order,
        re-serialized with ``json.dumps`` after the update.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    updated_lines = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Blank lines (typically a trailing newline at end of file) are
            # not records; json.loads would raise on them.
            if not line.strip():
                continue
            obj = json.loads(line)
            mentions = obj.get('mentions') if isinstance(obj, dict) else None
            # `or ()` also covers an explicit JSON null for "mentions".
            for mention in mentions or ():
                if not isinstance(mention, dict) or mention.get('ne_type') != 'DATE':
                    continue
                # A DATE mention without a usable span must not abort the
                # whole run; fall back to the same marker extract_year uses.
                span = mention.get('ne_span')
                mention['formatted_date'] = (
                    extract_year(span) if isinstance(span, str) else "Unknown"
                )
            updated_lines.append(json.dumps(obj))
    return updated_lines

def save_data(updated_lines, file_path):
    """Write every entry of ``updated_lines`` to ``file_path``, one per line.

    The target is opened in write mode, so existing content is replaced.
    Each entry is followed by a newline.  Returns ``file_path`` unchanged.
    """
    with open(file_path, 'w') as sink:
        sink.writelines(f"{entry}\n" for entry in updated_lines)
    return file_path

# Destination and source JSONL files -- adjust both to your local setup.
output_file_path = "/Users/thebekhruz/Desktop/100Days-Of-Code/100-Days-of-NLP-Odyssey/data/raw/data_formatted_date.jsonl"
input_file_path = "/Users/thebekhruz/Desktop/100Days-Of-Code/100-Days-of-NLP-Odyssey/data/raw/sample_2k_swop_output.jsonl"

# Add formatted_date to the DATE mentions, then write the records out.
# save_data returns the path it wrote to, which is reported below.
updated_lines = load_data(
    input_file_path,
)
output_file_path = save_data(
    updated_lines,
    output_file_path,
)

print(f"Data processed and saved to {output_file_path}")


Data processed and saved to /Users/thebekhruz/Desktop/100Days-Of-Code/100-Days-of-NLP-Odyssey/data/raw/data_formatted_date.jsonl
