## Data Cleaning 

Cleans news data represented in JSON format

In [1]:
import json
import os
import json
import gzip
import shutil

In [2]:
def check_json_files_in_folder(folder_path, verbose=False):
    '''Report how many ``.json`` files in a folder fail to parse.

    Every directory entry whose name ends in ``.json`` is opened and parsed.
    A file counts as corrupted when parsing raises ``json.JSONDecodeError``
    or when any other error occurs while opening or reading it.

    Args:
        folder_path: Directory to scan (not recursive).
        verbose: When True, also print one ``File: ..., Status: ...`` line
            per checked file before the summary.

    Prints the number of ``.json`` files checked, followed by the number of
    corrupted ones. Returns nothing.
    '''
    results = []
    corrupted_count = 0

    for filename in os.listdir(folder_path):
        if not filename.endswith('.json'):
            continue

        file_path = os.path.join(folder_path, filename)
        try:
            # Decode as UTF-8 explicitly: with the platform default encoding a
            # perfectly valid UTF-8 file can be reported as an error.
            with open(file_path, 'r', encoding='utf-8') as file:
                json.load(file)
        except json.JSONDecodeError:
            results.append((filename, 'Corrupted JSON'))
            corrupted_count += 1
        except Exception as e:
            # Unreadable for another reason (bad bytes, permissions, ...).
            results.append((filename, f'Error: {str(e)}'))
            corrupted_count += 1
        else:
            results.append((filename, 'Valid JSON'))

    if verbose:
        for name, status in results:
            print(f'File: {name}, Status: {status}')

    print(len(results))
    print(f'corrupted  {corrupted_count}')

In [3]:
def check_caption(folder_path=''):
    """Count the ``.json`` files in a folder that carry a non-empty caption.

    A file is counted when it parses to a JSON object whose ``cap`` field is
    present and non-empty. Files that lack the field, or whose ``cap`` is
    empty, are simply not counted. Files that cannot be read or parsed are
    tallied separately.

    Args:
        folder_path: Directory to scan (not recursive). The default of ``''``
            is only a placeholder; ``os.listdir('')`` raises
            ``FileNotFoundError``, so pass the real directory.

    Prints ``caption exist <count>``, followed by ``unreadable <n>`` only when
    at least one file could not be read. Returns nothing.
    """
    count = 0
    unreadable = 0

    for filename in os.listdir(folder_path):
        if not filename.endswith('.json'):
            continue

        file_path = os.path.join(folder_path, filename)
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
        except (OSError, ValueError):
            # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
            unreadable += 1
            continue

        # .get() so that a missing 'cap' key means "no caption", not an error.
        if isinstance(data, dict) and data.get('cap'):
            count += 1

    print('caption exist', count)
    if unreadable:
        print('unreadable', unreadable)
    

Additional captioning for news article images: some JSON files lack the `cap` (caption) field even though the source articles do have image captions, so the missing captions are scraped from each article's URL.

In [10]:
from bs4 import BeautifulSoup 
import requests
import random
import time 

def scrape_image_captions(url, timeout=30):
    """Download an article page and return the text of its ``<figcaption>`` tags.

    For each ``<figcaption>`` the text is taken from the first of these that
    exists: a ``<span class="caption-text">``, any ``<span>``, a ``<p>``, or
    the ``<figcaption>`` element itself. Captions that are empty after
    stripping whitespace are dropped.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server before the request is
            abandoned. Without a timeout a stalled server would block the
            caller indefinitely.

    Returns:
        A list of caption strings (possibly empty) on success, or ``None``
        when the request or the parsing failed. The error is printed in that
        case rather than raised.
    """
    headers = {
        'Accept-Encoding': 'identity',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()

        # requests transparently decodes gzip/deflate bodies, so
        # response.content is already plain HTML even when the server answers
        # with "Content-Encoding: gzip". Decompressing it a second time would
        # fail on the already-decoded bytes.
        soup = BeautifulSoup(response.content, 'html.parser')

        captions = []
        for figcaption in soup.find_all('figcaption'):
            # First matching element wins; fall back to the figcaption itself.
            source = (
                figcaption.find('span', class_='caption-text')
                or figcaption.find('span')
                or figcaption.find('p')
                or figcaption
            )
            text = source.text.strip()
            # Skip blanks so that [''] is never mistaken for a real caption.
            if text:
                captions.append(text)

        return captions

    except requests.exceptions.RequestException as e:
        print(f"URL error: {e}")
        return None
    except Exception as e:
        # Best-effort scraper: report anything unexpected and keep going.
        print(f"unknown: {e}")
        return None

def process_json_files(directory_path, batch_seconds=120, pause_seconds=420):
    """Scrape and store captions for article JSON files that lack them.

    Every ``.json`` file in ``directory_path`` is visited once, in random
    order. When a file's ``cap`` field is missing or empty, its ``URL`` is
    passed to ``scrape_image_captions``; if captions come back they are saved
    into the file's ``cap`` field. Work is done in time-boxed batches
    separated by pauses because the number of URL requests is limited.

    Args:
        directory_path: Folder containing the article JSON files.
        batch_seconds: Length of one working window in seconds (must be
            positive, otherwise no file is ever processed).
        pause_seconds: Sleep between two working windows, in seconds.

    Returns:
        The number of files whose ``cap`` field was filled in.
    """
    captioned_count = 0
    json_files = [f for f in os.listdir(directory_path) if f.endswith('.json')]
    random.shuffle(json_files)  # Randomize since limited url requests and top files would waste API call count

    while json_files:
        start_time = time.time()

        while json_files and time.time() - start_time < batch_seconds:
            filename = json_files.pop(0)
            file_path = os.path.join(directory_path, filename)

            # Read and close the file before any network access so the handle
            # is not held open during a slow request, and so one unreadable
            # file does not abort the whole run.
            try:
                with open(file_path, 'r', encoding='utf-8') as file:
                    data = json.load(file)
            except (OSError, ValueError) as e:
                # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
                print(f"Skipped {filename}: {e}")
                continue
            if not isinstance(data, dict):
                print(f"Skipped {filename}: top-level JSON value is not an object")
                continue

            if data.get('cap'):
                continue  # already captioned

            url = data.get('URL')
            new_caption = scrape_image_captions(url) if url else None
            if not new_caption:
                print("None", url)
                continue

            data['cap'] = new_caption
            with open(file_path, 'w', encoding='utf-8') as file:
                json.dump(data, file, indent=4)
            captioned_count += 1
            print(f"Updated caption for {filename}")

        # Only pause when work remains; sleeping after the last file would
        # just delay the return.
        if json_files:
            print(f"sleep for {pause_seconds / 60:g} minutes...")
            time.sleep(pause_seconds)

    return captioned_count

# Directory holding the article JSON files to caption. It is an empty
# placeholder here: os.listdir('') raises FileNotFoundError, so set a real
# path before running this cell.
folder_path = ''

# Fill in missing captions in place and report how many files were updated.
captioned_count = process_json_files(folder_path)
print(f"Total updated: {captioned_count}")


None https://www.todayonline.com/singapore/2-weeks-jail-man-who-coughed-colleagues-after-testing-positive-covid-19-2257406
None https://straitstimes.com/singapore/subscriber-picks-is-the-pap-brand-in-trouble-relying-on-dr-google-may-do-you-more-harm-than-good
None https://straitstimes.com/opinion/st-editorial/new-frontiers-in-relations-with-vietnam
None https://straitstimes.com/singapore/courts-crime/jail-cane-for-man-who-slapped-and-sexually-abused-stepdaughter-12
None https://straitstimes.com/opinion/forum/forum-when-lee-kuan-yew-visited-my-father
None https://straitstimes.com/singapore/courts-crime/machine-operator-stole-286k-worth-of-gold-from-employer-over-two-years
None https://www.todayonline.com/8days/tom-and-jerry-and-9-more-tv-shows-and-movies-feature-cameos-singapore-2229401
None https://straitstimes.com/singapore/courts-crime/jail-and-caning-for-jealous-man-who-raped-beat-up-teen-mother-of-his-child
None https://straitstimes.com/opinion/forum/forum-do-more-to-mitigate-any-r

In [12]:
def shift(dest_directory, folder_path):
    '''
    Move every captioned ``.json`` file from ``folder_path`` to ``dest_directory``.

    A file is treated as captioned when it parses to a JSON object with a
    non-empty ``cap`` field, the same rule the captioning step uses to decide
    whether a caption is still missing. Files that cannot be read or parsed
    are reported and left in place.

    Args:
        dest_directory: Folder that receives the captioned files. It is
            created when it does not exist yet.
        folder_path: Folder to scan (not recursive).

    Prints ``Moved: <name>`` for each moved file and finally the number of
    files moved. Returns nothing.
    '''
    count = 0
    os.makedirs(dest_directory, exist_ok=True)

    for filename in os.listdir(folder_path):
        if not filename.endswith('.json'):
            continue

        file_path = os.path.join(folder_path, filename)
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"Skipped {filename}: {e}")
            continue

        # Move only after the with-block has closed the handle: moving a file
        # that is still open fails on Windows.
        if isinstance(data, dict) and data.get('cap'):
            shutil.move(file_path, os.path.join(dest_directory, filename))
            count += 1
            print(f"Moved: {filename}")

    print(count)
    
# Destination and source folders for the move. Both are empty placeholders
# here and need real paths before this cell is run.
dest = ''
folder_path = ''
# Move the captioned JSON files from folder_path into dest.
shift(dest, folder_path)

0


Format the JSON article data as plain-text files for input to the GraphRAG pipeline.

In [1]:
import json
import os
from pathlib import Path

def read_json_file(file_path):
    """Load the UTF-8 encoded JSON document at ``file_path`` and return it."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        raw_text = handle.read()
    # Parse after the handle is closed; the result is the decoded JSON value.
    return json.loads(raw_text)

def write_article_to_text(article, folder_path, index):
    """Write one article as ``article<index>.txt`` inside ``folder_path``.

    The folder is created when missing. The file holds the article's
    ``TITLE`` and ``PUBLISHTIME``, then every item of ``TEXT`` followed by a
    blank line, then every item of ``IMAGE`` on its own line.
    """
    os.makedirs(folder_path, exist_ok=True)
    destination = os.path.join(folder_path, f"article{index}.txt")

    with open(destination, 'w', encoding='utf-8') as out:
        # Header fields.
        out.write(f"Title: {article['TITLE']}\n\n")
        out.write(f"Publish Time: {article['PUBLISHTIME']}\n\n")

        # Body paragraphs, each followed by an empty line.
        out.write("Text Body:\n")
        out.writelines(f"{paragraph}\n\n" for paragraph in article['TEXT'])

        # Image entries, one per line.
        out.write("\nImages Associated with the Article:\n")
        out.writelines(f"{image}\n" for image in article['IMAGE'])

def process_all_json_files(source_folder, target_folder):
    """Convert every ``.json`` article in ``source_folder`` to a text file.

    Each file is loaded with ``read_json_file`` and handed to
    ``write_article_to_text`` together with ``target_folder`` and a running
    index that starts at 1.

    Args:
        source_folder: Directory containing the article JSON files.
        target_folder: Directory passed on as the output location.
    """
    # os.listdir() returns names in arbitrary order. Sorting keeps the
    # index assigned to each article the same from one run to the next.
    json_files = sorted(f for f in os.listdir(source_folder) if f.endswith('.json'))

    for index, json_file in enumerate(json_files, start=1):
        file_path = os.path.join(source_folder, json_file)
        article = read_json_file(file_path)
        write_article_to_text(article, target_folder, index)


# Input folder of article JSON files and output folder for the generated
# text files.
# NOTE(review): both are absolute paths under one user's home directory;
# change them before running this cell on another machine.
source_folder = '/home/tjustin/ragtest/data/news_PE_subset' 
target_folder = '/home/tjustin/ragtest/input' 

# Convert every JSON article in source_folder into a text file in target_folder.
process_all_json_files(source_folder, target_folder)
