In [19]:
# Fetch the annotated archive from Google Drive by file ID; per the recorded output
# it is saved as /content/AnnotatedObsNamedDefined.zip.
!gdown 15DTR0bPvowQj7CqyQH3jCHuaPaZJaT45

Downloading...
From: https://drive.google.com/uc?id=15DTR0bPvowQj7CqyQH3jCHuaPaZJaT45
To: /content/AnnotatedObsNamedDefined.zip
  0% 0.00/4.66M [00:00<?, ?B/s]100% 4.66M/4.66M [00:00<00:00, 152MB/s]


In [20]:
import zipfile
import os
import json
import pandas as pd

# Function to extract zip file
def extract_zip(zip_path, output_folder):
    """Unpack every member of a zip archive into a folder and report where.

    Args:
        zip_path: Path of the zip archive to read.
        output_folder: Directory that receives the extracted members.
    """
    archive = zipfile.ZipFile(zip_path, mode='r')
    # The archive handle is closed on exit from the block, even if extraction fails.
    with archive:
        archive.extractall(path=output_folder)
    print(f"Extracted files to {output_folder}")

# Function to process JSON files
def process_json_files(
    directory,
    output_csv="/content/drive/Othercomputers/MBZUAI/MBZUAI/ADGM-Project/annotated_obligation_classification_data_pure_regulations.csv",
    random_state=None,
):
    """Build a shuffled text/label CSV from the annotated JSON files under a directory.

    Every ``*.json`` file found by walking ``directory`` (names starting with
    ``._`` are skipped) is parsed. Each record that has a "Passage" or "Text"
    key contributes one row: the text (preferring "Passage") and a label of
    "obligation" or "non-obligation". A file that cannot be read or parsed is
    reported and skipped. When a record of unexpected shape is met, the rest
    of that file is skipped; rows collected before it are kept.

    Args:
        directory: Root folder to search for JSON files.
        output_csv: Destination CSV path. Defaults to the project CSV on the
            mounted Drive.
        random_state: Seed forwarded to ``DataFrame.sample`` so the shuffle can
            be reproduced. ``None`` (the default) shuffles differently on
            each run.

    Returns:
        None. The result is written to ``output_csv``.
    """
    data = []

    # Traverse the directory tree to find JSON files
    for root, _, files in os.walk(directory):
        for filename in files:
            # Skip non-JSON files and hidden files (starting with '._')
            if not filename.endswith(".json") or filename.startswith("._"):
                continue

            filepath = os.path.join(root, filename)
            print(f"Processing file: {filepath}")
            try:
                with open(filepath, 'r', encoding='utf-8') as file:
                    json_data = json.load(file)
            except json.JSONDecodeError as e:
                print(f"JSON decode error in file {filename}: {e}")
                continue
            except FileNotFoundError as e:
                print(f"File not found: {e}")
                continue
            except PermissionError as e:
                print(f"Permission error: {e}")
                continue
            except Exception as e:
                print(f"Error processing file {filepath}: {e}")
                continue

            # A file holding one bare record is treated as a one-element list;
            # iterating a dict directly would walk its keys, not its records.
            records = json_data if isinstance(json_data, list) else [json_data]
            for item in records:
                # Non-dict items must be rejected first: `"Passage" in item`
                # on a string would be a substring test, not a key lookup.
                if not isinstance(item, dict) or not ("Passage" in item or "Text" in item):
                    print(f"File {filename} does not have the expected format.")
                    break  # Skip the rest of this file if format is incorrect
                text = item.get("Passage", item.get("Text"))
                data.append({"text": text, "label": _label_for_item(item)})

    # Fixing the columns keeps the header in the CSV even when no rows were
    # collected, so the file can still be read back.
    df = pd.DataFrame(data, columns=["text", "label"])

    # Shuffle the DataFrame (nothing to shuffle when it is empty)
    if not df.empty:
        df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    df.to_csv(output_csv, index=False)
    print(f"CSV file has been created as {output_csv}")


def _label_for_item(item):
    """Return "obligation" or "non-obligation" for one annotated record (a dict)."""
    # A non-empty "Obligations" value decides the label on its own.
    if "Obligations" in item and item["Obligations"]:
        return "obligation"
    # Otherwise any "ProcessedResponse" other than the heading marker or an
    # empty string counts as an obligation.
    if "ProcessedResponse" in item:
        if item["ProcessedResponse"] in ("Heading or Short Text: No analysis required.", ""):
            return "non-obligation"
        return "obligation"
    return "non-obligation"

# Define paths: the archive location is where the recorded gdown output saved the
# download; the extraction target is a separate folder under /content.
zip_path = "/content/AnnotatedObsNamedDefined.zip"
output_folder = "/content/extracted_files"

# Extract zip file into output_folder
extract_zip(zip_path, output_folder)

# Process JSON files: walks output_folder and writes the labelled CSV
process_json_files(output_folder)


Extracted files to /content/extracted_files
Processing file: /content/extracted_files/AnnotatedObsNamedDefined/AML_VER09.211223_obligations_named_entities_defined_terms.json
Processing file: /content/extracted_files/AnnotatedObsNamedDefined/FUNDS_VER08.040723_obligations_named_entities_defined_terms.json
Processing file: /content/extracted_files/AnnotatedObsNamedDefined/CONF_VER03.18042019_obligations_named_entities_defined_terms.json
Processing file: /content/extracted_files/AnnotatedObsNamedDefined/FSMR (Consolidated_December 2023)_obligations_named_entities_defined_terms.json
Processing file: /content/extracted_files/AnnotatedObsNamedDefined/GPM VER03.120623_obligations_named_entities_defined_terms.json
Processing file: /content/extracted_files/AnnotatedObsNamedDefined/CMC_VER03.270922_obligations_named_entities_defined_terms.json
Processing file: /content/extracted_files/AnnotatedObsNamedDefined/CRS Regulations 2017 (Consolidated_October 2023) v6_obligations_named_entities_defined_

In [22]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import re

# Download the NLTK data required for tokenization.
# Older NLTK releases load the tokenizer from 'punkt'; newer releases look for
# 'punkt_tab' instead, so fetch both to keep word_tokenize working on either.
nltk.download('punkt')
nltk.download('punkt_tab')

# Function to clean text
def clean_text(text):
    """Normalise whitespace and strip typographic marks and table/figure spans.

    Newlines and tabs are turned into single spaces. Curly quotes, en/em
    dashes, bullets and the left-to-right mark are deleted outright. Everything
    from "/Table Start" to the nearest "/Table End" (and likewise for
    "/Figure Start" ... "/Figure End") is then removed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # One pass over the string: map to ' ' to replace, map to None to delete.
    char_map = {
        ord('\n'): ' ',
        ord('\t'): ' ',
        ord('\u2019'): None,
        ord('\u2014'): None,
        ord('\u2013'): None,
        ord('\u200e'): None,
        ord('\u2018'): None,
        ord('\u2022'): None,
        ord('\u201c'): None,
        ord('\u201d'): None,
    }
    flattened = text.translate(char_map)

    table_span = re.compile(r'/Table Start.*?/Table End', flags=re.DOTALL)
    figure_span = re.compile(r'/Figure Start.*?/Figure End', flags=re.DOTALL)
    without_tables = table_span.sub('', flattened)
    return figure_span.sub('', without_tables)

# Function to remove empty values, short obligations, and long texts from CSV file
def clean_csv(output_file, max_word_count, min_obligation_words=7):
    """Clean a text/label CSV in place.

    Steps, in order:
      1. Drop rows with any missing value.
      2. Run ``clean_text`` over the ``text`` column.
      3. Drop rows whose cleaned text is empty or whitespace only.
      4. Drop "obligation" rows with fewer than ``min_obligation_words`` words
         or more than ``max_word_count`` words, and "non-obligation" rows with
         more than ``max_word_count`` words. Rows with any other label are
         kept. Words are counted with ``word_tokenize``.

    The result overwrites ``output_file``.

    Args:
        output_file: Path of the CSV to read and overwrite; it must have
            ``text`` and ``label`` columns.
        max_word_count: Largest word count kept for either label.
        min_obligation_words: Smallest word count kept for "obligation" rows.
    """
    # Read the CSV file into a DataFrame
    df = pd.read_csv(output_file)

    # Remove rows with any empty values. The explicit copy makes the column
    # assignment below write to an independent frame rather than to a
    # possible view of `df` (which triggers SettingWithCopyWarning).
    df_cleaned = df.dropna().copy()

    # Clean the text in each row
    df_cleaned['text'] = df_cleaned['text'].apply(clean_text)

    # Cleaning can reduce a text to nothing (e.g. a passage that was only a
    # table span). An empty string is written as an empty CSV field and read
    # back as NaN, so drop such rows here.
    df_cleaned = df_cleaned[df_cleaned['text'].astype(str).str.strip() != ""]

    # Tokenize each text once, then express both length rules as masks.
    word_counts = df_cleaned['text'].apply(lambda text: len(word_tokenize(text)))
    is_obligation = df_cleaned['label'] == "obligation"
    is_non_obligation = df_cleaned['label'] == "non-obligation"
    within_max = word_counts <= max_word_count
    keep = (
        (is_obligation & within_max & (word_counts >= min_obligation_words))
        | (is_non_obligation & within_max)
        | ~(is_obligation | is_non_obligation)
    )
    df_cleaned = df_cleaned[keep]

    # Save the cleaned DataFrame back to the CSV file
    df_cleaned.to_csv(output_file, index=False)

    print(f"Cleaned CSV file has been saved as {output_file}")

# Function to calculate and print statistics from CSV file
def calculate_statistics_from_csv(output_file):
    """Print how many rows of a CSV carry each of the two class labels.

    Args:
        output_file: Path of a CSV file with a ``label`` column.
    """
    labels = pd.read_csv(output_file)['label']

    # Summing a boolean mask counts the rows where it is True.
    obligation_count = int((labels == 'obligation').sum())
    non_obligation_count = int((labels == 'non-obligation').sum())

    print(f"Number of obligation entries: {obligation_count}")
    print(f"Number of non-obligation entries: {non_obligation_count}")

# Function to calculate and print average word count
def calculate_average_word_count(output_file):
    """Print the mean word count per label and overall for a text/label CSV.

    Words are counted with ``word_tokenize``. Rows whose ``text`` is missing
    are left out of every average.

    Args:
        output_file: Path of a CSV file with ``text`` and ``label`` columns.
    """
    df = pd.read_csv(output_file)

    # An empty CSV field is read back as NaN, which word_tokenize cannot
    # process; such rows have no text to count, so skip them.
    df = df.dropna(subset=['text'])

    # assign() returns a new frame instead of adding a column to the filtered one.
    df = df.assign(
        word_count=df['text'].astype(str).apply(lambda text: len(word_tokenize(text))).astype(int)
    )

    obligation_word_count = df[df['label'] == 'obligation']['word_count'].mean()
    non_obligation_word_count = df[df['label'] == 'non-obligation']['word_count'].mean()
    overall_word_count = df['word_count'].mean()

    print(f"Average word count for obligation entries: {obligation_word_count}")
    print(f"Average word count for non-obligation entries: {non_obligation_word_count}")
    print(f"Overall average word count: {overall_word_count}")

# Define the path to the CSV file (the same path process_json_files writes its output to)
output_file = "/content/drive/Othercomputers/MBZUAI/MBZUAI/ADGM-Project/annotated_obligation_classification_data_pure_regulations.csv"

# Clean the CSV file by removing empty values, short obligations, and long texts.
# NOTE: clean_csv overwrites output_file, so re-running this cell operates on the
# already-cleaned file rather than on the original output.
clean_csv(output_file, max_word_count=100)

# Calculate and print statistics from the cleaned CSV
calculate_statistics_from_csv(output_file)

# Calculate and print average word count
#calculate_average_word_count(output_file)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Cleaned CSV file has been saved as /content/drive/Othercomputers/MBZUAI/MBZUAI/ADGM-Project/annotated_obligation_classification_data_pure_regulations.csv
Number of obligation entries: 5459
Number of non-obligation entries: 1807


In [23]:
# Label counts for obligation_dataset.csv; the recorded output shows it is heavily
# skewed toward "obligation".
calculate_statistics_from_csv('/content/drive/Othercomputers/MBZUAI/MBZUAI/ADGM-Project/obligation_dataset.csv')

Number of obligation entries: 51708
Number of non-obligation entries: 638


In [24]:
# Label counts for the extended dataset; in the recorded outputs its obligation
# count equals that of obligation_dataset.csv and only the non-obligation count is larger.
calculate_statistics_from_csv('/content/drive/Othercomputers/MBZUAI/MBZUAI/ADGM-Project/extended_obligation_classification_dataset.csv')

Number of obligation entries: 51708
Number of non-obligation entries: 37915


In [25]:
# Label counts for the augmented pure-regulations CSV; in the recorded outputs its
# obligation count equals that of the cleaned pure-regulations CSV, with more non-obligation rows.
calculate_statistics_from_csv('/content/drive/Othercomputers/MBZUAI/MBZUAI/ADGM-Project/annotated_obligation_classification_data_pure_regulations_augmented.csv')

Number of obligation entries: 5459
Number of non-obligation entries: 5535
