# Process Causal Datasets: ATOMIC and Causal News Corpus


In [None]:
import pandas as pd
import json
import os
import ast # For safely evaluating string literals like lists/dicts
import re

# Base directory that every input and output path below is resolved against.
# A notebook kernel does not define `__file__`, and the previous expression
# os.path.dirname(os.path.abspath('__file__')) passed the *string* '__file__',
# which only ever resolved to the current working directory. Say so explicitly:
# the notebook must be run with the project folder as its working directory.
base_path = os.getcwd()

# Input files: the aggregated ATOMIC CSV and the Causal News Corpus V2 splits
# (subtask 1 = sentence-level labels, subtask 2 = grouped span annotations).
atomic_file = os.path.join(base_path, 'Atomic Data', 'v4_atomic_all_agg.csv')
cnc_st1_train_file = os.path.join(base_path, 'CausalNewsCorpus-master', 'data', 'V2', 'train_subtask1.csv')
cnc_st1_dev_file = os.path.join(base_path, 'CausalNewsCorpus-master', 'data', 'V2', 'dev_subtask1.csv')
cnc_st2_train_file = os.path.join(base_path, 'CausalNewsCorpus-master', 'data', 'V2', 'train_subtask2_grouped.csv')
cnc_st2_dev_file = os.path.join(base_path, 'CausalNewsCorpus-master', 'data', 'V2', 'dev_subtask2_grouped.csv')

# Output files, written next to the inputs' base directory.
output_atomic_csv = os.path.join(base_path, 'atomic_causal_pairs.csv')
output_cnc_csv = os.path.join(base_path, 'cnc_causal_pairs.csv')

# Echo every resolved path so a wrong working directory is obvious up front.
print(f"Atomic input: {atomic_file}")
print(f"CNC ST1 Train input: {cnc_st1_train_file}")
print(f"CNC ST1 Dev input: {cnc_st1_dev_file}")
print(f"CNC ST2 Train input: {cnc_st2_train_file}")
print(f"CNC ST2 Dev input: {cnc_st2_dev_file}")
print(f"Atomic output: {output_atomic_csv}")
print(f"CNC output: {output_cnc_csv}")

## 1. Process ATOMIC Dataset

In [None]:
import random
# Pool of first names substituted for the PersonX / PersonY placeholders.
names = [
    'Alice', 'Bob', 'Charlie', 'Diana', 'Eve', 'Frank', 'Grace', 'Heidi',
    'Ivan', 'Judy', 'Kevin', 'Laura', 'Mallory', 'Niaj', 'Olivia', 'Peggy',
    'Quentin', 'Rita', 'Sybil', 'Trent', 'Uma', 'Victor', 'Wendy', 'Xavier',
    'Yasmine', 'Zane',
    'Aaron', 'Bianca', 'Carter', 'Delia', 'Ethan', 'Fiona', 'Gavin', 'Holly',
    'Isabel', 'Jack', 'Kara', 'Liam', 'Mona', 'Nolan', 'Oscar', 'Paula',
    'Quinn', 'Ralph', 'Sophie', 'Tina', 'Ursula', 'Vince', 'Will', 'Xena',
    'Yuri', 'Zoey',
    'Amber', 'Blake', 'Cleo', 'Derek', 'Elsa', 'Felix', 'Gina', 'Hank',
    'Ivy', 'Jonas', 'Kelsey', 'Lars', 'Mira', 'Nate', 'Opal', 'Perry',
    'Quincy', 'Rosa', 'Sam', 'Tara', 'Ulric', 'Vera', 'Wade', 'Ximena',
    'Yosef', 'Zelda',
    'Aiden', 'Brielle', 'Colin', 'Daisy', 'Emil', 'Freya', 'Gage', 'Hazel',
    'Iris', 'Jasper', 'Kian', 'Lila', 'Myles', 'Nina', 'Omar', 'Pia',
    'Rex', 'Sage', 'Toby', 'Una', 'Violet', 'Wyatt',
]

# Pool of nouns substituted for the '___' blank placeholder.
# NOTE: 'plan' is listed twice, so a uniform random pick draws it twice as
# often as any other entry.
fillers = [
    'computer', 'plan', 'idea', 'project', 'device', 'solution', 'resource',
    'tool', 'book', 'car',
    'phone', 'strategy', 'document', 'report', 'presentation', 'email',
    'message', 'gift', 'letter', 'assignment',
    'recipe', 'map', 'key', 'password', 'ticket', 'invitation', 'contract',
    'agreement', 'proposal', 'schedule',
    'appointment', 'meeting', 'task', 'goal', 'challenge', 'opportunity',
    'problem', 'question', 'answer', 'result',
    'experiment', 'test', 'survey', 'analysis', 'review', 'summary',
    'outline', 'draft', 'plan', 'blueprint',
    'formula', 'method', 'procedure', 'routine', 'habit', 'custom',
    'tradition', 'rule', 'law', 'policy',
    'principle', 'value', 'belief', 'dream', 'wish', 'hope', 'ambition',
    'desire', 'intention', 'purpose',
    'mission', 'vision', 'target', 'objective', 'aim', 'priority', 'focus',
    'theme', 'topic', 'subject',
    'concept', 'notion', 'thought', 'memory', 'experience', 'event',
    'incident', 'accident', 'story', 'tale',
    'joke', 'anecdote', 'remark', 'comment', 'statement', 'declaration',
    'announcement', 'notice', 'warning', 'alert',
]

# Read the aggregated ATOMIC CSV. A missing file is reported and leaves
# `atomic_df` as None so the processing step below can be skipped.
try:
    atomic_df = pd.read_csv(atomic_file)
except FileNotFoundError:
    print(f"Error: ATOMIC file not found at {atomic_file}")
    atomic_df = None
else:
    print(f"Loaded ATOMIC data: {atomic_df.shape[0]} rows")

def safe_json_loads(x):
    """Parse a stringified list cell into a Python list.

    Args:
        x: A cell value. Lists are returned unchanged; strings that look like
            a bracketed list (after trimming surrounding whitespace) are
            parsed, first as a Python literal and then as JSON.

    Returns:
        The parsed list, or an empty list when the value is not a list-like
        string, cannot be parsed, or parses to something other than a list.
    """
    if isinstance(x, list):
        return x
    if not isinstance(x, str):
        return []

    text = x.strip()
    if not (text.startswith('[') and text.endswith(']')):
        return []

    try:
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Not a valid Python literal (e.g. JSON `null`/`true`); try JSON next.
        try:
            parsed = json.loads(text)
        except (ValueError, RecursionError):  # JSONDecodeError is a ValueError
            return []
    return parsed if isinstance(parsed, list) else []


def _fill_placeholders(text, name_x, name_y, filler):
    """Replace the PersonX / PersonY / '___' placeholders in `text`."""
    return text.replace('PersonX', name_x).replace('PersonY', name_y).replace('___', filler)


if atomic_df is not None:
    # Columns containing effect annotations
    effect_columns = ['oEffect', 'xEffect']

    # Parse the stringified lists in the effect columns (and in 'prefix', when
    # present) in place.
    for col in effect_columns:
        if col in atomic_df.columns:
            atomic_df[col] = atomic_df[col].apply(safe_json_loads)
    if 'prefix' in atomic_df.columns:
        atomic_df['prefix'] = atomic_df['prefix'].apply(safe_json_loads)

    present_effect_columns = [col for col in effect_columns if col in atomic_df.columns]

    atomic_processed = []
    skipped_events = 0

    # Plain tuples of (event, <effect list per present column>...) are enough
    # here and avoid building a Series per row.
    rows = atomic_df[['event', *present_effect_columns]].itertuples(index=False, name=None)
    for raw_event, *effect_lists in rows:
        # A missing event is read as NaN (a float); there is nothing to fill in.
        if not isinstance(raw_event, str):
            skipped_events += 1
            continue

        # One pair of names (two different positions of `names`) and one filler
        # per row, shared by the cause and all of its effects so they stay
        # consistent with each other.
        nameX, nameY = random.sample(names, 2)
        filler = random.choice(fillers)
        cause = _fill_placeholders(raw_event, nameX, nameY, filler)

        for effects in effect_lists:
            if not isinstance(effects, list):
                continue
            # Drop blanks and the literal "none" annotation (ignoring case and
            # surrounding whitespace).
            valid_effects = [
                e for e in effects
                if isinstance(e, str) and e.strip() and e.strip().lower() != 'none'
            ]
            # dict.fromkeys de-duplicates while keeping first-seen order, so
            # the output order does not depend on string hashing.
            for effect in dict.fromkeys(valid_effects):
                atomic_processed.append({
                    'cause': cause,
                    'effect': _fill_placeholders(effect, nameX, nameY, filler)
                })

    if skipped_events:
        print(f"Skipped {skipped_events} ATOMIC rows without a text event")

    # Create final DataFrame with only cause and effect
    atomic_final_df = pd.DataFrame(atomic_processed, columns=['cause', 'effect'])

    # Save to CSV
    atomic_final_df.to_csv(output_atomic_csv, index=False)
    print(f"Saved enriched ATOMIC data to {output_atomic_csv}: {atomic_final_df.shape[0]} pairs")
    display(atomic_final_df.head())

## 2. Process Causal News Corpus (CNC) Dataset

In [None]:
# Read the four CNC V2 splits (first CSV column is used as the index). If any
# file is missing, report it and set both combined frames to None so the
# processing step below is skipped.
try:
  cnc_st1_train = pd.read_csv(cnc_st1_train_file, index_col=0)
  cnc_st1_dev = pd.read_csv(cnc_st1_dev_file, index_col=0)
  cnc_st2_train = pd.read_csv(cnc_st2_train_file, index_col=0)
  cnc_st2_dev = pd.read_csv(cnc_st2_dev_file, index_col=0)
except FileNotFoundError as e:
  print(f"Error loading CNC files: {e}")
  cnc_st1_df = None
  cnc_st2_df = None
else:
  # Stack train on top of dev for each subtask.
  cnc_st1_df = pd.concat([cnc_st1_train, cnc_st1_dev])
  cnc_st2_df = pd.concat([cnc_st2_train, cnc_st2_dev])
  print(f"Loaded CNC ST1 data: {len(cnc_st1_df)} rows")
  print(f"Loaded CNC ST2 data: {len(cnc_st2_df)} rows")

# Compiled once instead of on every call inside the row loop.
_ARG0_PATTERN = re.compile(r'<ARG0>(.*?)</ARG0>')
_ARG1_PATTERN = re.compile(r'<ARG1>(.*?)</ARG1>')


def extract_cause_effect(span_str):
  """Extract the cause and effect spans from a tagged string.

  Args:
    span_str: Text in which the cause is wrapped in <ARG0>...</ARG0> and the
      effect in <ARG1>...</ARG1>.

  Returns:
    A (cause, effect) tuple with the whitespace-trimmed text of the first
    <ARG0> span and the first <ARG1> span; either element is None when its
    tag pair is not found.
  """
  # NOTE(review): any other markup nested inside a span (e.g. signal tags) is
  # kept verbatim in the extracted text - confirm whether it should be removed.
  cause_match = _ARG0_PATTERN.search(span_str)
  effect_match = _ARG1_PATTERN.search(span_str)
  cause = cause_match.group(1).strip() if cause_match else None
  effect = effect_match.group(1).strip() if effect_match else None
  return cause, effect


if cnc_st1_df is not None and cnc_st2_df is not None:
  cnc_processed = []
  # Sentences already present in `cnc_processed`; gives an O(1) "already
  # added?" check instead of rescanning every record for each ST1 row.
  seen_sentences = set()
  unparsable_rows = 0

  # Process Subtask 2 (Causal sentences with spans)
  for sentence, pairs_str in zip(cnc_st2_df['text'], cnc_st2_df['causal_text_w_pairs']):
    try:
      # Safely evaluate the string representation of the list
      pairs_list = ast.literal_eval(pairs_str)
    except (ValueError, SyntaxError, TypeError):
      # Best effort: skip the row, but keep count so the loss is visible.
      unparsable_rows += 1
      continue
    if not isinstance(pairs_list, list):
      continue
    for span_str in pairs_list:
      if not isinstance(span_str, str):
        continue
      cause, effect = extract_cause_effect(span_str)
      if cause and effect:
        cnc_processed.append({
          'sentence': sentence,
          'cause': cause,
          'effect': effect,
          'is_causal': 1
        })
        seen_sentences.add(sentence)

  if unparsable_rows:
    print(f"Skipped {unparsable_rows} CNC ST2 rows with unparsable causal_text_w_pairs")

  # Process Subtask 1 (Add non-causal sentences as negative examples)
  non_causal_st1 = cnc_st1_df[cnc_st1_df['label'] == 0]
  for text in non_causal_st1['text']:
    # Skip sentences that already produced a record, either as a causal ST2
    # sentence or as an earlier non-causal ST1 row.
    if text in seen_sentences:
      continue
    cnc_processed.append({
      'sentence': text,
      'cause': None, # No cause span for non-causal
      'effect': None, # No effect span for non-causal
      'is_causal': 0
    })
    seen_sentences.add(text)

  # Create final DataFrame; explicit columns keep the CSV header stable even
  # when no record was produced.
  cnc_final_df = pd.DataFrame(cnc_processed, columns=['sentence', 'cause', 'effect', 'is_causal'])

  # Remove rows that are identical in every column
  cnc_final_df = cnc_final_df.drop_duplicates()

  # Save to CSV
  cnc_final_df.to_csv(output_cnc_csv, index=False)
  print(f"Saved processed CNC data to {output_cnc_csv}: {cnc_final_df.shape[0]} entries")
  display(cnc_final_df.head())

In [None]:
# Read each generated CSV back from disk and show its first rows; a file that
# was never written is reported instead of raising.
for saved_csv in (output_atomic_csv, output_cnc_csv):
  print(f"--- First 5 rows of {saved_csv} ---")
  try:
    saved_frame = pd.read_csv(saved_csv)
    display(saved_frame.head())
  except FileNotFoundError:
    print("File not found.")