In [1]:
%pip install flair
%pip install tqdm
!pip install ipywidgets --upgrade
!jupyter nbextension enable --py widgetsnbextension --sys-prefix
!jupyter nbextension install --py widgetsnbextension --sys-prefix

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
usage: jupyter [-h] [--version] [--config-dir] [--data-dir] [--runtime-dir]
               [--paths] [--json] [--debug]
               [subcommand]

Jupyter: Interactive Computing

positional arguments:
  subcommand     the subcommand to launch

options:
  -h, --help     show this help message and exit
  --version      show the versions of core jupyter packages and exit
  --config-dir   show Jupyter config dir
  --data-dir     show Jupyter data dir
  --runtime-dir  show Jupyter runtime dir
  --paths        show all Jupyter paths. Add --json for machine-readable
                 format.
  --json         output paths as machine-readable json
  --debug        output debug information about paths

Available subcommands: console dejavu events execute kernel kernelspec lab
labextension labhub migrate nbconvert notebook run server troubleshoo

In [2]:
from tqdm.notebook import tqdm
import time
from flair.data import Sentence
from flair.models import SequenceTagger
import os
import json
import time
from flair.data import Sentence

In [1]:
import os
import json
import time
from flair.data import Sentence

# Define the base directory and input files with the correct relative paths
base_input_dir = '../data/cleaned'
input_files = ['cleaned_biden_data.json',
               'cleaned_obama_data.json',
               'cleaned_trump_data.json']  # Adjust these paths based on your file locations

output_dir = '../flair/tokenized/'  # Directory to save tokenized files

# Create output directory if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Function to tokenize text using Flair
def tokenize_text(text):
    sentence = Sentence(text)
    tokens = [token.text for token in sentence]
    return tokens

# Add logging function to see the progress
def log_message(message):
    print(f"log: {message}")

# Process each file and tokenize with logging and immediate writes
for input_file in input_files:
    input_path = os.path.join(base_input_dir, input_file)
    output_file = os.path.join(output_dir, input_file)

    # Check if the input file exists
    if not os.path.exists(input_path):
        log_message(f"File not found: {input_path}")
        continue

    # Open the input file and check that it's reading properly
    try:
        start_time = time.time()
        log_message(f"Started processing {input_file}...")

        with open(input_path, 'r') as infile:
            log_message(f"Opened {input_file} for reading")
            data = json.load(infile)

            # Open output file to write tokens
            with open(output_file, 'w') as outfile:
                log_message(f"Opened {output_file} for writing")

                tokenized_data = []
                for i, item in enumerate(data):
                    for text in item:
                        # Tokenize each text and append to the list
                        tokens = tokenize_text(text.strip())
                        tokenized_data.append(tokens)

                    # Logging progress every 1000 items
                    if i % 1000 == 0:
                        log_message(f"{i} items processed for {input_file}")

                # Write the tokenized data to the output file
                json.dump(tokenized_data, outfile)

            log_message(f"Tokenization completed for {input_file}")
            end_time = time.time()
            log_message(f"Time taken for {input_file}: {end_time - start_time:.2f} seconds")
    except Exception as e:
        log_message(f"Error while processing {input_file}: {e}")

log: Started processing cleaned_biden_data.json...
log: Opened cleaned_biden_data.json for reading
log: Opened ../flair/tokenized/cleaned_biden_data.json for writing
log: 0 items processed for cleaned_biden_data.json
log: 1000 items processed for cleaned_biden_data.json
log: 2000 items processed for cleaned_biden_data.json
log: 3000 items processed for cleaned_biden_data.json
log: 4000 items processed for cleaned_biden_data.json
log: 5000 items processed for cleaned_biden_data.json
log: 6000 items processed for cleaned_biden_data.json
log: 7000 items processed for cleaned_biden_data.json
log: Tokenization completed for cleaned_biden_data.json
log: Time taken for cleaned_biden_data.json: 23.14 seconds
log: Started processing cleaned_obama_data.json...
log: Opened cleaned_obama_data.json for reading
log: Opened ../flair/tokenized/cleaned_obama_data.json for writing
log: 0 items processed for cleaned_obama_data.json
log: 1000 items processed for cleaned_obama_data.json
log: 2000 items pro

Named Entity Recognition (NER)

In [6]:
from flair.models import SequenceTagger
from flair.data import Sentence
import time
from tqdm import tqdm
import json
import csv

def flatten(tokens):
    return [item for sublist in tokens for item in sublist]
# Load the pre-trained NER model from Flair
tagger = SequenceTagger.load("ner")

# Paths to your cleaned reports
obama_file = "../flair/tokenized/cleaned_obama_data.json"
trump_file = "../flair/tokenized/cleaned_trump_data.json"
biden_file = "../flair/tokenized/cleaned_biden_data.json"

#Path to the folder containing the output
output_folder = "../flair/ner/"
# Ensure the folder exists, if not, create it
os.makedirs(output_folder, exist_ok=True)
#create paths for each president
obama_output_path = output_folder + 'obama_ner_output.csv'
trump_output_path = output_folder + 'trump_ner_output.csv'
biden_output_path = output_folder + 'biden_ner_output.csv'

# Function to load the JSON data (assuming tokenized data is stored in JSON format)
def load_tokenized_data(file_path):
    with open(file_path, 'r') as file:
        return json.load(file)

def apply_ner_to_tokens(tokens, output_csv_path):
    tokens = flatten(tokens)
    chunk_size = 1000
    chunks = [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]

    with open(output_csv_path, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['Entity', 'Type', 'Start Position', 'End Position'])  # Header
    
        # Initialize progress bar
        with tqdm(total=len(chunks), desc="Processing Chunks") as pbar:
            for chunk in chunks:
                start_time = time.time()
                sentence = Sentence(" ".join(chunk))  # Create sentence from tokens
                tagger.predict(sentence)  # Apply NER model

                # Write detected entities to the CSV
                for entity in sentence.get_spans('ner'):
                    writer.writerow([entity.text, entity.tag, entity.start_position, entity.end_position])

                # Update progress bar
                pbar.update(1)
                elapsed_time = time.time() - start_time
                pbar.set_postfix({'Chunk Time (s)': f'{elapsed_time:.2f}'})

# Load tokenized data from JSON files
obama_tokens = load_tokenized_data(obama_file)
print(f"Total tokens to process: {len(obama_tokens)}")
trump_tokens = load_tokenized_data(trump_file)
biden_tokens = load_tokenized_data(biden_file)

# Apply NER and save results for each report
apply_ner_to_tokens(obama_tokens, obama_output_path)
apply_ner_to_tokens(trump_tokens, trump_output_path)
apply_ner_to_tokens(biden_tokens, biden_output_path)

print("NER completed and results saved for Obama, Trump, and Biden reports.")

2024-10-10 13:01:58,941 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>
Total tokens to process: 221740


Processing Chunks:   0%|          | 4/11939 [02:02<101:34:02, 30.64s/it, Chunk Time (s)=26.79]


KeyboardInterrupt: 