In [None]:
import pandas as pd
import os
import re

def load_and_process_json(input_file, output_file, row_limit=500):
    """Clean a JSON-lines file and save a reproducible random sample as CSV.

    Rows without ``processed_text`` are dropped, every run of whitespace
    (spaces, tabs, newlines) in the text is collapsed to a single space, and
    texts of 10 characters or fewer are discarded. From the remaining rows
    that also have a ``publish_date``, at most ``row_limit`` rows are sampled
    with a fixed seed and written to ``output_file``.

    Args:
        input_file: Path to a JSON-lines file containing ``publish_date`` and
            ``processed_text`` fields.
        output_file: Destination CSV path. Its parent directory is created
            when it does not exist yet.
        row_limit: Maximum number of rows to write. When fewer rows survive
            cleaning, all of them are written instead of raising.
    """
    raw = pd.read_json(input_file, lines=True)

    with_text = raw.dropna(subset=['processed_text'])

    # Trim, then replace multiple spaces/newlines with a single space.
    normalized = with_text.assign(
        processed_text=with_text['processed_text'].map(
            lambda text: re.sub(r'\s+', ' ', text.strip())
        )
    )

    # The length threshold also removes empty strings, so no separate
    # "non-empty" filter is needed.
    df = normalized[normalized['processed_text'].str.len() > 10]

    print(f"Total records loaded from {input_file}: {len(df)} after cleaning")

    df_cleaned = df[['publish_date', 'processed_text']].dropna()

    # DataFrame.sample raises ValueError when n exceeds the number of rows
    # (without replacement), so cap the sample size at what is available.
    sample_size = min(row_limit, len(df_cleaned))
    if sample_size < row_limit:
        print(
            f"Only {sample_size} rows available in {input_file}; "
            f"sampling all of them instead of {row_limit}"
        )
    if sample_size:
        df_sampled = df_cleaned.sample(n=sample_size, random_state=42)
    else:
        df_sampled = df_cleaned.iloc[:0]

    # dirname is '' for a bare filename, and os.makedirs('') raises.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df_sampled.to_csv(output_file, index=False)
    print(f"Processed data saved to {output_file}")

# Directory that receives one sampled CSV per president.
output_dir = '../flairData'

# Presidents and their preprocessed JSON-lines files, paired by position.
presidents = ['obama', 'trump', 'biden']
input_files = [
    '../data/preprocessed/obama_preprocessed.json',
    '../data/preprocessed/trump_preprocessed.json',
    '../data/preprocessed/biden_preprocessed.json'
]

# Sample 100 cleaned rows per president.
for position, president in enumerate(presidents):
    input_file = input_files[position]
    output_file = f'{output_dir}/sampled_cleaned_data_{president}.csv'
    load_and_process_json(input_file, output_file, row_limit=100)

print("Processing complete.")

Total records loaded from ../data/preprocessed/obama_preprocessed.json: 15704 after cleaning
Processed data saved to ../francescoData/sampled_cleaned_data_obama.csv
Total records loaded from ../data/preprocessed/trump_preprocessed.json: 7458 after cleaning
Processed data saved to ../francescoData/sampled_cleaned_data_trump.csv
Total records loaded from ../data/preprocessed/biden_preprocessed.json: 7736 after cleaning
Processed data saved to ../francescoData/sampled_cleaned_data_biden.csv
Processing complete.


In [None]:
import pandas as pd
from flair.data import Sentence
from flair.tokenization import SegtokTokenizer

# Single module-level tokenizer instance; tokenize_text passes it to every
# Sentence it builds instead of creating a new tokenizer per row.
tokenizer = SegtokTokenizer()

# Tokenize a single text with Flair
def tokenize_text(text):
    """Split ``text`` into token strings using the module-level ``tokenizer``.

    Args:
        text: The text to tokenize, or a missing value (``None``/NaN).

    Returns:
        A list with the text of each token; an empty list when ``text`` is
        missing.
    """
    if not pd.isna(text):
        parsed = Sentence(text, use_tokenizer=tokenizer)
        return [tok.text for tok in parsed.tokens]
    # Missing text yields no tokens.
    return []

# Tokenize the 'processed_text' column of a CSV and save the result
def tokenize_from_csv(input_file, output_file):
    """Read a CSV, tokenize each ``processed_text`` value, and write the tokens.

    Args:
        input_file: CSV path with ``publish_date`` and ``processed_text``
            columns.
        output_file: CSV path that receives the ``publish_date`` and
            ``tokens`` columns (no index column).
    """
    frame = pd.read_csv(input_file)
    tokenized = frame.assign(tokens=frame['processed_text'].apply(tokenize_text))

    # Only the date and the token list are carried forward.
    tokenized[['publish_date', 'tokens']].to_csv(output_file, index=False)
    print(f"Tokenized data saved to {output_file}")

# (sampled CSV, tokenized CSV) path pairs, one per president.
tokenize_jobs = [
    ('../flairData/sampled_cleaned_data_obama.csv', '../flairData/tokenized_data_obama.csv'),
    ('../flairData/sampled_cleaned_data_trump.csv', '../flairData/tokenized_data_trump.csv'),
    ('../flairData/sampled_cleaned_data_biden.csv', '../flairData/tokenized_data_biden.csv'),
]

input_files = [source for source, _ in tokenize_jobs]
output_files = [target for _, target in tokenize_jobs]

for input_file, output_file in tokenize_jobs:
    tokenize_from_csv(input_file, output_file)

print("Tokenization complete.")

Tokenized data saved to ../francescoData/tokenized_data_obama.csv
Tokenized data saved to ../francescoData/tokenized_data_trump.csv
Tokenized data saved to ../francescoData/tokenized_data_biden.csv
Tokenization complete.


In [None]:
import ast
import json
import time

import pandas as pd
from flair.data import Sentence
from flair.models import SequenceTagger

# Load the 'ner' sequence tagger once at module level; ner_on_tokens reuses
# this instance for every prediction.
tagger = SequenceTagger.load('ner')

# Function to run NER on tokenized text
def ner_on_tokens(tokens):
    """Find named entities in one document given as a list of tokens.

    The tokens are joined with single spaces, wrapped in a ``Sentence`` and
    passed to the module-level ``tagger``.

    Args:
        tokens: List of token strings for one document.

    Returns:
        A list of ``{'text': ..., 'label': ...}`` dicts, one per span the
        tagger labelled under ``'ner'``. Empty when ``tokens`` is empty.
    """
    # Nothing to tag: avoid building an empty Sentence and calling the model.
    if len(tokens) == 0:
        return []

    # NOTE(review): the joined string is tokenized again by Sentence's default
    # tokenizer - confirm this reproduces the token boundaries from upstream.
    sentence = Sentence(' '.join(tokens))
    tagger.predict(sentence)

    return [
        {'text': span.text, 'label': span.get_label('ner').value}
        for span in sentence.get_spans('ner')
    ]

# Function to process the CSV file and perform NER
def perform_ner(input_file, output_file):
    """Run NER on every row of a tokenized CSV and save the entities as JSON.

    Each ``tokens`` cell is expected to hold the text form of a Python list of
    strings. The entities found for a row are stored as a JSON string in an
    ``entities`` column, and progress with a time estimate is printed per row.

    Args:
        input_file: CSV path with ``publish_date`` and ``tokens`` columns.
        output_file: CSV path that receives the ``publish_date`` and
            ``entities`` columns (no index column).
    """
    df = pd.read_csv(input_file)

    total_rows = len(df)
    start_time = time.time()

    # Collect results in a list and assign the column once. This also
    # guarantees the 'entities' column exists when the input has no rows.
    entity_column = []
    for rows_done, raw_tokens in enumerate(df['tokens'], start=1):
        if pd.isna(raw_tokens):
            tokens = []
        else:
            # SECURITY: the cell content comes from a file, so it is parsed
            # with ast.literal_eval (literals only) instead of eval, which
            # would execute arbitrary expressions found in the CSV.
            tokens = ast.literal_eval(raw_tokens)

        entity_column.append(json.dumps(ner_on_tokens(tokens)))

        # Estimate from a running average; rows_done counts positions, so it
        # stays correct whatever the frame's index labels are.
        avg_time_per_row = (time.time() - start_time) / rows_done
        remaining_time = (total_rows - rows_done) * avg_time_per_row

        print(f"Processed {rows_done}/{total_rows} rows. Estimated time remaining: {remaining_time:.2f} seconds")

    df['entities'] = entity_column
    df[['publish_date', 'entities']].to_csv(output_file, index=False)

    total_time = time.time() - start_time
    print(f"NER results saved to {output_file}. Total time: {total_time:.2f} seconds")

# (tokenized CSV, NER result CSV) path pairs, one per president.
ner_jobs = [
    ('../flairData/tokenized_data_obama.csv', '../flairData/ner_results_obama.csv'),
    ('../flairData/tokenized_data_trump.csv', '../flairData/ner_results_trump.csv'),
    ('../flairData/tokenized_data_biden.csv', '../flairData/ner_results_biden.csv'),
]

input_files = [source for source, _ in ner_jobs]
output_files = [target for _, target in ner_jobs]

for input_file, output_file in ner_jobs:
    perform_ner(input_file, output_file)

print("NER process complete.")

2024-10-11 15:28:49,033 SequenceTagger predicts: Dictionary with 20 tags: <unk>, O, S-ORG, S-MISC, B-PER, E-PER, S-LOC, B-ORG, E-ORG, I-PER, S-PER, B-MISC, I-MISC, E-MISC, I-ORG, B-LOC, E-LOC, I-LOC, <START>, <STOP>
Processed 1/100 rows. Estimated time remaining: 1056.40 seconds
Processed 2/100 rows. Estimated time remaining: 851.89 seconds
Processed 3/100 rows. Estimated time remaining: 3938.84 seconds
Processed 4/100 rows. Estimated time remaining: 3010.81 seconds
Processed 5/100 rows. Estimated time remaining: 2509.44 seconds
Processed 6/100 rows. Estimated time remaining: 2143.67 seconds
Processed 7/100 rows. Estimated time remaining: 1845.53 seconds
Processed 8/100 rows. Estimated time remaining: 1638.61 seconds
Processed 9/100 rows. Estimated time remaining: 1679.28 seconds
Processed 10/100 rows. Estimated time remaining: 1559.03 seconds
Processed 11/100 rows. Estimated time remaining: 1429.36 seconds
Processed 12/100 rows. Estimated time remaining: 1323.30 seconds
Processed 13/1