In [22]:
import spacy
import csv
import random
# Load the 'en_core_web_sm' pipeline. The return value is not bound to a name,
# so the loaded pipeline object is only echoed as this cell's output.
spacy.load('en_core_web_sm')

<spacy.lang.en.English at 0x28324a4efd0>

In [19]:
# Shared spaCy pipeline used to look up lemmas; the 'parser' and 'ner'
# components are switched off when the model is loaded.
nlp = spacy.load(
    'en_core_web_sm',
    disable=['parser', 'ner'],
)

In [20]:
def word_shape(word):
    """Return the orthographic shape of ``word``.

    Every uppercase character becomes 'X', every lowercase character
    becomes 'x' and every digit becomes 'd'. Any other character
    (punctuation, symbols, ...) is copied through unchanged.
    """
    def _shape_of(symbol):
        # Checks are ordered upper -> lower -> digit; anything else is kept.
        if symbol.isupper():
            return 'X'
        if symbol.islower():
            return 'x'
        if symbol.isdigit():
            return 'd'
        return symbol

    return ''.join(_shape_of(symbol) for symbol in word)

def is_nan(value):
    """Tell whether ``value`` is a missing-value marker.

    A value counts as missing when it is the empty string or when it
    spells 'nan' in any letter case.
    """
    if value == '':
        return True
    return value.lower() == 'nan'

def balance_classes(input_file, threshold=None, seed=None):
    """Down-sample over-represented labels in a tab-separated CoNLL file.

    The label of a row is its last column. Blank lines are skipped and do
    not appear in the result.

    Args:
        input_file: Path of the UTF-8, tab-separated input file.
        threshold: Target maximum number of rows per label. When ``None``
            it defaults to the mean number of rows per label (integer
            division of the total row count by the number of labels).
        seed: Optional seed for the down-sampling. When ``None`` the
            module-level ``random`` generator is used, so a prior
            ``random.seed(...)`` call still controls the outcome.

    Returns:
        A list of rows (each a list of column strings) in file order.
        Every row whose label occurs at most ``threshold`` times is kept;
        rows of more frequent labels are each kept with probability
        ``threshold / label_count``, so about ``threshold`` rows survive.
    """
    class_counts = {'B-ORG': 0, 'O': 0, 'B-MISC': 0, 'B-PER': 0, 'I-PER': 0,
                    'B-LOC': 0, 'I-ORG': 0, 'I-MISC': 0, 'I-LOC': 0}

    # QUOTE_NONE: a token may be a bare double quote. With the csv default
    # it would open a quoted field and swallow the following lines.
    with open(input_file, 'r', encoding='utf-8', newline='') as file:
        reader = csv.reader(file, delimiter='\t', quoting=csv.QUOTE_NONE)
        rows = [row for row in reader if row]

    for row in rows:
        label = row[-1]
        # .get() lets labels outside the predefined tag set be counted
        # instead of raising KeyError.
        class_counts[label] = class_counts.get(label, 0) + 1

    # Compare with None so that an explicit threshold of 0 is respected.
    if threshold is None:
        threshold = sum(class_counts.values()) // len(class_counts)

    rng = random.Random(seed) if seed is not None else random

    selected_rows = []
    for row in rows:
        # The totals stay fixed during selection: incrementing them here
        # would make minority labels cross the threshold part-way through
        # the file and lose rows.
        label_total = class_counts[row[-1]]
        if label_total <= threshold or rng.random() < threshold / label_total:
            selected_rows.append(row)

    return selected_rows

def process_file(input_file, output_file, max_output_columns):
    """Write a tab-separated feature file for the balanced rows of ``input_file``.

    The rows returned by ``balance_classes(input_file)`` are processed in
    order, and one output row is written for every non-empty row. Each
    output row starts with seven features derived from the row's first
    column (the word):

        word, previous word, next word, lemma, capitalisation flag,
        word shape, word length

    followed by the row's remaining input columns. The result is truncated
    or padded with empty strings to exactly ``max_output_columns`` columns.

    Context handling:
        * The previous word is 'START' for the first row and after an
          empty row; the next word is 'END' for the last row and before
          an empty row.
        * A neighbouring word that ``is_nan`` reports as missing is
          written as 'NaN'.
        * If the row's own word is missing, all seven features are 'NaN'.

    NOTE: neighbours are taken from the balanced row sequence, so they are
    adjacent in the original text only where no row between them was
    dropped by the balancing step.

    Lemmas come from the module-level ``nlp`` pipeline, which is called
    once per word.

    Args:
        input_file: Path of the tab-separated input file.
        output_file: Path of the UTF-8 file to create or overwrite.
        max_output_columns: Exact number of columns in every output row.
    """
    balanced_rows = balance_classes(input_file)
    row_count = len(balanced_rows)

    def _context_word(neighbour, boundary_marker):
        # Word used as a context feature. A missing neighbour (edge of the
        # data or an empty separator row) yields the boundary marker.
        if not neighbour:
            return boundary_marker
        return 'NaN' if is_nan(neighbour[0]) else neighbour[0]

    with open(output_file, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile, delimiter='\t')

        # Index-based look-behind/look-ahead so every row is written with
        # its context. The previous one-row-delay buffer never flushed, so
        # it lost the final row and any row followed by a missing word.
        for index, row in enumerate(balanced_rows):
            if not row:
                continue

            previous_row = balanced_rows[index - 1] if index > 0 else None
            following_row = (balanced_rows[index + 1]
                             if index + 1 < row_count else None)

            current_word = row[0]
            if is_nan(current_word):
                features = ['NaN'] * 7
            else:
                doc = nlp(current_word)
                # Guard against a pipeline result with no tokens.
                lemma = doc[0].lemma_ if doc else 'NaN'
                features = [
                    current_word,  # Original word
                    _context_word(previous_row, 'START'),  # Previous word
                    _context_word(following_row, 'END'),  # Next word
                    lemma,  # Lemma
                    str(current_word[0].isupper()),  # Capitalization
                    word_shape(current_word),  # Word shape
                    str(len(current_word)),  # Word length
                ]

            total_columns = features + row[1:]
            # A negative multiplier yields [], so over-long rows are only
            # truncated and short rows are only padded.
            padding = [''] * (max_output_columns - len(total_columns))
            writer.writerow(total_columns[:max_output_columns] + padding)

In [24]:
# Pre-process the development split.
input_file_path = 'data/conll2003.dev.conll'
output_file_path = 'data/pre.conll2003.dev.conll'
max_output_columns = 10  # every output row is truncated/padded to this width

process_file(
    input_file=input_file_path,
    output_file=output_file_path,
    max_output_columns=max_output_columns,
)

In [17]:

# Pre-process the training split.
input_file_path = 'data/conll2003.train.conll'
output_file_path = 'data/pre.conll2003.train.conll'
max_output_columns = 10  # every output row is truncated/padded to this width

process_file(
    input_file=input_file_path,
    output_file=output_file_path,
    max_output_columns=max_output_columns,
)