In [1]:
import re
from collections import Counter

In [2]:
def is_titlecase(word):
    """Report whether `word` is title-cased.

    Delegates to the ``istitle`` method of `word` (a ``str`` in this
    notebook's usage): the result is True only when the text contains at
    least one cased character, uppercase letters appear only at the start
    of each cased run, and lowercase letters appear only after another
    cased character. For example ``'German'`` -> True, ``'EU'`` -> False.

    Args:
        word: The token to test.

    Returns:
        bool: True when `word` is title-cased, otherwise False.
    """
    titlecased = word.istitle()
    return titlecased


def is_uppercase(word):
    """Report whether `word` is written entirely in upper case.

    Delegates to the ``isupper`` method of `word` (a ``str`` in this
    notebook's usage): the result is True only when the text has at least
    one cased character and every cased character is uppercase. Uncased
    characters such as digits are ignored, so ``'A1'`` -> True while
    ``'1996'`` -> False.

    Args:
        word: The token to test.

    Returns:
        bool: True when `word` is uppercase, otherwise False.
    """
    all_caps = word.isupper()
    return all_caps


def is_digit(word):
    """Report whether `word` consists only of digit characters.

    Delegates to the ``isdigit`` method of `word` (a ``str`` in this
    notebook's usage): the result is True only when the text is non-empty
    and every character is a digit. Signs, decimal points and thousands
    separators are not digits, so ``'1996'`` -> True while ``'19.96'`` and
    ``'-1'`` -> False.

    Args:
        word: The token to test.

    Returns:
        bool: True when `word` is all digits, otherwise False.
    """
    only_digits = word.isdigit()
    return only_digits


def extract_features_and_bigrams(file_path):
    """Parse a tab-separated, blank-line-delimited token file into features.

    The file is read as UTF-8, stripped of leading/trailing whitespace and
    split into sentence blocks on blank lines. Inside a block only lines
    with exactly four tab-separated fields are used: the first field is
    taken as the token and the second as its POS tag. Every other line is
    ignored. Blocks that yield no token at all (an empty file, or a run of
    several blank lines) are skipped instead of being emitted as empty
    sentence records.

    Args:
        file_path: Path of the file to read.

    Returns:
        tuple: ``(data, bigrams_frequency)`` where

        * ``data`` is a list with one dict per sentence, holding
          ``'tokens'``, ``'pos_tags'``, ``'orthographic_features'`` (one
          dict of ``'is_titlecase'`` / ``'is_uppercase'`` / ``'is_digit'``
          flags per token) and ``'bigrams'`` (tuples of adjacent tokens
          within that sentence);
        * ``bigrams_frequency`` is a ``Counter`` of those bigrams over the
          whole file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        sentences = file.read().strip().split('\n\n')

    data = []
    bigrams_frequency = Counter()
    for sentence in sentences:
        tokens, pos_tags = [], []
        orthographic_features = []
        for line in sentence.split('\n'):
            parts = line.split('\t')
            if len(parts) != 4:
                # Not a 4-column token line (blank or malformed): ignore it.
                continue
            token, pos_tag = parts[0], parts[1]
            tokens.append(token)
            pos_tags.append(pos_tag)
            orthographic_features.append({
                'is_titlecase': is_titlecase(token),
                'is_uppercase': is_uppercase(token),
                'is_digit': is_digit(token)
            })

        if not tokens:
            # split('\n\n') produces empty chunks for an empty file or for
            # consecutive blank-line separators; those are not sentences.
            continue

        # Adjacent-token pairs never cross a sentence boundary because they
        # are built from this sentence's token list only.
        bigrams = list(zip(tokens, tokens[1:]))
        bigrams_frequency.update(bigrams)

        data.append({
            'tokens': tokens,
            'pos_tags': pos_tags,
            'orthographic_features': orthographic_features,
            'bigrams': bigrams
        })
    return data, bigrams_frequency

In [3]:
# Parse the input file into per-sentence records (`data`) and a file-wide
# Counter of adjacent-token pairs (`bigrams_frequency`).
# NOTE(review): going by its name this is presumably the CoNLL-2003 training
# split. The path is relative, so it resolves against the kernel's working
# directory -- confirm the notebook is started where `data/` is reachable.
file_path = 'data/conll2003.train.conll'
data, bigrams_frequency = extract_features_and_bigrams(file_path)

In [7]:
# Preview the parsed data: the first two sentence records together with the
# five most frequent bigrams. The tuple on the last line is the cell's
# displayed value, so no print() call is needed.
data_sample = data[:2]  # first two sentence records only
data_sample, bigrams_frequency.most_common(5)  # (records, [(bigram, count), ...])

([{'tokens': ['EU',
    'rejects',
    'German',
    'call',
    'to',
    'boycott',
    'British',
    'lamb',
    '.'],
   'pos_tags': ['NNP', 'VBZ', 'JJ', 'NN', 'TO', 'VB', 'JJ', 'NN', '.'],
   'orthographic_features': [{'is_titlecase': False,
     'is_uppercase': True,
     'is_digit': False},
    {'is_titlecase': False, 'is_uppercase': False, 'is_digit': False},
    {'is_titlecase': True, 'is_uppercase': False, 'is_digit': False},
    {'is_titlecase': False, 'is_uppercase': False, 'is_digit': False},
    {'is_titlecase': False, 'is_uppercase': False, 'is_digit': False},
    {'is_titlecase': False, 'is_uppercase': False, 'is_digit': False},
    {'is_titlecase': True, 'is_uppercase': False, 'is_digit': False},
    {'is_titlecase': False, 'is_uppercase': False, 'is_digit': False},
    {'is_titlecase': False, 'is_uppercase': False, 'is_digit': False}],
   'bigrams': [('EU', 'rejects'),
    ('rejects', 'German'),
    ('German', 'call'),
    ('call', 'to'),
    ('to', 'boycott'),
    (