### Importing required data

In [1]:
from datasets import load_dataset

# Load the training split of the "conll2003" dataset.
conllu_train = load_dataset("conll2003", split="train")

#### Storing this in a dataframe

In [2]:
import pandas as pd

In [3]:
# Convert the loaded split to a pandas DataFrame (one row per sentence, list-valued columns).
df_train = conllu_train.to_pandas()

In [4]:
# Preview the first rows; ner_tags are still integer ids at this point.
df_train.head()

Unnamed: 0,id,tokens,pos_tags,chunk_tags,ner_tags
0,0,"[EU, rejects, German, call, to, boycott, Briti...","[22, 42, 16, 21, 35, 37, 16, 21, 7]","[11, 21, 11, 12, 21, 22, 11, 12, 0]","[3, 0, 7, 0, 0, 0, 7, 0, 0]"
1,1,"[Peter, Blackburn]","[22, 22]","[11, 12]","[1, 2]"
2,2,"[BRUSSELS, 1996-08-22]","[22, 11]","[11, 12]","[5, 0]"
3,3,"[The, European, Commission, said, on, Thursday...","[12, 22, 22, 38, 15, 22, 28, 38, 15, 16, 21, 3...","[11, 12, 12, 21, 13, 11, 11, 21, 13, 11, 12, 1...","[0, 3, 4, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, ..."
4,4,"[Germany, 's, representative, to, the, Europea...","[22, 27, 21, 35, 12, 22, 22, 27, 16, 21, 22, 2...","[11, 11, 12, 13, 11, 12, 12, 11, 12, 12, 12, 1...","[5, 0, 0, 0, 0, 3, 4, 0, 0, 0, 1, 2, 0, 0, 0, ..."


#### Reduce number of labels  
(By merging the B- (Beginning) and I- (Inside) tags of each entity type into a single label)

In [5]:
# BIO-scheme label name -> integer id (presumably mirrors the dataset's own label ids; verify
# against the dataset features). Kept for reference: new_mapping below is written out by hand.
ner_tags = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}

In [6]:
# Integer tag id -> collapsed label: the B-/I- variants of an entity type share one label,
# and 'O' (id 0) stays 'O'.
new_mapping = {0: 'O', 1: 'PER', 2: 'PER', 3: 'ORG', 4: 'ORG', 5: 'LOC', 6: 'LOC', 7: 'MISC', 8: 'MISC'}

# Tags that are already strings are passed through untouched, so re-running this cell
# after the column has been converted does not raise KeyError. Unknown integer ids
# still raise KeyError, exactly as before.
df_train['ner_tags'] = df_train['ner_tags'].apply(
    lambda tags: [tag if isinstance(tag, str) else new_mapping[tag] for tag in tags]
)

In [7]:
# Preview again: ner_tags now hold the collapsed string labels.
df_train.head()

Unnamed: 0,id,tokens,pos_tags,chunk_tags,ner_tags
0,0,"[EU, rejects, German, call, to, boycott, Briti...","[22, 42, 16, 21, 35, 37, 16, 21, 7]","[11, 21, 11, 12, 21, 22, 11, 12, 0]","[ORG, O, MISC, O, O, O, MISC, O, O]"
1,1,"[Peter, Blackburn]","[22, 22]","[11, 12]","[PER, PER]"
2,2,"[BRUSSELS, 1996-08-22]","[22, 11]","[11, 12]","[LOC, O]"
3,3,"[The, European, Commission, said, on, Thursday...","[12, 22, 22, 38, 15, 22, 28, 38, 15, 16, 21, 3...","[11, 12, 12, 21, 13, 11, 11, 21, 13, 11, 12, 1...","[O, ORG, ORG, O, O, O, O, O, O, MISC, O, O, O,..."
4,4,"[Germany, 's, representative, to, the, Europea...","[22, 27, 21, 35, 12, 22, 22, 27, 16, 21, 22, 2...","[11, 11, 12, 13, 11, 12, 12, 11, 12, 12, 12, 1...","[LOC, O, O, O, O, ORG, ORG, O, O, O, PER, PER,..."


### Creating Features

In [8]:
def is_capitalized(word):
    """Return 1 if the first character of ``word`` is uppercase, otherwise 0.

    An empty string yields 0 instead of raising ``IndexError``.
    """
    # Slicing (rather than indexing) gives '' for an empty token, and ''.isupper() is False.
    return int(word[:1].isupper())

# One 0/1 capitalization flag per token, stored as a list per sentence.
df_train['capitalized'] = df_train['tokens'].apply(
    lambda sentence: [is_capitalized(token) for token in sentence]
)

Checks if a word is capitalized or not

In [9]:
def is_NNP(tags, nnp_id=22):
    """Flag which POS tag ids in a sentence equal the NNP (proper noun) id.

    Args:
        tags: iterable of integer POS tag ids for one sentence.
        nnp_id: tag id treated as NNP. Defaults to 22, the id this notebook
            uses for NNP; pass another value if the tag set is encoded differently.

    Returns:
        A list of 0/1 ints, one per entry of ``tags``.
    """
    return [1 if tag == nnp_id else 0 for tag in tags]

# One 0/1 flag per POS tag, stored as a list per sentence.
df_train['is_NNP'] = df_train['pos_tags'].apply(is_NNP)

Checks if the part-of-speech tag is NNP (proper noun)

In [10]:
def next_token_possessive(tokens):
    """Flag each token that is immediately followed by the possessive marker ``'s``.

    Args:
        tokens: sequence of token strings for one sentence.

    Returns:
        A list of 0/1 ints with exactly ``len(tokens)`` entries. The last token
        has no successor and is always 0; an empty sentence yields ``[]``.
    """
    count = len(tokens)
    if count == 0:
        # Without this guard the trailing 0 below would produce [0] for an empty
        # sentence, breaking the one-flag-per-token alignment.
        return []
    flags = [1 if tokens[i + 1] == "'s" else 0 for i in range(count - 1)]
    flags.append(0)  # last token: nothing follows it
    return flags

# Apply the function to the 'tokens' column; each row stores a list of 0/1 flags for its sentence
df_train['is_possession'] = df_train['tokens'].apply(next_token_possessive)

Checks if the next token is 's, which indicates possession

In [11]:
def has_digit(tokens):
    """Return a list of 0/1 ints marking which tokens contain at least one digit character."""
    flags = []
    for token in tokens:
        found = False
        for ch in token:
            if ch.isdigit():
                found = True
                break
        flags.append(1 if found else 0)
    return flags

# Apply the function to the 'tokens' column; each row stores one 0/1 flag per token
df_train['has_digit'] = df_train['tokens'].apply(has_digit)

Checks if the token contains a digit

### Transition Features

The features below do not look at the input x, but rather at the labels y(i) and y(i-1)

In [12]:
# The collapsed label set; drives the starts_with_* and <prev>_<curr> feature columns below.
labels = ['O', 'PER', 'ORG', 'LOC', 'MISC']

In [13]:
def starts_with(ner_tags, tag):
    """Return a 0/1 list the length of ``ner_tags``.

    Only position 0 can be 1, and only when the first label equals ``tag``;
    every later position is 0.
    """
    return [
        1 if (position == 0 and label == tag) else 0
        for position, label in enumerate(ner_tags)
    ]

# Add one starts_with_<label> column per label.
for label in labels:
    df_train[f'starts_with_{label}'] = df_train['ner_tags'].apply(
        starts_with, tag=label
    )

Creates 5 new features to check if first token is of that particular label

In [14]:
def transition(ner_tags, curr, prev):
    """Flag positions where the label is ``curr`` and the preceding label is ``prev``.

    Args:
        ner_tags: sequence of labels for one sentence.
        curr: label required at position i.
        prev: label required at position i - 1.

    Returns:
        A list of 0/1 ints with exactly ``len(ner_tags)`` entries. Position 0 has
        no predecessor and is always 0; an empty sentence yields ``[]``.
    """
    if len(ner_tags) == 0:
        # Without this guard the seed 0 below would produce [0] for an empty
        # sentence, breaking the one-flag-per-token alignment.
        return []
    flags = [0]  # first token: no previous label to transition from
    for i in range(1, len(ner_tags)):
        flags.append(1 if ner_tags[i] == curr and ner_tags[i - 1] == prev else 0)
    return flags

# Add one <previous>_<current> column for every ordered pair of labels.
for previous in labels:
    for current in labels:
        df_train[f'{previous}_{current}'] = df_train['ner_tags'].apply(
            transition, curr=current, prev=previous
        )

Creates 5*5 = 25 new features to check current label and previous label dependencies

In [15]:
# Preview the first three rows with the new feature columns added.
df_train.head(3)

Unnamed: 0,id,tokens,pos_tags,chunk_tags,ner_tags,capitalized,is_NNP,is_possession,has_digit,starts_with_O,...,LOC_O,LOC_PER,LOC_ORG,LOC_LOC,LOC_MISC,MISC_O,MISC_PER,MISC_ORG,MISC_LOC,MISC_MISC
0,0,"[EU, rejects, German, call, to, boycott, Briti...","[22, 42, 16, 21, 35, 37, 16, 21, 7]","[11, 21, 11, 12, 21, 22, 11, 12, 0]","[ORG, O, MISC, O, O, O, MISC, O, O]","[1, 0, 1, 0, 0, 0, 1, 0, 0]","[1, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]",...,"[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 1, 0, 0, 0, 1, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,1,"[Peter, Blackburn]","[22, 22]","[11, 12]","[PER, PER]","[1, 1]","[1, 1]","[0, 0]","[0, 0]","[0, 0]",...,"[0, 0]","[0, 0]","[0, 0]","[0, 0]","[0, 0]","[0, 0]","[0, 0]","[0, 0]","[0, 0]","[0, 0]"
2,2,"[BRUSSELS, 1996-08-22]","[22, 11]","[11, 12]","[LOC, O]","[1, 0]","[1, 0]","[0, 0]","[0, 1]","[0, 0]",...,"[0, 1]","[0, 0]","[0, 0]","[0, 0]","[0, 0]","[0, 0]","[0, 0]","[0, 0]","[0, 0]","[0, 0]"


So, in total we now have 4 + 5 + 25 = 34 features. If we extract the features for a single word, say 'EU' (the first token of the sentence at index 0), we should get an array of length 34.

In [22]:
# Columns 0-4 are id, tokens, pos_tags, chunk_tags and ner_tags; every column from
# position 5 onward is a feature. Take row 0, token 0 from each feature column.
test_li = [df_train[column][0][0] for column in df_train.columns[5:]]
print(test_li)

[1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


So, this is the feature vector of the word 'EU' when passed through all 34 features. To confirm, let us check the length of this array.

In [23]:
# Expected: 34 (4 token features + 5 starts_with features + 25 transition features).
len(test_li)

34

When building the model, these can easily be converted into tensors for each token

So, each word/token in a sequence will have a feature vector of length 34, and our weight vector will be of the same size. We compute the dot product of this weight vector with each word's feature vector in the sentence and sum the results.