In [2]:
from datasets import load_dataset
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

# Load dataset
dataset = load_dataset("wnut_17")


In [44]:
def word2features(sent, i):
    word = sent[i]
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
    }
    if i > 0:
        word1 = sent[i-1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
        })
    else:
        features['EOS'] = True
    # print(features[0])
    return features

def extract_features(doc):
    return [word2features(doc, i) for i in range(len(doc))]

def extract_labels(doc, label_list):
    return [label_list[label] for label in doc]

# Assuming the dataset provides a 'features' object with a mapping of tags
label_list = dataset['train'].features['ner_tags'].feature.names
print(label_list)
# Prepare the data
X_train = [extract_features(sent['tokens']) for sent in dataset['train']]
print(dataset['train'])
y_train = [extract_labels(sent['ner_tags'], label_list) for sent in dataset['train']]
X_test = [extract_features(sent['tokens']) for sent in dataset['test']]
print(dataset['test'])
print(dataset['validation'])
y_test = [extract_labels(sent['ner_tags'], label_list) for sent in dataset['test']]


['O', 'B-corporation', 'I-corporation', 'B-creative-work', 'I-creative-work', 'B-group', 'I-group', 'B-location', 'I-location', 'B-person', 'I-person', 'B-product', 'I-product']
Dataset({
    features: ['id', 'tokens', 'ner_tags'],
    num_rows: 3394
})
Dataset({
    features: ['id', 'tokens', 'ner_tags'],
    num_rows: 1287
})
Dataset({
    features: ['id', 'tokens', 'ner_tags'],
    num_rows: 1009
})


In [45]:
import sklearn_crfsuite
from sklearn_crfsuite import metrics

# Create and train the CRF model
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)

crf.fit(X_train, y_train)

# Predict on the test set
y_pred = crf.predict(X_test)

# Calculate and print out the metrics
print("F1 Score:", metrics.flat_f1_score(y_test, y_pred, average='weighted'))
print("Precision:", metrics.flat_precision_score(y_test, y_pred, average='weighted'))
print("Recall:", metrics.flat_recall_score(y_test, y_pred, average='weighted'))


F1 Score: 0.9018296551943255
Precision: 0.8931341083648305
Recall: 0.9289133965974181


In [46]:
# Retrieve the tag names from the dataset's feature schema
tag_names = dataset['train'].features['ner_tags'].feature.names

# Print the BIO tags
print("BIO Tags Available in WNUT-17 Dataset:")
for tag in tag_names:
    print(tag)


BIO Tags Available in WNUT-17 Dataset:
O
B-corporation
I-corporation
B-creative-work
I-creative-work
B-group
I-group
B-location
I-location
B-person
I-person
B-product
I-product


In [47]:
def predict_sentence(sentence):
    tokens = sentence.split()  # Simple tokenization, consider using a proper tokenizer
    features = [word2features(tokens, i) for i in range(len(tokens))]
    tags = crf.predict_single(features)
    return list(zip(tokens, tags))

# Example usage
sentence = "My name is Sarah, I live in London and New York with Obama. Google is a company. i am using java to develop this"
print(predict_sentence(sentence))


[('My', 'O'), ('name', 'O'), ('is', 'O'), ('Sarah,', 'O'), ('I', 'O'), ('live', 'O'), ('in', 'O'), ('London', 'B-location'), ('and', 'O'), ('New', 'B-location'), ('York', 'I-location'), ('with', 'O'), ('Obama.', 'B-person'), ('Google', 'I-person'), ('is', 'O'), ('a', 'O'), ('company.', 'O'), ('i', 'O'), ('am', 'O'), ('using', 'O'), ('java', 'O'), ('to', 'O'), ('develop', 'O'), ('this', 'O')]
