In [19]:
pip install datasets sklearn-crfsuite


Note: you may need to restart the kernel to use updated packages.


In [22]:
from datasets import load_dataset
import sklearn_crfsuite
from sklearn_crfsuite import metrics

# Load the "wnut_17" dataset through `datasets.load_dataset`. Later cells index
# the result by split name (`dataset['train']`) and read the per-example
# 'tokens' and 'ner_tags' fields.
dataset = load_dataset("wnut_17")

def word2features(tokens, i):
    """Build the feature dictionary for the token at position ``i``.

    The dictionary always carries a constant bias term plus surface features
    of the token itself (lower-cased form, last three / last two characters,
    and upper-case / title-case / digit flags).  Context features follow:

    * left neighbour (``tokens[i - 1]``) when ``i > 0``, otherwise ``'BOS'``;
    * right neighbour (``tokens[i + 1]``) when ``i < len(tokens) - 1``,
      otherwise ``'EOS'``.

    Args:
        tokens: Indexable sequence of token strings for one sentence.
        i: Position of the token to describe.

    Returns:
        dict mapping feature names to float / str / bool values.
    """
    token = tokens[i]
    feats = {
        'bias': 1.0,
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
    }

    # Left context: either describe the previous token or mark sentence start.
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token = tokens[i - 1]
        feats['-1:word.lower()'] = prev_token.lower()
        feats['-1:word.istitle()'] = prev_token.istitle()
        feats['-1:word.isupper()'] = prev_token.isupper()

    # Right context: either describe the next token or mark sentence end.
    last_index = len(tokens) - 1
    if i >= last_index:
        feats['EOS'] = True
    else:
        next_token = tokens[i + 1]
        feats['+1:word.lower()'] = next_token.lower()
        feats['+1:word.istitle()'] = next_token.istitle()
        feats['+1:word.isupper()'] = next_token.isupper()

    return feats

def extract_features(doc):
    """Return one ``word2features`` dict per token of ``doc['tokens']``, in order.

    Args:
        doc: Mapping with a 'tokens' entry holding the sentence's token strings.

    Returns:
        list of feature dicts, the same length as ``doc['tokens']``.
    """
    tokens = doc['tokens']
    return [word2features(tokens, position) for position in range(len(tokens))]

def extract_labels(doc):
    """Return the entries of ``doc['ner_tags']`` converted to strings, in order.

    Args:
        doc: Mapping with a 'ner_tags' entry holding one label per token.

    Returns:
        list of ``str``, the same length as ``doc['ner_tags']``.
    """
    # Every label is passed through str() so the tagger only sees string labels.
    return list(map(str, doc['ner_tags']))

# Training inputs: one list of feature dicts (X) and one list of string
# labels (y) per sentence of the 'train' split.
train_sents = dataset['train']
X_train = list(map(extract_features, train_sents))
y_train = list(map(extract_labels, train_sents))


In [24]:
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn.metrics import precision_score, recall_score, f1_score

# --- Train -----------------------------------------------------------------
# Fit a CRF with the L-BFGS trainer; c1 / c2 are the regularisation
# coefficients passed to sklearn_crfsuite.CRF. Errors are deliberately NOT
# caught here: a failed fit must stop the notebook with a full traceback
# instead of letting the evaluation below run against a missing model.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)
print("Model trained successfully")

# --- Held-out data ---------------------------------------------------------
# X_test / y_test were previously only present as leftover kernel state, so a
# fresh "Restart & Run All" could not evaluate. Build them here with the same
# feature / label extraction used for training.
# NOTE(review): assumes the dataset's 'test' split is the intended evaluation
# set -- confirm against how the recorded metrics were originally produced.
test_sents = dataset['test']
X_test = [extract_features(sent) for sent in test_sents]
y_test = [extract_labels(sent) for sent in test_sents]

# --- Evaluate --------------------------------------------------------------
y_pred = crf.predict(X_test)

# sklearn's scorers expect flat label lists, so collapse the per-sentence
# sequences into one token-level list each.
y_pred_flat = [item for sublist in y_pred for item in sublist]
y_test_flat = [item for sublist in y_test for item in sublist]

# All scores are token-level; precision / recall / F1 are support-weighted
# averages over every label string that occurs, not entity-level scores.
print("Accuracy:", metrics.flat_accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test_flat, y_pred_flat, average='weighted'))
print("Recall:", recall_score(y_test_flat, y_pred_flat, average='weighted'))
print("F1 Score:", f1_score(y_test_flat, y_pred_flat, average='weighted'))


Model trained successfully
Accuracy: 0.9289133965974181
Precision: 0.8931341083648303
Recall: 0.9289133965974181
F1 Score: 0.9018296551943256


In [31]:
def process_new_sentence(sentence, crf_model, tokenizer=None):
    """Tag a raw sentence with a trained CRF and score each token.

    Args:
        sentence: Raw text to tag.
        crf_model: Object exposing ``predict_single(features)`` and
            ``predict_marginals_single(features)`` (the ``crf`` trained in
            this notebook is passed by the caller).
        tokenizer: Optional callable mapping ``sentence`` to an iterable of
            token strings. When omitted the sentence is split on whitespace
            with ``str.split``, which leaves punctuation attached to the
            neighbouring word (e.g. ``"Sarah,"``); pass a real tokenizer to
            avoid that.

    Returns:
        Tuple ``(tokens, tags, confidence_scores)``:
        ``tokens`` is the token list, ``tags`` is the model's predicted label
        sequence, and ``confidence_scores`` holds, per token, the largest
        marginal probability over all labels at that position. The score is
        computed independently of ``tags``; it is the maximum marginal, not
        a lookup of the predicted tag's marginal.
    """
    # Default keeps the original whitespace split; list() lets a custom
    # tokenizer return any iterable (word2features needs len() and indexing).
    if tokenizer is None:
        tokens = sentence.split()
    else:
        tokens = list(tokenizer(sentence))

    # One feature dict per token, built exactly as for the training data.
    features = [word2features(tokens, i) for i in range(len(tokens))]

    # Predicted label sequence for the whole sentence.
    tags = crf_model.predict_single(features)

    # Per-position {label: marginal probability} dicts.
    probabilities = crf_model.predict_marginals_single(features)

    # Highest marginal at each position serves as the confidence score.
    confidence_scores = [max(prob.values()) for prob in probabilities]

    return tokens, tags, confidence_scores

# Demo: tag one hand-written sentence with the trained model and print a
# fixed-width row per token -- token, predicted tag, confidence score.
sentence = "My name is Sarah, I live in Empire state building. I like my apple phone."
tokens, predicted_tags, confidence_scores = process_new_sentence(sentence, crf)

for row in zip(tokens, predicted_tags, confidence_scores):
    token, tag, score = row
    print(f"{token:15} {tag:5} {score:.4f}")


My              0     0.9943
name            0     0.9994
is              0     0.9998
Sarah,          0     0.8426
I               0     0.9652
live            0     0.9999
in              0     0.9994
Empire          7     0.9034
state           0     0.9338
building.       0     0.9950
I               0     0.9998
like            0     0.9999
my              0     0.9998
apple           0     0.9924
phone.          0     0.9981
