In [None]:
# Install the sklearn-crfsuite package, which provides tools for Conditional Random Fields (CRFs)
# This package is used for sequence modeling, often applied to tasks like Named Entity Recognition (NER)
!pip install sklearn-crfsuite

# Import necessary libraries
import nltk                      # Natural Language Toolkit for NLP tasks
import pandas as pd              # Data manipulation and analysis
import sklearn_crfsuite          # CRF suite for sequence modeling
from sklearn.model_selection import train_test_split  # Function to split data into train and test sets
import numpy as np               # Numerical operations on arrays
from sklearn_crfsuite import metrics as crf_metrics  # Metrics for evaluating CRF models

# Download NLTK resources:
# 'punkt' / 'punkt_tab' are required for tokenization of text into sentences or words.
# 'averaged_perceptron_tagger' / 'averaged_perceptron_tagger_eng' are used for part-of-speech (POS) tagging.
# Both the legacy and the newer resource names are requested because recent NLTK releases
# look up the '*_tab' / '*_eng' variants, while older releases use the original names.
for nltk_resource in ('punkt',
                      'punkt_tab',
                      'averaged_perceptron_tagger',
                      'averaged_perceptron_tagger_eng'):
    nltk.download(nltk_resource)

In [None]:
# Load the dataset into a pandas DataFrame from a URL pointing to a compressed .csv file.
# The file is in gzip format, so `compression='gzip'` is used to handle it directly.
# The 'ISO-8859-1' encoding is specified to correctly interpret the character set used in the file.
df = pd.read_csv('https://github.com/dipanjanS/nlp_workshop_dhs18/raw/master/Unit%2008%20-%20Project%206%20-%20Build%20your%20NER%20Tagger/ner_dataset.csv.gz',
                 compression='gzip', encoding='ISO-8859-1')

# Forward-fill missing values: every empty cell takes the last non-missing value above it
# in the same column. This is particularly useful for rows that belong to the same sentence
# but have missing data due to the way the dataset is structured.
# `DataFrame.ffill()` is used instead of `fillna(method='ffill')`, whose `method` argument
# is deprecated in pandas 2.x.
df = df.ffill()

# Display information about the DataFrame, such as the number of entries, columns, and data types.
# This helps in understanding the structure of the dataset and identifying potential issues.
df.info()


In [None]:
# Show the DataFrame as the cell output to preview the rows and columns of the loaded dataset.
df

In [None]:
# Count the distinct values in each of the four dataset columns and show them as one tuple:
# (sentences, words, POS tags, NER tags).
unique_sentences, unique_words, unique_pos_tags, unique_tags = (
    df[column_name].nunique()
    for column_name in ('Sentence #', 'Word', 'POS', 'Tag')
)
(unique_sentences, unique_words, unique_pos_tags, unique_tags)

In [None]:
# Count how many rows (tokens) carry each value of the 'Tag' column and display the
# resulting counts as the cell output.
tag_counts = df['Tag'].value_counts()
tag_counts

**Tag Explanation (Recap)** <br>
`IOB tagging Format:`
*   I- prefix before a tag indicates that the tag is inside a chunk.
*   B- prefix before a tag indicates that the tag is the beginning of a chunk.
*   O tag indicates that a token belongs to no chunk (outside); unlike I- and B-, it is a standalone tag, not a prefix.

`The tags in this dataset:`

*  geo = Geographical Entity
*  org = Organization
*  per = Person
*  gpe = Geopolitical Entity
*  tim = Time indicator
*  art = Artifact
*  eve = Event
*  nat = Natural Phenomenon

In [None]:
# Define a function that extracts features from a word within a sentence
def word2features(sent, i):
    """Build the CRF feature dictionary for the token at position ``i`` of ``sent``.

    Args:
        sent: Sequence of token tuples whose first element is the word and whose
            second element is its POS tag. Any further elements (e.g. the NER
            label) are ignored.
        i: Index of the token to describe.

    Returns:
        dict: Features of the current token (lowercase form, suffixes, casing and
        digit flags, POS tag and its two-character prefix), plus the same kind of
        features for the previous token (keys prefixed with ``-1:``) and the next
        token (keys prefixed with ``+1:``). ``'BOS'`` is set when there is no
        previous token and ``'EOS'`` when there is no next token.
    """
    # Extract the word and part-of-speech (POS) tag at the given index 'i'
    word = sent[i][0]
    postag = sent[i][1]

    # Features describing the current token itself
    features = {
        'bias': 1.0,  # A constant feature to help with learning algorithms
        'word.lower()': word.lower(),  # The word in lowercase
        'word[-3:]': word[-3:],  # Last three characters of the word (for suffix-based features)
        'word[-2:]': word[-2:],  # Last two characters of the word
        'word.isupper()': word.isupper(),  # Check if the word is in uppercase
        'word.istitle()': word.istitle(),  # Check if the word is title-cased (first letter capitalized)
        'word.isdigit()': word.isdigit(),  # Check if the word is numeric
        'postag': postag,  # The POS tag of the word
        'postag[:2]': postag[:2],  # The first two characters of the POS tag (for coarse POS categories)
    }

    # If there is a previous word in the sentence, add features for the previous word
    if i > 0:
        prev_word = sent[i-1][0]
        prev_postag = sent[i-1][1]
        features.update({
            '-1:word.lower()': prev_word.lower(),  # Lowercase form of the previous word
            '-1:word.istitle()': prev_word.istitle(),  # Check if the previous word is title-cased
            '-1:word.isupper()': prev_word.isupper(),  # Check if the previous word is in uppercase
            '-1:postag': prev_postag,  # POS tag of the previous word
            '-1:postag[:2]': prev_postag[:2],  # First two characters of the previous POS tag
        })
    else:
        features['BOS'] = True  # Mark the beginning of the sentence

    # If there is a next word in the sentence, add features for the next word
    if i < len(sent)-1:
        next_word = sent[i+1][0]
        next_postag = sent[i+1][1]
        features.update({
            '+1:word.lower()': next_word.lower(),  # Lowercase form of the next word
            '+1:word.istitle()': next_word.istitle(),  # Check if the next word is title-cased
            '+1:word.isupper()': next_word.isupper(),  # Check if the next word is in uppercase
            '+1:postag': next_postag,  # POS tag of the next word
            '+1:postag[:2]': next_postag[:2],  # First two characters of the next POS tag
        })
    else:
        features['EOS'] = True  # Mark the end of the sentence

    # Return the feature dictionary for the current word
    return features

# Function to convert a sentence into a list of feature dictionaries for each word
def sent2features(sent):
    """Return one feature dictionary per token of ``sent``, in token order.

    Each dictionary is produced by ``word2features(sent, position)``, so the
    neighbouring tokens inside ``sent`` are visible to every position.
    """
    per_token_features = []
    for position in range(len(sent)):
        per_token_features.append(word2features(sent, position))
    return per_token_features

# Function to extract labels from a sentence
# Assumes each element in the sentence is a tuple (word, pos, label)
def sent2labels(sent):
    """Return the label of every token in ``sent``, in token order.

    Every element of ``sent`` must unpack into exactly three values
    ``(word, pos, label)``; only the third value is kept.
    """
    collected = []
    for _word, _pos, tag in sent:
        collected.append(tag)
    return collected

In [None]:
def agg_func(s):
    """Turn one sentence group into a list of (Word, POS, Tag) tuples.

    ``s`` is the sub-frame of a single 'Sentence #' group; its 'Word', 'POS' and
    'Tag' columns are read row by row and zipped together.
    """
    words = s['Word'].values.tolist()
    pos_tags = s['POS'].values.tolist()
    ner_tags = s['Tag'].values.tolist()
    return list(zip(words, pos_tags, ner_tags))


# Group the rows by 'Sentence #' and collapse every group into its list of tuples,
# giving one entry per sentence.
grouped_df = df.groupby('Sentence #').apply(agg_func)

# Materialize the grouped result as a plain Python list of sentences.
sentences = list(grouped_df)

# Show the first sentence as an example: a list of (Word, POS, Tag) tuples.
sentences[0]

In [None]:
# Apply the feature extraction function to a slice of a sample sentence.
# The slice holds the tokens at positions 5 and 6 of the first sentence. Because
# sent2features treats whatever it receives as a complete sentence, the first token of
# the slice is marked 'BOS' and the last one 'EOS', even though both sit mid-sentence.
sent2features(sentences[0][5:7])

[{'bias': 1.0,
  'word.lower()': 'through',
  'word[-3:]': 'ugh',
  'word[-2:]': 'gh',
  'word.isupper()': False,
  'word.istitle()': False,
  'word.isdigit()': False,
  'postag': 'IN',
  'postag[:2]': 'IN',
  'BOS': True,
  '+1:word.lower()': 'london',
  '+1:word.istitle()': True,
  '+1:word.isupper()': False,
  '+1:postag': 'NNP',
  '+1:postag[:2]': 'NN'},
 {'bias': 1.0,
  'word.lower()': 'london',
  'word[-3:]': 'don',
  'word[-2:]': 'on',
  'word.isupper()': False,
  'word.istitle()': True,
  'word.isdigit()': False,
  'postag': 'NNP',
  'postag[:2]': 'NN',
  '-1:word.lower()': 'through',
  '-1:word.istitle()': False,
  '-1:word.isupper()': False,
  '-1:postag': 'IN',
  '-1:postag[:2]': 'IN',
  'EOS': True}]

In [None]:
# Example usage: extract the labels (third element of each tuple) for the tokens at
# positions 0 through 9 of the first sentence.
sent2labels(sentences[0][0:10])

In [None]:
# Feature sequences: one list of per-token feature dictionaries for every sentence.
X = list(map(sent2features, sentences))

# Label sequences: one list of per-token labels for every sentence.
y = list(map(sent2labels, sentences))

# Hold out 25% of the sentences for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Show how many sentences ended up in each partition to confirm the split.
(len(X_train), len(X_test))

In [None]:
# Initialize a CRF model with specific hyperparameters:
# - 'lbfgs': Uses the Limited-memory Broyden–Fletcher–Goldfarb–Shanno (L-BFGS) optimization algorithm,
#   which is commonly used for training large-scale machine learning models.
# - 'c1' and 'c2': Regularization parameters that control L1 and L2 regularization, respectively,
#   helping to prevent overfitting. Here, both are set to 0.1.
# - 'max_iterations': The maximum number of training iterations to run. Here, it is set to 100.
# - 'all_possible_transitions': When True, considers all possible state transitions, which can improve accuracy.
# - 'verbose': Enables verbose output to monitor the training progress and optimization details.
crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                           c1=0.1,
                           c2=0.1,
                           max_iterations=100,
                           all_possible_transitions=True,
                           verbose=True)

# Train the CRF model on the training data.
# The fit method takes the feature sets (X_train) and corresponding labels (y_train) as inputs.
# Without this call the model is untrained and the later `crf.predict` / `crf.classes_`
# cells cannot work.
crf.fit(X_train, y_train)

In [None]:
# Use the trained CRF model to predict labels for the test data (X_test).
# X_test is a list of sentences (each a list of feature dictionaries), so y_pred holds
# one predicted label sequence per test sentence, in the same order.
y_pred = crf.predict(X_test)

# Print the predicted labels for the first test sentence as a quick sanity check;
# compare it with y_test[0], the true labels of the same sentence.
print(y_pred[0])

In [None]:
# Print the list of true labels for the first sentence in the test set.
# Each element is the named entity label of one token; this is the reference against
# which the prediction y_pred[0] can be compared position by position.
print(y_test[0])

In [None]:
# Start from every label (class) the trained CRF model knows about.
labels = list(crf.classes_)

# Drop the 'O' (non-entity) label so the report focuses on entity recognition performance.
# `index` raises ValueError if 'O' is not among the learned classes.
del labels[labels.index('O')]

# Print precision, recall, and F1-score per remaining label.
# flat_classification_report flattens the per-sentence label lists (y_test and y_pred)
# into single lists before computing the metrics.
print(
    crf_metrics.flat_classification_report(y_test, y_pred, labels=labels)
)

# **Build an End-to-End NER Tagger with a Trained NER Model**

In [None]:
# Example text: a multi-sentence, news-style passage used as input for the end-to-end NER tagger below.
text = """Last week, Google announced a partnership with Microsoft to develop a new cloud computing platform. The project, named Project Quantum, aims to integrate artificial intelligence and machine learning into cloud services. According to Sundar Pichai, Google’s CEO, this platform will leverage the capabilities of Microsoft Azure and Google Cloud to provide enhanced data analytics and storage solutions. During a press conference in San Francisco, Satya Nadella, the CEO of Microsoft, mentioned that this collaboration could significantly reduce operational costs for enterprises. The initiative is expected to launch by early 2025, with beta testing available by mid-2024. Additionally, Amazon Web Services (AWS) was mentioned as a key competitor in this space, as they recently released updates to their own AI-powered services, including AWS Lambda and SageMaker. In a related development, cybersecurity firms such as Palo Alto Networks and CrowdStrike have started exploring partnerships with cloud providers to enhance security measures. This trend reflects the growing need for comprehensive solutions that can protect sensitive data from cyber threats, especially with the rise of remote work and increasing dependence on virtual private networks (VPNs)."""
text

In [None]:
# Split the text into individual word tokens, which are necessary for POS tagging and feature extraction.
text_tokens = nltk.word_tokenize(text)

# Apply Part-of-Speech (POS) tagging to the list of word tokens.
# The nltk.pos_tag function returns a list of tuples, where each tuple contains a word and its POS tag.
text_pos = nltk.pos_tag(text_tokens)

# Extract features for each word in the tokenized and POS-tagged text.
# The sent2features function transforms the list of (word, POS) tuples into feature dictionaries for each word.
# The whole text is passed as one sequence, so 'BOS'/'EOS' mark the first/last token of the document.
features = sent2features(text_pos)

In [None]:
# Use the trained CRF model to predict NER tags for the input features.
# `predict` expects a list of sequences, so the single document is wrapped in a list.
labels = crf.predict([features])

# Extract the predicted labels for the entire document (first and only item in 'labels').
# Each label corresponds to a named entity tag for each token in the input text.
doc_labels = labels[0]

# Pair each token with its predicted named entity tag.
# 'text_ner' is a list of tuples, where each tuple contains a word from the text and its predicted tag.
text_ner = list(zip(text_tokens, doc_labels))

# Initialize an empty list to collect named entities and variables to temporarily store entity names and tags.
named_entities = []
temp_entity_name = ''
temp_named_entity = None

# Iterate over each token and its tag to extract named entities.
for term, tag in text_ner:
    # An 'O' tag ends the entity being built; a 'B-' tag also ends it, because 'B-' marks
    # the beginning of a new chunk. Without the 'B-' check, two entities that directly
    # follow each other would be merged into one.
    if temp_named_entity and (tag == 'O' or tag.startswith('B-')):
        named_entities.append(temp_named_entity)
        # Reset temporary variables for the next potential entity.
        temp_entity_name = ''
        temp_named_entity = None

    # If the tag is not 'O' (outside), the token is part of a named entity.
    if tag != 'O':
        # Append the current term to the entity name (useful for multi-word entities).
        temp_entity_name = ' '.join([temp_entity_name, term]).strip()
        # Update the temporary named entity with the current entity name and its tag.
        temp_named_entity = (temp_entity_name, tag)

# Save an entity that runs up to the very last token; no trailing 'O' tag exists to
# trigger the save inside the loop.
if temp_named_entity:
    named_entities.append(temp_named_entity)

# Convert the list of named entities to a DataFrame for easier visualization.
# Each named entity is a row with 'Entity' and 'Tag' columns, showing the entity's text and its predicted tag.
pd.DataFrame(named_entities, columns=['Entity', 'Tag']).T


In [None]:
# Compare the result with the spaCy library: run spaCy's own NER on the same text.
import spacy
from spacy import displacy

# Load the 'en_core_web_sm' pipeline.
# NOTE(review): assumes this model is already installed in the environment — confirm.
nlp = spacy.load('en_core_web_sm')
# Process the same example text that was tagged by the CRF model.
text_nlp = nlp(text)
# Render the entities recognized by spaCy inline in the notebook (style='ent').
displacy.render(text_nlp, style='ent', jupyter=True)