In [1]:
!pip install scikit-learn sklearn-crfsuite gensim numpy pandas tqdm

Collecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-crfsuite>=0.9.7 (from sklearn-crfsuite)
  Downloading python_crfsuite-0.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.3 kB)
Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl (10 kB)
Downloading python_crfsuite-0.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.11 sklearn-crfsuite-0.5.0


In [5]:
# Libraries
import sklearn_crfsuite
from sklearn_crfsuite import metrics
from sklearn.metrics import classification_report
from gensim.models import KeyedVectors
import numpy as np
import platform
import time
import sys
import sklearn
import gensim
from importlib.metadata import version, metadata

In [6]:
# FastText CRF Model for NER Training and Prediction
def check_environment():
    """Print interpreter, platform, and library versions for the run log.

    The sklearn-crfsuite version is read from the installed distribution
    metadata via ``importlib.metadata.version`` (imported in the notebook's
    import cell). The previous ``pkg_resources`` lookup referenced a module
    that this notebook never imports, so it failed on a fresh kernel.
    """
    print("=== Environment Details ===")
    print(f"Python Version: {sys.version}")
    print(f"Platform: {platform.system()} {platform.release()}")
    print(f"scikit-learn Version: {sklearn.__version__}")
    print(f"sklearn-crfsuite version: {version('sklearn-crfsuite')}")
    print(f"Gensim Version: {gensim.__version__}")
    print("===========================")

# Check environment: print interpreter, platform, and library versions before any work starts
check_environment()

# Load FastText embeddings
# NOTE(review): the dataset directory is named "glove-100d" while the file name
# (cc.my.300.vec) looks like 300-dimensional FastText vectors — confirm the dataset contents.
fasttext_file = "/kaggle/input/glove-100d/cc.my.300.vec"  
# Reads the word2vec-format vector file into a KeyedVectors lookup (token -> vector)
fasttext_model = KeyedVectors.load_word2vec_format(fasttext_file)
# len() of the loaded model is reported as the vocabulary size
print(f"Loaded {len(fasttext_model)} words from FastText.")

# Read CoNLL data
def read_conll(file_path):
    """Parse a tab-separated CoNLL file into sentences.

    Every non-blank line must contain exactly three tab-separated columns
    (token, POS tag, NER tag). Blank lines separate sentences.

    Args:
        file_path: Path to a UTF-8 encoded CoNLL file.

    Returns:
        A list of sentences; each sentence is a list of
        ``(token, pos, ner)`` tuples.

    Raises:
        ValueError: If a non-blank line does not have exactly three columns.
            The message names the file and the 1-based line number.
    """
    sentences = []
    sentence = []
    with open(file_path, 'r', encoding='utf-8') as f:
        # Iterate the file lazily instead of loading every line with readlines().
        for line_number, line in enumerate(f, start=1):
            stripped = line.strip()
            if not stripped:
                # Blank line closes the current sentence (consecutive blanks are ignored).
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue
            columns = stripped.split('\t')
            if len(columns) != 3:
                raise ValueError(
                    f"{file_path}:{line_number}: expected 3 tab-separated columns "
                    f"(token, POS, NER), got {len(columns)}: {stripped!r}"
                )
            sentence.append(tuple(columns))

    # The file may end without a trailing blank line.
    if sentence:
        sentences.append(sentence)
    return sentences

def is_numeric(token):
    """Return True when ``token`` is a non-empty string made up only of digits.

    A token counts as numeric if ``str.isdigit`` accepts it or if every
    character is one of the Myanmar digits listed below.

    Args:
        token: The token text to test.

    Returns:
        bool: False for an empty token; otherwise the result of the digit check.
    """
    # all() over an empty iterable is True, so an empty token must be rejected explicitly.
    if not token:
        return False
    numeric_chars = set("၁၂၃၄၅၆၇၈၉၀")
    return token.isdigit() or all(char in numeric_chars for char in token)

# Feature extraction with FastText embeddings
def extract_features(sentence, index):
    """Build the CRF feature dict for one token of a sentence.

    Args:
        sentence: Sequence of ``(token, pos, ner)`` tuples; only the token
            (element 0) of each entry is read.
        index: Position of the target token within ``sentence``.

    Returns:
        dict with the word itself, first/last position flags, prefixes and
        suffixes of up to three characters, the neighbouring words ('' at the
        sentence boundaries), hyphen and numeric flags, and ``fasttext_avg``:
        the mean of the token's FastText vector, or 0.0 when the token is not
        in ``fasttext_model``.
    """
    token = sentence[index][0]
    last_index = len(sentence) - 1
    features = {
        'word': token,
        'is_first': index == 0,
        'is_last': index == last_index,
        # Slices instead of token[0] / token[-1]: an empty token yields ''
        # rather than raising IndexError; non-empty tokens are unaffected.
        'prefix-1': token[:1],
        'prefix-2': token[:2],
        'prefix-3': token[:3],
        'suffix-1': token[-1:],
        'suffix-2': token[-2:],
        'suffix-3': token[-3:],
        'prev_word': '' if index == 0 else sentence[index - 1][0],
        'next_word': '' if index == last_index else sentence[index + 1][0],
        'has_hyphen': '-' in token,
        'is_numeric': is_numeric(token),  # Use the combined numeric check
    }
    if token in fasttext_model:
        # The whole embedding is collapsed to a single scalar feature.
        features['fasttext_avg'] = np.mean(fasttext_model[token])
    else:
        features['fasttext_avg'] = 0.0  # Default value for out-of-vocabulary tokens
    return features

def prepare_data(conll_data):
    """Convert parsed CoNLL sentences into CRF feature and NER label sequences.

    Args:
        conll_data: Iterable of sentences, each a list of
            ``(token, pos, ner)`` tuples.

    Returns:
        ``(X, y_ner)``: per-sentence lists of feature dicts and the matching
        per-sentence lists of NER tags (element 2 of every tuple).
    """
    X, y_ner = [], []
    for sent in conll_data:
        sent_features = []
        sent_tags = []
        for position, row in enumerate(sent):
            sent_features.append(extract_features(sent, position))
            sent_tags.append(row[2])  # NER label
        X.append(sent_features)
        y_ner.append(sent_tags)
    return X, y_ner

# Locations of the train / validation / test splits
train_file_path = "/kaggle/input/split-fix-data/train_v5.conll"
val_file_path = "/kaggle/input/split-fix-data/val_v5.conll"
test_file_path = "/kaggle/input/split-fix-data/test_v5.conll"

# Parse the three splits in order: train, validation, test
train_data, val_data, test_data = (
    read_conll(split_path)
    for split_path in (train_file_path, val_file_path, test_file_path)
)

# Turn each split into per-sentence feature dicts and NER tag sequences
(X_train, y_train_ner), (X_val, y_val_ner), (X_test, y_test_ner) = (
    prepare_data(split_sentences)
    for split_sentences in (train_data, val_data, test_data)
)

# L-BFGS trained CRF with elastic-net style regularisation (c1 = L1, c2 = L2)
crf_ner = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)

# Wall-clock timing around the fit call
start_time = time.time()
print("Training CRF model for NER...")
crf_ner.fit(X_train, y_train_ner)
training_time = time.time() - start_time
print(f"Training completed in {training_time:.2f} seconds.")

# Weighted token-level F1 on the validation split
val_f1_ner = metrics.flat_f1_score(y_val_ner, crf_ner.predict(X_val), average='weighted')
print("NER Validation F1-score:", val_f1_ner)

# Predictions for the held-out test split
y_test_pred_ner = crf_ner.predict(X_test)

# classification_report expects flat label lists, so collapse the sentence structure
y_test_ner_flat = [gold_tag for gold_tags in y_test_ner for gold_tag in gold_tags]
y_test_pred_ner_flat = [pred_tag for pred_tags in y_test_pred_ner for pred_tag in pred_tags]

# Per-label precision / recall / F1 on the test split
print("\nNER Classification Report:")
print(classification_report(y_test_ner_flat, y_test_pred_ner_flat, zero_division=0, digits=4))

=== Environment Details ===
Python Version: 3.10.16 (main, Dec 25 2024, 01:31:21) [GCC 12.2.0]
Platform: Linux 6.1.42+
scikit-learn Version: 1.6.1
sklearn-crfsuite version: 0.5.0
Gensim Version: 4.3.3
Loaded 335230 words from FastText.
Training CRF model for NER...
Training completed in 52.29 seconds.
NER Validation F1-score: 0.9776269694795953

NER Classification Report:
              precision    recall  f1-score   support

      B-DATE     0.8833    0.8030    0.8413        66
       B-LOC     0.9837    0.9695    0.9766      1182
       B-NUM     0.5000    0.3333    0.4000        15
       B-ORG     0.7179    0.5833    0.6437        48
       B-PER     0.9375    0.8824    0.9091        34
      B-TIME     0.8750    0.7778    0.8235         9
      E-DATE     0.8710    0.8182    0.8438        66
       E-LOC     0.9687    0.9687    0.9687      1182
       E-NUM     0.5000    0.3333    0.4000        15
       E-ORG     0.7436    0.6042    0.6667        48
       E-PER     0.9375    0.8

In [7]:
# Base CRF Model for NER Training and Prediction
def check_environment():
    """Print interpreter, platform, and library versions for the run log.

    The sklearn-crfsuite version is read from the installed distribution
    metadata via ``importlib.metadata.version`` (imported in the notebook's
    import cell). The previous ``pkg_resources`` lookup referenced a module
    that this notebook never imports, so it failed on a fresh kernel.
    """
    print("=== Environment Details ===")
    print(f"Python Version: {sys.version}")
    print(f"Platform: {platform.system()} {platform.release()}")
    print(f"scikit-learn Version: {sklearn.__version__}")
    print(f"sklearn-crfsuite version: {version('sklearn-crfsuite')}")
    print("===========================")

# Check environment: print interpreter, platform, and library versions before training
check_environment()

# Read CoNLL data
def read_conll(file_path):
    """Parse a tab-separated CoNLL file into sentences.

    Every non-blank line must contain exactly three tab-separated columns
    (token, POS tag, NER tag). Blank lines separate sentences.

    Args:
        file_path: Path to a UTF-8 encoded CoNLL file.

    Returns:
        A list of sentences; each sentence is a list of
        ``(token, pos, ner)`` tuples.

    Raises:
        ValueError: If a non-blank line does not have exactly three columns.
            The message names the file and the 1-based line number.
    """
    sentences = []
    sentence = []
    with open(file_path, 'r', encoding='utf-8') as f:
        # Iterate the file lazily instead of loading every line with readlines().
        for line_number, line in enumerate(f, start=1):
            stripped = line.strip()
            if not stripped:
                # Blank line closes the current sentence (consecutive blanks are ignored).
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue
            columns = stripped.split('\t')
            if len(columns) != 3:
                raise ValueError(
                    f"{file_path}:{line_number}: expected 3 tab-separated columns "
                    f"(token, POS, NER), got {len(columns)}: {stripped!r}"
                )
            sentence.append(tuple(columns))

    # The file may end without a trailing blank line.
    if sentence:
        sentences.append(sentence)
    return sentences

def is_numeric(token):
    """Return True when ``token`` is a non-empty string made up only of digits.

    A token counts as numeric if ``str.isdigit`` accepts it or if every
    character is one of the Myanmar digits listed below.

    Args:
        token: The token text to test.

    Returns:
        bool: False for an empty token; otherwise the result of the digit check.
    """
    # all() over an empty iterable is True, so an empty token must be rejected explicitly.
    if not token:
        return False
    numeric_chars = set("၁၂၃၄၅၆၇၈၉၀")
    return token.isdigit() or all(char in numeric_chars for char in token)

# Feature extraction without FastText embeddings
def extract_features(sentence, index):
    """Build the CRF feature dict for one token of a sentence.

    Args:
        sentence: Sequence of ``(token, pos, ner)`` tuples; only the token
            (element 0) of each entry is read.
        index: Position of the target token within ``sentence``.

    Returns:
        dict with the word itself, first/last position flags, prefixes and
        suffixes of up to three characters, the neighbouring words ('' at the
        sentence boundaries), and hyphen and numeric flags.
    """
    token = sentence[index][0]
    last_index = len(sentence) - 1
    return {
        'word': token,
        'is_first': index == 0,
        'is_last': index == last_index,
        # Slices instead of token[0] / token[-1]: an empty token yields ''
        # rather than raising IndexError; non-empty tokens are unaffected.
        'prefix-1': token[:1],
        'prefix-2': token[:2],
        'prefix-3': token[:3],
        'suffix-1': token[-1:],
        'suffix-2': token[-2:],
        'suffix-3': token[-3:],
        'prev_word': '' if index == 0 else sentence[index - 1][0],
        'next_word': '' if index == last_index else sentence[index + 1][0],
        'has_hyphen': '-' in token,
        'is_numeric': is_numeric(token),  # Use the combined numeric check
    }

def prepare_data(conll_data):
    """Convert parsed CoNLL sentences into CRF feature and NER label sequences.

    Args:
        conll_data: Iterable of sentences, each a list of
            ``(token, pos, ner)`` tuples.

    Returns:
        ``(X, y_ner)``: per-sentence lists of feature dicts and the matching
        per-sentence lists of NER tags (element 2 of every tuple).
    """
    X, y_ner = [], []
    for sent in conll_data:
        sent_features = []
        sent_tags = []
        for position, row in enumerate(sent):
            sent_features.append(extract_features(sent, position))
            sent_tags.append(row[2])  # NER label
        X.append(sent_features)
        y_ner.append(sent_tags)
    return X, y_ner

# Locations of the train / validation / test splits
train_file_path = "/kaggle/input/split-fix-data/train_v5.conll"
val_file_path = "/kaggle/input/split-fix-data/val_v5.conll"
test_file_path = "/kaggle/input/split-fix-data/test_v5.conll"

# Parse the three splits in order: train, validation, test
train_data, val_data, test_data = (
    read_conll(split_path)
    for split_path in (train_file_path, val_file_path, test_file_path)
)

# Turn each split into per-sentence feature dicts and NER tag sequences
(X_train, y_train_ner), (X_val, y_val_ner), (X_test, y_test_ner) = (
    prepare_data(split_sentences)
    for split_sentences in (train_data, val_data, test_data)
)

# L-BFGS trained CRF with elastic-net style regularisation (c1 = L1, c2 = L2)
crf_ner = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)

# Wall-clock timing around the fit call
start_time = time.time()
print("Training CRF model for NER...")
crf_ner.fit(X_train, y_train_ner)
training_time = time.time() - start_time
print(f"Training completed in {training_time:.2f} seconds.")

# Weighted token-level F1 on the validation split
val_f1_ner = metrics.flat_f1_score(y_val_ner, crf_ner.predict(X_val), average='weighted')
print("NER Validation F1-score:", val_f1_ner)

# Predictions for the held-out test split
y_test_pred_ner = crf_ner.predict(X_test)

# classification_report expects flat label lists, so collapse the sentence structure
y_test_ner_flat = [gold_tag for gold_tags in y_test_ner for gold_tag in gold_tags]
y_test_pred_ner_flat = [pred_tag for pred_tags in y_test_pred_ner for pred_tag in pred_tags]

# Per-label precision / recall / F1 on the test split
print("\nNER Classification Report:")
print(classification_report(y_test_ner_flat, y_test_pred_ner_flat, zero_division=0, digits=4))


=== Environment Details ===
Python Version: 3.10.16 (main, Dec 25 2024, 01:31:21) [GCC 12.2.0]
Platform: Linux 6.1.42+
scikit-learn Version: 1.6.1
sklearn-crfsuite version: 0.5.0
Training CRF model for NER...
Training completed in 51.00 seconds.
NER Validation F1-score: 0.9781392852758948

NER Classification Report:
              precision    recall  f1-score   support

      B-DATE     0.8667    0.7879    0.8254        66
       B-LOC     0.9837    0.9712    0.9774      1182
       B-NUM     0.5000    0.3333    0.4000        15
       B-ORG     0.6667    0.5833    0.6222        48
       B-PER     0.9394    0.9118    0.9254        34
      B-TIME     0.8750    0.7778    0.8235         9
      E-DATE     0.8710    0.8182    0.8438        66
       E-LOC     0.9688    0.9704    0.9696      1182
       E-NUM     0.5000    0.3333    0.4000        15
       E-ORG     0.6905    0.6042    0.6444        48
       E-PER     0.9394    0.9118    0.9254        34
      E-TIME     0.8750    0.7778

In [9]:
# FastText CRF Model for Joint POS and NER Training and Prediction

def check_environment():
    """Print interpreter, platform, and library versions for the run log.

    The sklearn-crfsuite version is read from the installed distribution
    metadata via ``importlib.metadata.version`` (imported in the notebook's
    import cell). The previous ``pkg_resources`` lookup referenced a module
    that this notebook never imports, so it failed on a fresh kernel.
    """
    print("=== Environment Details ===")
    print(f"Python Version: {sys.version}")
    print(f"Platform: {platform.system()} {platform.release()}")
    print(f"scikit-learn Version: {sklearn.__version__}")
    print(f"sklearn-crfsuite version: {version('sklearn-crfsuite')}")
    print("===========================")

# Check environment: print interpreter, platform, and library versions before any work starts
check_environment()

# Load FastText embeddings
# NOTE(review): the dataset directory is named "glove-100d" while the file name
# (cc.my.300.vec) looks like 300-dimensional FastText vectors — confirm the dataset contents.
fasttext_file = "/kaggle/input/glove-100d/cc.my.300.vec"  
# Reads the word2vec-format vector file into a KeyedVectors lookup (token -> vector)
fasttext_model = KeyedVectors.load_word2vec_format(fasttext_file)
# len() of the loaded model is reported as the vocabulary size
print(f"Loaded {len(fasttext_model)} words from FastText.")

def read_conll(file_path):
    """Parse a tab-separated CoNLL file into sentences.

    Every non-blank line must contain exactly three tab-separated columns
    (token, POS tag, NER tag). Blank lines separate sentences.

    Args:
        file_path: Path to a UTF-8 encoded CoNLL file.

    Returns:
        A list of sentences; each sentence is a list of
        ``(token, pos, ner)`` tuples.

    Raises:
        ValueError: If a non-blank line does not have exactly three columns.
            The message names the file and the 1-based line number.
    """
    sentences = []
    sentence = []
    with open(file_path, 'r', encoding='utf-8') as f:
        # Iterate the file lazily instead of loading every line with readlines().
        for line_number, line in enumerate(f, start=1):
            stripped = line.strip()
            if not stripped:
                # Blank line closes the current sentence (consecutive blanks are ignored).
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue
            columns = stripped.split('\t')
            if len(columns) != 3:
                raise ValueError(
                    f"{file_path}:{line_number}: expected 3 tab-separated columns "
                    f"(token, POS, NER), got {len(columns)}: {stripped!r}"
                )
            sentence.append(tuple(columns))

    # The file may end without a trailing blank line.
    if sentence:
        sentences.append(sentence)
    return sentences

def is_numeric(token):
    """Return True when ``token`` is a non-empty string made up only of digits.

    A token counts as numeric if ``str.isdigit`` accepts it or if every
    character is one of the Myanmar digits listed below.

    Args:
        token: The token text to test.

    Returns:
        bool: False for an empty token; otherwise the result of the digit check.
    """
    # all() over an empty iterable is True, so an empty token must be rejected explicitly.
    if not token:
        return False
    numeric_chars = set("၁၂၃၄၅၆၇၈၉၀")
    return token.isdigit() or all(char in numeric_chars for char in token)

# Feature extraction with FastText embeddings
def extract_features(sentence, index):
    """Build the CRF feature dict for one token of a sentence.

    Args:
        sentence: Sequence of ``(token, pos, ner)`` tuples; only the token
            (element 0) of each entry is read.
        index: Position of the target token within ``sentence``.

    Returns:
        dict with the word itself, first/last position flags, prefixes and
        suffixes of up to three characters, the neighbouring words ('' at the
        sentence boundaries), hyphen and numeric flags, and ``fasttext_avg``:
        the mean of the token's FastText vector, or 0.0 when the token is not
        in ``fasttext_model``.
    """
    token = sentence[index][0]
    last_index = len(sentence) - 1
    features = {
        'word': token,
        'is_first': index == 0,
        'is_last': index == last_index,
        # Slices instead of token[0] / token[-1]: an empty token yields ''
        # rather than raising IndexError; non-empty tokens are unaffected.
        'prefix-1': token[:1],
        'prefix-2': token[:2],
        'prefix-3': token[:3],
        'suffix-1': token[-1:],
        'suffix-2': token[-2:],
        'suffix-3': token[-3:],
        'prev_word': '' if index == 0 else sentence[index - 1][0],
        'next_word': '' if index == last_index else sentence[index + 1][0],
        'has_hyphen': '-' in token,
        'is_numeric': is_numeric(token),
    }
    if token in fasttext_model:
        # The whole embedding is collapsed to a single scalar feature.
        features['fasttext_avg'] = np.mean(fasttext_model[token])
    else:
        features['fasttext_avg'] = 0.0  # Default value if token is not in FastText
    return features

# Combine POS and NER labels into a single label
def combine_labels(pos_labels, ner_labels):
    """Join POS and NER tags position by position into ``"<pos>_<ner>"`` strings.

    Args:
        pos_labels: Sequence of POS tags.
        ner_labels: Sequence of NER tags, aligned with ``pos_labels``.

    Returns:
        A list of combined labels; pairing stops at the shorter input.
    """
    return [f"{pos_tag}_{ner_tag}" for pos_tag, ner_tag in zip(pos_labels, ner_labels)]

def prepare_data(conll_data):
    """Convert parsed CoNLL sentences into CRF features and joint POS+NER labels.

    Args:
        conll_data: Iterable of sentences, each a list of
            ``(token, pos, ner)`` tuples.

    Returns:
        ``(X, y_joint)``: per-sentence lists of feature dicts and the matching
        per-sentence lists of combined labels built by ``combine_labels``.
    """
    X, y_joint = [], []
    for sent in conll_data:
        sent_features = [extract_features(sent, position) for position in range(len(sent))]
        pos_tags = [row[1] for row in sent]  # POS labels
        ner_tags = [row[2] for row in sent]  # NER labels
        X.append(sent_features)
        y_joint.append(combine_labels(pos_tags, ner_tags))  # Combined labels
    return X, y_joint

# Locations of the train / validation / test splits
train_file_path = "/kaggle/input/split-fix-data/train_v5.conll"
val_file_path = "/kaggle/input/split-fix-data/val_v5.conll"
test_file_path = "/kaggle/input/split-fix-data/test_v5.conll"

train_data = read_conll(train_file_path)
val_data = read_conll(val_file_path)
test_data = read_conll(test_file_path)

# Features plus combined "<pos>_<ner>" targets for each split
X_train, y_train_joint = prepare_data(train_data)
X_val, y_val_joint = prepare_data(val_data)
X_test, y_test_joint = prepare_data(test_data)

# Joint CRF model for POS and NER
crf_joint = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,  # L1 regularization
    c2=0.1,  # L2 regularization
    max_iterations=100,
    all_possible_transitions=True
)

# Measure training time, as the other training cells in this notebook do,
# so the FastText and base joint models can be compared on cost as well.
start_time = time.time()
print("Training joint CRF model...")
crf_joint.fit(X_train, y_train_joint)
training_time = time.time() - start_time
print(f"Training completed in {training_time:.2f} seconds.")

# Validate joint model (weighted F1 over the combined labels)
y_val_pred_joint = crf_joint.predict(X_val)
print("Joint Validation F1-score:", metrics.flat_f1_score(y_val_joint, y_val_pred_joint, average='weighted'))

# Test joint model
y_test_pred_joint = crf_joint.predict(X_test)

def split_labels(combined_labels):
    """Split ``"<pos>_<ner>"`` labels back into parallel POS and NER lists.

    Only the first underscore is treated as the separator, so an NER tag that
    itself contains underscores is kept intact instead of breaking the
    two-way unpacking. POS tags are assumed to be underscore-free.

    Args:
        combined_labels: Iterable of combined label strings.

    Returns:
        ``(pos_labels, ner_labels)``: two lists aligned with the input.

    Raises:
        ValueError: If a label contains no underscore.
    """
    pos_labels = []
    ner_labels = []
    for label in combined_labels:
        pos, ner = label.split("_", 1)
        pos_labels.append(pos)
        ner_labels.append(ner)
    return pos_labels, ner_labels

# Flatten the true and predicted labels for the classification reports
y_test_joint_flat = [label for sent in y_test_joint for label in sent]
y_test_pred_joint_flat = [label for sent in y_test_pred_joint for label in sent]

# Split combined labels into separate POS and NER sequences
y_test_pos_flat, y_test_ner_flat = split_labels(y_test_joint_flat)
y_test_pred_pos_flat, y_test_pred_ner_flat = split_labels(y_test_pred_joint_flat)

print("\nPOS Classification Report:")
print(classification_report(y_test_pos_flat, y_test_pred_pos_flat, zero_division=0,digits=4))

print("\nNER Classification Report:")
print(classification_report(y_test_ner_flat, y_test_pred_ner_flat, zero_division=0,digits=4))

# Print up to 5 predicted sentences from the test set. The count is bounded by
# the test-set size so a small test file does not raise IndexError.
n_preview = min(5, len(test_data))
print(f"\n=== {n_preview} Predicted Sentences from Test Set ===")
for i in range(n_preview):
    print(f"\nSentence {i + 1}:")
    print("Token\t\tTrue POS\tPredicted POS\tTrue NER\tPredicted NER")
    print("-" * 60)
    for token, true_joint, pred_joint in zip(test_data[i], y_test_joint[i], y_test_pred_joint[i]):
        # Split on the first underscore only, so tags containing "_" still unpack into two parts.
        true_pos, true_ner = true_joint.split("_", 1)
        pred_pos, pred_ner = pred_joint.split("_", 1)
        print(f"{token[0]}\t\t{true_pos}\t\t{pred_pos}\t\t{true_ner}\t\t{pred_ner}")

=== Environment Details ===
Python Version: 3.10.16 (main, Dec 25 2024, 01:31:21) [GCC 12.2.0]
Platform: Linux 6.1.42+
scikit-learn Version: 1.6.1
sklearn-crfsuite version: 0.5.0
Loaded 335230 words from FastText.
Training joint CRF model...
Joint Validation F1-score: 0.9576719339780975

POS Classification Report:
              precision    recall  f1-score   support

         abb     1.0000    0.9444    0.9714        18
         adj     0.8875    0.8735    0.8804       569
         adv     0.9470    0.8034    0.8693       356
        conj     0.9462    0.9513    0.9487       739
          fw     0.9655    0.8615    0.9106        65
         int     0.9412    0.9412    0.9412        17
           n     0.9783    0.9840    0.9811      7694
         num     0.9984    0.9984    0.9984       641
        part     0.9781    0.9827    0.9804      4461
         ppm     0.9934    0.9947    0.9940      4114
        pron     0.9699    0.9657    0.9678       467
        punc     1.0000    1.0000  

In [10]:
# Base CRF Model for Joint POS and NER Training and Prediction
def check_environment():
    """Print interpreter, platform, and library versions for the run log.

    The sklearn-crfsuite version is read from the installed distribution
    metadata via ``importlib.metadata.version`` (imported in the notebook's
    import cell). The previous ``pkg_resources`` lookup referenced a module
    that this notebook never imports, so it failed on a fresh kernel.
    """
    print("=== Environment Details ===")
    print(f"Python Version: {sys.version}")
    print(f"Platform: {platform.system()} {platform.release()}")
    print(f"scikit-learn Version: {sklearn.__version__}")
    print(f"sklearn-crfsuite version: {version('sklearn-crfsuite')}")
    print("===========================")

# Check environment: print interpreter, platform, and library versions before training
check_environment()

# Read CoNLL data
def read_conll(file_path):
    """Parse a tab-separated CoNLL file into sentences.

    Every non-blank line must contain exactly three tab-separated columns
    (token, POS tag, NER tag). Blank lines separate sentences.

    Args:
        file_path: Path to a UTF-8 encoded CoNLL file.

    Returns:
        A list of sentences; each sentence is a list of
        ``(token, pos, ner)`` tuples.

    Raises:
        ValueError: If a non-blank line does not have exactly three columns.
            The message names the file and the 1-based line number.
    """
    sentences = []
    sentence = []
    with open(file_path, 'r', encoding='utf-8') as f:
        # Iterate the file lazily instead of loading every line with readlines().
        for line_number, line in enumerate(f, start=1):
            stripped = line.strip()
            if not stripped:
                # Blank line closes the current sentence (consecutive blanks are ignored).
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue
            columns = stripped.split('\t')
            if len(columns) != 3:
                raise ValueError(
                    f"{file_path}:{line_number}: expected 3 tab-separated columns "
                    f"(token, POS, NER), got {len(columns)}: {stripped!r}"
                )
            sentence.append(tuple(columns))

    # The file may end without a trailing blank line.
    if sentence:
        sentences.append(sentence)
    return sentences

def is_numeric(token):
    """Return True when ``token`` is a non-empty string made up only of digits.

    A token counts as numeric if ``str.isdigit`` accepts it or if every
    character is one of the Myanmar digits listed below.

    Args:
        token: The token text to test.

    Returns:
        bool: False for an empty token; otherwise the result of the digit check.
    """
    # all() over an empty iterable is True, so an empty token must be rejected explicitly.
    if not token:
        return False
    numeric_chars = set("၁၂၃၄၅၆၇၈၉၀")
    return token.isdigit() or all(char in numeric_chars for char in token)

# Feature extraction for joint POS and NER
def extract_features(sentence, index):
    """Build the CRF feature dict for one token of a sentence.

    Args:
        sentence: Sequence of ``(token, pos, ner)`` tuples; only the token
            (element 0) of each entry is read.
        index: Position of the target token within ``sentence``.

    Returns:
        dict with the word itself, first/last position flags, prefixes and
        suffixes of up to three characters, the neighbouring words ('' at the
        sentence boundaries), and hyphen and numeric flags.
    """
    token = sentence[index][0]
    last_index = len(sentence) - 1
    return {
        'word': token,
        'is_first': index == 0,
        'is_last': index == last_index,
        # Slices instead of token[0] / token[-1]: an empty token yields ''
        # rather than raising IndexError; non-empty tokens are unaffected.
        'prefix-1': token[:1],
        'prefix-2': token[:2],
        'prefix-3': token[:3],
        'suffix-1': token[-1:],
        'suffix-2': token[-2:],
        'suffix-3': token[-3:],
        'prev_word': '' if index == 0 else sentence[index - 1][0],
        'next_word': '' if index == last_index else sentence[index + 1][0],
        'has_hyphen': '-' in token,
        'is_numeric': is_numeric(token),  # Use the combined numeric check
    }

# Combine POS and NER labels into a single label
def combine_labels(pos_labels, ner_labels):
    """Join POS and NER tags position by position into ``"<pos>_<ner>"`` strings.

    Args:
        pos_labels: Sequence of POS tags.
        ner_labels: Sequence of NER tags, aligned with ``pos_labels``.

    Returns:
        A list of combined labels; pairing stops at the shorter input.
    """
    # Underscore is the separator between the POS part and the NER part
    return [f"{pos_tag}_{ner_tag}" for pos_tag, ner_tag in zip(pos_labels, ner_labels)]

def prepare_data(conll_data):
    """Convert parsed CoNLL sentences into CRF features and joint POS+NER labels.

    Args:
        conll_data: Iterable of sentences, each a list of
            ``(token, pos, ner)`` tuples.

    Returns:
        ``(X, y_joint)``: per-sentence lists of feature dicts and the matching
        per-sentence lists of combined labels built by ``combine_labels``.
    """
    X, y_joint = [], []
    for sent in conll_data:
        sent_features = [extract_features(sent, position) for position in range(len(sent))]
        pos_tags = [row[1] for row in sent]  # POS labels
        ner_tags = [row[2] for row in sent]  # NER labels
        X.append(sent_features)
        y_joint.append(combine_labels(pos_tags, ner_tags))  # Combined labels
    return X, y_joint

# Locations of the train / validation / test splits
train_file_path = "/kaggle/input/split-fix-data/train_v5.conll"
val_file_path = "/kaggle/input/split-fix-data/val_v5.conll"
test_file_path = "/kaggle/input/split-fix-data/test_v5.conll"

# Parse the three splits in order: train, validation, test
train_data, val_data, test_data = (
    read_conll(split_path)
    for split_path in (train_file_path, val_file_path, test_file_path)
)

# Features plus combined "<pos>_<ner>" targets for each split
(X_train, y_train_joint), (X_val, y_val_joint), (X_test, y_test_joint) = (
    prepare_data(split_sentences)
    for split_sentences in (train_data, val_data, test_data)
)

# Single CRF that predicts the combined POS+NER label (c1 = L1, c2 = L2 regularisation)
crf_joint = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
)

# Wall-clock timing around the fit call
start_time = time.time()
print("Training joint CRF model for POS and NER...")
crf_joint.fit(X_train, y_train_joint)
training_time = time.time() - start_time
print(f"Training completed in {training_time:.2f} seconds.")

# Weighted F1 over the combined labels on the validation split
y_val_pred_joint = crf_joint.predict(X_val)
val_f1_joint = metrics.flat_f1_score(y_val_joint, y_val_pred_joint, average='weighted')
print("Joint Validation F1-score:", val_f1_joint)

# Predictions for the held-out test split
y_test_pred_joint = crf_joint.predict(X_test)

# Split combined labels back into POS and NER for evaluation
def split_labels(combined_labels):
    """Split ``"<pos>_<ner>"`` labels back into parallel POS and NER lists.

    Only the first underscore is treated as the separator, so an NER tag that
    itself contains underscores is kept intact instead of breaking the
    two-way unpacking. POS tags are assumed to be underscore-free.

    Args:
        combined_labels: Iterable of combined label strings.

    Returns:
        ``(pos_labels, ner_labels)``: two lists aligned with the input.

    Raises:
        ValueError: If a label contains no underscore.
    """
    pos_labels = []
    ner_labels = []
    for label in combined_labels:
        pos, ner = label.split("_", 1)  # Split combined label into POS and NER
        pos_labels.append(pos)
        ner_labels.append(ner)
    return pos_labels, ner_labels

# Collapse the per-sentence label lists into one flat sequence each
y_test_joint_flat = [gold for gold_sent in y_test_joint for gold in gold_sent]
y_test_pred_joint_flat = [pred for pred_sent in y_test_pred_joint for pred in pred_sent]

# Recover separate POS and NER sequences from the combined labels
y_test_pos_flat, y_test_ner_flat = split_labels(y_test_joint_flat)
y_test_pred_pos_flat, y_test_pred_ner_flat = split_labels(y_test_pred_joint_flat)

# Report POS first, then NER, with identical formatting options
for report_title, gold_flat, pred_flat in (
    ("\nPOS Classification Report:", y_test_pos_flat, y_test_pred_pos_flat),
    ("\nNER Classification Report:", y_test_ner_flat, y_test_pred_ner_flat),
):
    print(report_title)
    print(classification_report(gold_flat, pred_flat, zero_division=0, digits=4))

=== Environment Details ===
Python Version: 3.10.16 (main, Dec 25 2024, 01:31:21) [GCC 12.2.0]
Platform: Linux 6.1.42+
scikit-learn Version: 1.6.1
sklearn-crfsuite version: 0.5.0
Training joint CRF model for POS and NER...
Training completed in 1207.79 seconds.
Joint Validation F1-score: 0.9579652288656015

POS Classification Report:
              precision    recall  f1-score   support

         abb     1.0000    0.9444    0.9714        18
         adj     0.8887    0.8699    0.8792       569
         adv     0.9349    0.8062    0.8658       356
        conj     0.9474    0.9499    0.9486       739
          fw     0.9655    0.8615    0.9106        65
         int     0.9412    0.9412    0.9412        17
           n     0.9784    0.9839    0.9811      7694
         num     0.9984    0.9984    0.9984       641
        part     0.9773    0.9827    0.9800      4461
         ppm     0.9934    0.9947    0.9940      4114
        pron     0.9678    0.9657    0.9668       467
        punc   