# CRF Tagger

## Read Data

In [None]:
def read_file(f):
    """Read a tab-separated data file into row ids and token sequences.

    The first line of the file is treated as a header and skipped. For every
    remaining line, the text before the first tab is the row id and the text
    between the first and second tab is split on single spaces.

    Args:
        f: Path of the file to read.

    Returns:
        A ``(row_id, data)`` tuple of equally long lists: ``row_id`` holds the
        stripped id of each line and ``data`` holds the matching list of
        space-separated items.

    Raises:
        IndexError: If a non-header line contains no tab character.
    """
    # The context manager closes the handle even when parsing fails below.
    with open(f, 'r') as handle:
        lines = handle.readlines()[1:]

    row_id = []
    data = []
    for line in lines:
        # Split once and reuse the fields for both outputs.
        fields = line.split('\t')
        row_id.append(fields[0].strip())
        data.append(fields[1].strip().split(' '))
    return row_id, data

In [None]:
!unzip review_data.zip

In [None]:
# Load the review tokens and their label sequences; read_file returns
# (row ids, list of space-split sequences) for each file.
row_id_text, texts = read_file('./review_data/REVIEW_TEXT.txt')
row_id_tags, tags = read_file('./review_data/REVIEW_LABELSEQ.txt')

In [None]:
# Position of the review to preview below.
index = 5
print('num of data', len(row_id_text))
# Both files must describe the same number of rows.
assert len(row_id_text) == len(row_id_tags)

# Show the selected review as a two-column Token / Tag listing.
print('-'*89)
print('Token\tTag')
for idx, token in enumerate(texts[index]):
    print(token, '\t', tags[index][idx])

num of data 4744
-----------------------------------------------------------------------------------------
Token	Tag
I 	 O
had 	 O
terrible 	 B-AE
anxiety 	 I-AE
the 	 I-AE
whole 	 I-AE
time 	 I-AE
, 	 O
the 	 B-AE
worst 	 I-AE
kind 	 I-AE
of 	 I-AE
anxiety 	 I-AE
I've 	 O
ever 	 O
experienced. 	 O


## Input Features

In [None]:
import nltk
nltk.download('averaged_perceptron_tagger')

# Memo of single-word POS tags, keyed by the exact word string.
_POSTAG_CACHE = {}


def _isolated_postag(word):
    """Return the POS tag NLTK assigns to `word` when tagged on its own."""
    try:
        return _POSTAG_CACHE[word]
    except KeyError:
        tag = nltk.pos_tag([word])[-1][-1]
        _POSTAG_CACHE[word] = tag
        return tag


def word2features(doc, i):
    """Build the CRF feature list for the token at position `i` of `doc`.

    Features describe the token itself plus its immediate left and right
    neighbours; the markers 'BOS' / 'EOS' replace the neighbour features at
    the first / last position.

    Args:
        doc: Sequence of token strings forming one document.
        i: Index of the token to describe.

    Returns:
        A list of feature strings.

    Note:
        Each token is POS-tagged in isolation (a one-element list is passed to
        the tagger), so the tag depends only on the word string. That lets the
        tag be memoised per distinct word instead of being recomputed for the
        current, previous and next position of every token.
        NOTE(review): tagging the whole document at once would give the tagger
        sentence context, but it would change the feature values — left as is.
    """
    word = doc[i]
    postag = _isolated_postag(word)

    # Common features for all words
    features = [
        'bias',
        'word.lower=' + word.lower(),       # The lower-cased word itself
        'word[-3:]=' + word[-3:],           # Word suffix (last three characters)
        'word[-2:]=' + word[-2:],           # Word suffix (last two characters)
        'word.isupper=%s' % word.isupper(), # Whether the word is in uppercase
        'word.istitle=%s' % word.istitle(), # Whether the word is capitalized (title case)
        'word.isdigit=%s' % word.isdigit(), # Whether the word consists only of digits
        'postag=' + postag                  # POS tag of the word
    ]

    # Features for words that are not
    # at the beginning of a document
    if i > 0:
        word1 = doc[i-1]
        postag1 = _isolated_postag(word1)
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
            '-1:word.isdigit=%s' % word1.isdigit(),
            '-1:postag=' + postag1
        ])
    else:
        # Indicate that it is the 'beginning of a document'
        features.append('BOS')

    # Features for words that are not
    # at the end of a document
    if i < len(doc)-1:
        word1 = doc[i+1]
        postag1 = _isolated_postag(word1)
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
            '+1:word.isdigit=%s' % word1.isdigit(),
            '+1:postag=' + postag1
        ])
    else:
        # Indicate that it is the 'end of a document'
        features.append('EOS')

    return features

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
from sklearn.model_selection import train_test_split

def extract_features(text):
    """Return one feature list per token of the tokenised document `text`.

    Each position of `text` is passed to `word2features`, and the results are
    collected in token order.
    """
    per_token_features = []
    for position in range(len(text)):
        per_token_features.append(word2features(text, position))
    return per_token_features

# One list of per-token feature lists per document; labels are the tag sequences as loaded.
X = list(map(extract_features, texts))
y = tags

## Create Train and Validation Sets

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the documents for validation; the fixed seed keeps the split repeatable.
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## CRF Model

In [None]:
!pip install sklearn_crfsuite

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sklearn_crfsuite
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Collecting python-crfsuite>=0.8.3
  Downloading python_crfsuite-0.9.9-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: python-crfsuite, sklearn_crfsuite
Successfully installed python-crfsuite-0.9.9 sklearn_crfsuite-0.3.6


In [None]:
# https://sklearn-crfsuite.readthedocs.io/en/latest/
from sklearn_crfsuite import CRF

# Configure the CRF sequence tagger; it is fitted in the training cell.
crf = CRF(algorithm='lbfgs', # Train with the L-BFGS optimisation method
    c1=0.2, # The coefficient for L1 regularization.
    c2=0.2,  # The coefficient for L2 regularization.
    max_iterations=100, # Upper bound on the number of optimiser iterations
    all_possible_transitions=True) # Per the sklearn-crfsuite docs: also generate transition features for label pairs that never occur in the training data

## Training and Prediction

In [None]:
crf.fit(X_train, y_train) # train step: fit the CRF on the training split
y_pred = crf.predict(X_validation) # inference step: predict a tag sequence for each validation document

## Results

In [None]:
from sklearn.metrics import classification_report

# Flatten the per-document tag sequences so the report is computed per token.
y_val = []
for gold_sequence in y_validation:
    y_val.extend(gold_sequence)

y_p = []
for predicted_sequence in y_pred:
    y_p.extend(predicted_sequence)

report = classification_report(y_val, y_p)
print(report)

              precision    recall  f1-score   support

        B-AE       0.78      0.70      0.74       752
       B-SSI       0.76      0.55      0.64       168
        I-AE       0.81      0.62      0.70      1485
       I-SSI       0.35      0.18      0.24        66
           O       0.94      0.98      0.96     11859

    accuracy                           0.92     14330
   macro avg       0.73      0.61      0.66     14330
weighted avg       0.91      0.92      0.91     14330

