### Read data

In [1]:
def read_file(f):
    data = open(f,'r').readlines()[1:]
    row_id = [i.split('\t')[0].strip() for i in data]
    data = [i.split('\t')[1].strip().split(' ') for i in data]
    return row_id,data

In [2]:
row_id_text, texts = read_file('./review_data/REVIEW_TEXT.txt')
row_id_tags, tags = read_file('./review_data/REVIEW_LABELSEQ.txt')  # tags: answers

### Two entities of interest -- AE (adverse events) and SSI (signs, symptoms, and indications).

We use BIO scheme: 

     B- to denote beginning of a tagged named entity, 
     
     I- to denote inside a tagged named entity tag, 
     
     O to denote outside of any tagged named entity 
     
So, your sequential labeling task has five tags: B-AE, I-AE, B-SSI, I-SSI, and O.

Two example sentences in the train dataset

In [5]:
index = 5
print('num of data', len(row_id_text))
assert len(row_id_text) == len(row_id_tags)
######## label dataset 

print('-----------------------------')
print('Token\tTag')
for idx in range(len(texts[index])):
    print(texts[index][idx], '\t', tags[index][idx])

num of data 4744
-----------------------------
Token	Tag
I 	 O
had 	 O
terrible 	 B-AE
anxiety 	 I-AE
the 	 I-AE
whole 	 I-AE
time 	 I-AE
, 	 O
the 	 B-AE
worst 	 I-AE
kind 	 I-AE
of 	 I-AE
anxiety 	 I-AE
I've 	 O
ever 	 O
experienced. 	 O


In [6]:
index = 22
print('Token\tTag')
for idx in range(len(texts[index])):
    print(texts[index][idx], '\t', tags[index][idx])

Token	Tag
constipation 	 B-AE
, 	 O
drastic 	 B-AE
mood 	 I-AE
swings 	 I-AE
, 	 O
100% 	 O
helped 	 O
my 	 O
anxiety 	 B-SSI
and 	 O
panic 	 B-SSI
. 	 O


### Inputs

In [9]:
def word2features(word):
    features = {
        'word.lower()': word.lower(),  # 
        'word.isdigit()': word.isdigit(), 
        ## you can add more feature extractor here
        # https://sklearn-crfsuite.readthedocs.io/en/latest/tutorial.html#features
    }
    # print(features)
    return features

def text2features(text):
    return [word2features(i) for i in text]


In [10]:
X = [text2features(text) for text in texts]
y = tags

In [13]:
X

[[{'word.lower()': 'total', 'word.isdigit()': False},
  {'word.lower()': 'nightmare.', 'word.isdigit()': False}],
 [{'word.lower()': 'this', 'word.isdigit()': False},
  {'word.lower()': 'was', 'word.isdigit()': False},
  {'word.lower()': 'arguably', 'word.isdigit()': False},
  {'word.lower()': 'the', 'word.isdigit()': False},
  {'word.lower()': 'worst', 'word.isdigit()': False},
  {'word.lower()': 'period', 'word.isdigit()': False},
  {'word.lower()': 'of', 'word.isdigit()': False},
  {'word.lower()': 'time', 'word.isdigit()': False},
  {'word.lower()': 'in', 'word.isdigit()': False},
  {'word.lower()': 'my', 'word.isdigit()': False},
  {'word.lower()': 'life.', 'word.isdigit()': False}],
 [{'word.lower()': 'it', 'word.isdigit()': False},
  {'word.lower()': 'made', 'word.isdigit()': False},
  {'word.lower()': 'me', 'word.isdigit()': False},
  {'word.lower()': 'so', 'word.isdigit()': False},
  {'word.lower()': 'mentally', 'word.isdigit()': False},
  {'word.lower()': 'ill', 'word.isdigit

### Train / Validation sets

In [14]:
from sklearn.model_selection import train_test_split
X_train, X_validation, y_train, y_validation = train_test_split(X, y, test_size = 0.2, random_state=42)

In [15]:
X_train

[[{'word.lower()': 'dry', 'word.isdigit()': False},
  {'word.lower()': 'mouth', 'word.isdigit()': False},
  {'word.lower()': ',', 'word.isdigit()': False},
  {'word.lower()': 'insomnia', 'word.isdigit()': False},
  {'word.lower()': ',', 'word.isdigit()': False},
  {'word.lower()': 'anxiety', 'word.isdigit()': False},
  {'word.lower()': '.', 'word.isdigit()': False}],
 [{'word.lower()': 'i', 'word.isdigit()': False},
  {'word.lower()': 'stopped', 'word.isdigit()': False},
  {'word.lower()': 'over', 'word.isdigit()': False},
  {'word.lower()': 'analyzing', 'word.isdigit()': False},
  {'word.lower()': 'everything.', 'word.isdigit()': False}],
 [{'word.lower()': 'i', 'word.isdigit()': False},
  {'word.lower()': 'began', 'word.isdigit()': False},
  {'word.lower()': 'to', 'word.isdigit()': False},
  {'word.lower()': 'have', 'word.isdigit()': False},
  {'word.lower()': 'terrible', 'word.isdigit()': False},
  {'word.lower()': 'nightmares', 'word.isdigit()': False},
  {'word.lower()': 'which', 

### CRF model

In [17]:
# https://sklearn-crfsuite.readthedocs.io/en/latest/
from sklearn_crfsuite import CRF
crf = CRF()

### Training & Prediction

In [18]:
crf.fit(X_train, y_train) # train step 
y_pred = crf.predict(X_validation) # inference step

### Result

In [20]:
from sklearn.metrics import classification_report
y_val = [i for j in y_validation for i in j]
y_p = [i for j in y_pred for i in j]

report = classification_report(y_val, y_p)
print(report)

              precision    recall  f1-score   support

        B-AE       0.70      0.43      0.53       752
       B-SSI       0.78      0.50      0.61       168
        I-AE       0.59      0.33      0.42      1485
       I-SSI       0.21      0.05      0.08        66
           O       0.89      0.97      0.92     11859

    accuracy                           0.86     14330
   macro avg       0.63      0.45      0.51     14330
weighted avg       0.84      0.86      0.84     14330



In [21]:
# find instruction of hyperparameters here :https://sklearn-crfsuite.readthedocs.io/en/latest/api.html

crf = CRF(
        algorithm='lbfgs', # Gradient descent using the L-BFGS method
        c1=0.2, # The coefficient for L1 regularization.
        c2=0.2,  # The coefficient for L1 regularization.
        max_iterations=50,
        all_possible_transitions=True
    )

crf.fit(X_train, y_train) # train step 
y_pred = crf.predict(X_validation) # inference step
report = classification_report(y_val, y_p)
print(report)

              precision    recall  f1-score   support

        B-AE       0.70      0.43      0.53       752
       B-SSI       0.78      0.50      0.61       168
        I-AE       0.59      0.33      0.42      1485
       I-SSI       0.21      0.05      0.08        66
           O       0.89      0.97      0.92     11859

    accuracy                           0.86     14330
   macro avg       0.63      0.45      0.51     14330
weighted avg       0.84      0.86      0.84     14330

