
# Entity Recognition using Conditional Random Fields
### Task: Train a machine learning model using the provided training dataset to identify adverse events and SSI from drug reviews. 

## Import packages

In [67]:
#!{sys.executable} -m pip install -U 'scikit-learn<0.24'

#!{sys.executable} -m pip install -U 'scikit-learn<0.24'
#!{sys.executable} -m pip install sklearn_crfsuite

#!{sys.executable} -m pip install tensorflow

import re, sys, os, unicodedata, string, nltk, operator, itertools, collections, sklearn, tensorflow
from sklearn.model_selection import train_test_split
from sklearn_crfsuite import CRF
from sklearn_crfsuite.metrics import flat_classification_report
from collections import Counter
from heapq import nlargest
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer 
from collections import defaultdict
from nltk.corpus import stopwords
from tqdm.notebook import tqdm
from sklearn_crfsuite import metrics
from sklearn.metrics import make_scorer
from sklearn.model_selection  import RandomizedSearchCV


from pymetamap import MetaMap

# Lookup from the first letter of an nltk.pos_tag tag to the WordNet POS
# constant handed to the lemmatizer; any other first letter falls back to noun.
tag_map = defaultdict(lambda: wn.NOUN, J=wn.ADJ, V=wn.VERB, R=wn.ADV)

# Analyzer instances created once at start-up. `sia` is used by word2features;
# `mm` is not referenced by any of the cells visible in this notebook section.
sia = SentimentIntensityAnalyzer()
mm = MetaMap.get_instance('./opt/public_mm/bin/metamap21')


## Read in the data

In [68]:
# NOTE(review): absolute, machine-specific working directory; the relative
# paths below only resolve after this chdir succeeds.
os.chdir(r"/Users/szoriac/OneDrive/Michigan/=WN 2021/LHS 712/Assignment 3 CRF LSTM")
print(os.getcwd())

# NOTE(review): read_file is not defined or imported in any cell above this
# one, so this cell raises NameError on a fresh kernel -- restore its definition.
# Each call is unpacked into (row ids, per-review sequences).
row_id_text, texts = read_file('./REVIEW_TEXT.txt')
row_id_tags, tags = read_file('./REVIEW_LABELSEQ.txt')

# The complete dataset is used: no subsetting is applied to `texts` or `tags`.


/Users/szoriac/OneDrive/Michigan/=WN 2021/LHS 712/Assignment 3 CRF LSTM


## Derive POS dictionary From Brown Corpus 

In [69]:
#wordtags = dict(nltk.ConditionalFreqDist((w.lower(), t) 
#        for w, t in nltk.corpus.brown.tagged_words(tagset = 'universal')))

#wordtagsnofreq = {}
#for k, v in wordtags.items():
#    wordtagsnofreq[k] = dict(v)


## Derive POS dictionary from Own Data

In [70]:
# Replace every run of non-alphanumeric characters inside a token with a single
# space. Despite the "lowered" names, no lower-casing happens at this step.
loweredsent = [
    [re.sub('[^A-Za-z0-9]+', ' ', word) for word in sent]
    for sent in texts
]

# Tag each cleaned sentence with the universal tagset, then flatten the
# per-sentence results into one list of (token, tag) pairs.
taggedsent = [nltk.pos_tag(sent, tagset='universal') for sent in loweredsent]
flattaggedsent = list(itertools.chain.from_iterable(taggedsent))



In [71]:
# Per lower-cased token, a frequency distribution over the POS tags it received.
corpusdictfromtraining = dict(
    nltk.ConditionalFreqDist(
        (token.lower(), pos_tag) for token, pos_tag in flattaggedsent
    )
)


In [72]:
# Convert every per-token frequency distribution into a plain {tag: count} dict.
corpusdictfromtrainingnofreq = {
    token: dict(tag_counts)
    for token, tag_counts in corpusdictfromtraining.items()
}

#masterposdict = corpusdictfromtrainingnofreq

## Merge dictionaries

In [73]:
def mergebtoa(a, b, path=None):
    """Recursively merge dict ``b`` into dict ``a`` in place and return ``a``.

    Keys present only in ``b`` are copied into ``a``. When a key exists in both
    and both values are dicts, the two dicts are merged recursively. When both
    hold equal non-dict values nothing changes.

    Args:
        a: Destination dict; it is mutated.
        b: Source dict; it is only read.
        path: Keys walked so far, used to build the conflict message. Callers
            normally leave this as None.

    Returns:
        The same object as ``a``.

    Raises:
        Exception: If a key maps to different values in ``a`` and ``b`` and the
            two values are not both dicts. The message names the dotted key path.
    """
    if path is None:
        path = []
    for key, incoming in b.items():
        if key not in a:
            a[key] = incoming
            continue
        existing = a[key]
        if isinstance(existing, dict) and isinstance(incoming, dict):
            # Recurse through this function itself; the previous code called an
            # undefined name `merge` here and failed with NameError.
            mergebtoa(existing, incoming, path + [str(key)])
        elif existing != incoming:
            raise Exception('Conflict at %s' % '.'.join(path + [str(key)]))
    return a

In [74]:
#masterposdict = mergebtoa(masterposdict, wordtagsnofreq)
# Spot-check: display the POS tag counts collected for the token 'i'.
corpusdictfromtrainingnofreq['i']


{'PRON': 3074, 'ADV': 5, 'NOUN': 168, 'ADJ': 60, 'VERB': 35, 'ADP': 1}

## Get set of sickness and symptom hyponyms

In [75]:
lemma_function = WordNetLemmatizer()


def _hyponym_lemma_names(root_synset):
    """Return the sorted, de-duplicated lemma names of all transitive hyponyms.

    Sorting gives nltk.pos_tag the same input order on every run. The names are
    tagged together as one sequence, and a plain set has no stable iteration
    order for strings across interpreter runs.
    """
    return sorted({
        name
        for synset in root_synset.closure(lambda s: s.hyponyms())
        for name in synset.lemma_names()
    })


def _expand_terms(lemma_names):
    """Split multi-word lemma names into single-word terms.

    For every name, '_' and '-' become spaces. The resulting phrase is
    lemmatized as a whole, each word of that lemma is lemmatized again, and
    both those lemmas and the raw words of the phrase are collected.

    Returns:
        A list of terms; duplicates are possible.
    """
    terms = []
    for token, tag in nltk.pos_tag(lemma_names):
        phrase = token.replace("_", " ").replace("-", " ")
        wordnet_pos = tag_map[tag[0]]
        phrase_lemma = lemma_function.lemmatize(phrase, wordnet_pos)
        terms.extend(
            lemma_function.lemmatize(part, wordnet_pos)
            for part in phrase_lemma.split(' ')
        )
        terms.extend(phrase.split(' '))
    return terms


# Vocabulary derived from the hyponyms of WordNet's 'sickness.n.01'.
sickness = wn.synset('sickness.n.01')
typesOfsickness = _hyponym_lemma_names(sickness)
sicktype = _expand_terms(typesOfsickness)
sicknesses = set(sicktype)

# Vocabulary derived from the hyponyms of WordNet's 'symptom.n.01'.
symptom = wn.synset('symptom.n.01')
typesOfsymptom = _hyponym_lemma_names(symptom)
symptype = _expand_terms(typesOfsymptom)
symptoms = set(symptype)


In [76]:


# Fetch the stop-word list once and keep it as a set; the previous code called
# stopwords.words('english') again for every candidate word.
english_stopwords = set(stopwords.words('english'))

# Drop English stop words from both vocabularies. The results stay lists, as before.
filtered_sicknesses = [word for word in sicknesses if word not in english_stopwords]
filtered_symptoms = [word for word in symptoms if word not in english_stopwords]

## Functions

In [82]:
# Hand-picked keyword groups. Each feature is 'yes' when the normalised token is
# one of the listed words and 'no' otherwise.
_KEYWORD_GROUPS = (
    ('word.isdepress()', ('depression', 'depressed')),
    ('word.isanxious()', ('anxiety', 'anxious', 'worry', 'worried')),
    ('word.issuicide()', ('suicide', 'suicidal', 'kill')),
    ('word.isinsomnia()', ('insomnia', 'insomniac', 'sleep')),
    ('word.isheadtired()', ('tired', 'fatigue', 'migraine')),
    ('word.isdelusion()', ('delusional', 'delusion')),
    ('word.isanger()', ('anger', 'angry', 'fury', 'furious', 'mad')),
    ('word.isbulimia()', ('bulimia', 'bulimic')),
    ('word.isafraid()', ('fear', 'afraid')),
    ('word.isdisorder()', ('disorder',)),
    ('word.isbipolar()', ('bipolar',)),
)


def _lexical_features(word, prefix=''):
    """Build the spelling and POS-dictionary features of a single token.

    Args:
        word: The raw token.
        prefix: '' for the current token, or a context offset such as '-1'.

    Returns:
        (features, stripped): the feature dict, and the normalised token
        (ASCII-only, lower-cased, everything except letters, digits and
        apostrophes removed) used for the dictionary lookups.
    """
    # Key names are reproduced exactly as the original code wrote them: in a
    # context block only the first three keys have a ':' after the offset
    # ('-1:word.lower()' but '-1word.istitle()').
    colon_prefix = prefix + ':' if prefix else ''

    ascii_only = unicodedata.normalize('NFD', word).encode('ascii', 'ignore').decode("utf-8")
    stripped = re.sub('[^A-Za-z0-9\']+', '', ascii_only.lower())
    alnum = re.sub('[^A-Za-z0-9\']+', '', word)

    # {tag: count} for this token, or a single 'None' tag for unseen tokens.
    pos_counts = corpusdictfromtrainingnofreq.get(stripped, {'None': 1})
    # Tags ordered by decreasing count; computed once and indexed below.
    ranked_tags = nlargest(100, pos_counts, key=pos_counts.get)

    features = {
        colon_prefix + 'word.lower()': word.lower(),
        # NOTE(review): this value is ASCII-stripped but NOT lower-cased (the
        # original variable was named "...lower"). Kept as is.
        colon_prefix + 'word.stripascii()': ascii_only,
        colon_prefix + 'word.strippunc()': stripped,
        prefix + 'word.istitle()': alnum.istitle(),
        prefix + 'word.isupper()': alnum.isupper(),
        prefix + 'word.isdigit()': alnum.isdigit(),
        prefix + 'word[-3:]': stripped[-3:],
        prefix + 'word[-2:]': stripped[-2:],
        prefix + 'word[:2]': stripped[:2],
        prefix + 'word[:3]': stripped[:3],
    }
    for rank in range(6):
        key = prefix + 'word.POS%dfreq()' % (rank + 1)
        features[key] = ranked_tags[rank] if len(pos_counts) > rank else 'None'

    # NOTE(review): despite the name, 'POSmin' holds the MOST frequent tag
    # (max). It is left unchanged because the second word2features definition
    # further down this notebook computes the same value, and features must
    # match between training and prediction.
    features[prefix + 'word.POSmin()'] = max(pos_counts.items(), key=operator.itemgetter(1))[0]
    tags_in_dict_order = list(pos_counts)
    features[prefix + 'word.POSfirst()'] = tags_in_dict_order[0]
    features[prefix + 'word.POSlast()'] = tags_in_dict_order[-1]
    return features, stripped


def _lexicon_features(stripped, prefix=''):
    """Build vocabulary-membership and word-level sentiment features."""
    scores = sia.polarity_scores(stripped)  # one call instead of three
    return {
        prefix + 'word.in.sickness()': stripped in filtered_sicknesses,
        prefix + 'word.in.symptoms()': stripped in filtered_symptoms,
        prefix + 'word.sentimentpos()': float(scores['pos']),
        prefix + 'word.sentimentneg()': float(scores['neg']),
        prefix + 'word.sentimentneu()': float(scores['neu']),
    }


def word2features(word, index, tokenizedtext):
    """Build the CRF feature dict for one token of a tokenized review.

    The dict holds features of the token itself, sentence-level sentiment
    scores, keyword flags, and the same spelling/POS/lexicon features for the
    neighbours at offsets -1, -2, +1 and +2 when they exist.

    Args:
        word: The token at position ``index``.
        index: Position of ``word`` inside ``tokenizedtext``.
        tokenizedtext: List of all tokens of the review.

    Returns:
        dict mapping feature names to str, bool, int, float or list values.
    """
    # The whole token list is stored as a feature value; the state features
    # printed later in this notebook show it surfacing as entries such as
    # 'tokenizedtext:Insomnia'.
    features = {'tokenizedtext': tokenizedtext, 'position': index}

    lexical, stripped = _lexical_features(word)
    features.update(lexical)

    # Sentence-level sentiment: scored once per call instead of three times.
    sentence_scores = sia.polarity_scores(" ".join(tokenizedtext))
    features.update({
        'sent.sentimentpos()': float(sentence_scores['pos']),
        'sent.sentimentneg()': float(sentence_scores['neg']),
        'sent.sentimentneu()': float(sentence_scores['neu']),
    })

    features.update(_lexicon_features(stripped))
    for key, keywords in _KEYWORD_GROUPS:
        features[key] = 'yes' if stripped in keywords else 'no'

    # Context window: (key prefix, offset, "neighbour exists" condition), in the
    # same order and with the same conditions as the original if-blocks.
    last = len(tokenizedtext) - 1
    context_window = (
        ('-1', -1, index > 0),
        ('-2', -2, index > 1),
        ('+1', 1, index < last),
        ('+2', 2, index < last - 1),
    )
    for prefix, offset, neighbour_exists in context_window:
        if not neighbour_exists:
            continue
        lexical, stripped = _lexical_features(tokenizedtext[index + offset], prefix)
        features.update(lexical)
        features.update(_lexicon_features(stripped, prefix))

    return features

def text2features(text):
    """Return one word2features dict per token of ``text``, in token order."""
    tokens = list(text)
    feature_rows = []
    for position, token in enumerate(text):
        feature_rows.append(word2features(token, position, tokens))
    return feature_rows







## Define input and split data

In [83]:
# One list of per-token feature dicts per review; labels are the tag sequences.
X = [text2features(text) for text in tqdm(texts)]
y = tags

# Fixed seed so the 80/20 split, and therefore the reported validation
# metrics, are the same on every run.
SPLIT_SEED = 42
X_train, X_validation, y_train, y_validation = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED
)


HBox(children=(FloatProgress(value=0.0, max=4744.0), HTML(value='')))




## Fit and predict CRF

In [84]:
# Import the submodule explicitly: `scipy.stats` is used below, and a bare
# `import scipy` only exposes it if something else has already loaded it.
import scipy.stats

# Linear-chain CRF trained with L-BFGS; c1 and c2 are the L1 and L2
# regularisation coefficients.
crf = CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=300,
    all_possible_transitions=True,
    verbose=True,
)

# Sampling distributions for c1/c2. NOTE(review): not used by any cell visible
# in this section -- presumably meant for RandomizedSearchCV; confirm or remove.
params_space = {
    'c1': scipy.stats.expon(scale=0.1),
    'c2': scipy.stats.expon(scale=0.1),
}

crf.fit(X_train, y_train)
y_pred = crf.predict(X_validation)

loading training data to CRFsuite: 100%|██████████| 3795/3795 [00:15<00:00, 249.85it/s]

Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 165947
Seconds required: 2.694

L-BFGS optimization
c1: 0.100000
c2: 0.100000
num_memories: 6
max_iterations: 300
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=1.09  loss=77071.38 active=164465 feature_norm=0.50
Iter 2   time=0.34  loss=62585.27 active=152981 feature_norm=0.42
Iter 3   time=0.29  loss=45018.38 active=121286 feature_norm=0.49
Iter 4   time=0.29  loss=42603.59 active=161857 feature_norm=0.52
Iter 5   time=0.33  loss=35659.00 active=164292 feature_norm=0.61
Iter 6   time=0.29  loss=35200.75 active=142622 feature_norm=0.73
Iter 7   time=0.29  loss=32517.24 active=145659 feature_norm=0.77
Iter 8   time=0.32  loss=31688.98 active=152241 

In [85]:
# Per-label precision / recall / F1 on the validation split, three decimals.
validation_report = flat_classification_report(y_validation, y_pred, digits=3)
print(validation_report)

              precision    recall  f1-score   support

        B-AE      0.714     0.669     0.691       661
       B-SSI      0.735     0.606     0.664       165
        I-AE      0.679     0.659     0.669      1282
       I-SSI      0.409     0.281     0.333        64
           O      0.951     0.962     0.956     11701

    accuracy                          0.912     13873
   macro avg      0.698     0.635     0.663     13873
weighted avg      0.910     0.912     0.911     13873



In [86]:
from collections import Counter

def print_transitions(trans_features):
    """Print one aligned 'from -> to weight' line per transition.

    Args:
        trans_features: Iterable of ((label_from, label_to), weight) pairs.
    """
    for transition, weight in trans_features:
        source_label, target_label = transition
        row = "%-6s -> %-7s %0.6f" % (source_label, target_label, weight)
        print(row)

# Learned transition weights, wrapped in a Counter so they can be ranked.
transition_weights = Counter(crf.transition_features_)

print("Top likely transitions:")
print_transitions(transition_weights.most_common(20))

print("\nTop unlikely transitions:")
print_transitions(transition_weights.most_common()[-20:])

Top likely transitions:
B-SSI  -> I-SSI   6.273916
I-SSI  -> I-SSI   4.366255
B-AE   -> I-AE    3.330324
I-AE   -> I-AE    1.921833
O      -> O       0.859778
B-SSI  -> O       0.127211
O      -> B-AE    0.122444
I-SSI  -> B-SSI   0.065738
O      -> B-SSI   0.050733
I-SSI  -> B-AE    -0.132096
I-SSI  -> O       -0.241905
B-SSI  -> B-AE    -0.315562
B-SSI  -> B-SSI   -0.418638
I-AE   -> B-SSI   -0.643569
I-AE   -> O       -1.108093
B-AE   -> I-SSI   -1.373583
B-AE   -> B-SSI   -1.476553
I-AE   -> B-AE    -1.490102
B-AE   -> O       -1.996593
I-AE   -> I-SSI   -2.320411

Top unlikely transitions:
B-SSI  -> O       0.127211
O      -> B-AE    0.122444
I-SSI  -> B-SSI   0.065738
O      -> B-SSI   0.050733
I-SSI  -> B-AE    -0.132096
I-SSI  -> O       -0.241905
B-SSI  -> B-AE    -0.315562
B-SSI  -> B-SSI   -0.418638
I-AE   -> B-SSI   -0.643569
I-AE   -> O       -1.108093
B-AE   -> I-SSI   -1.373583
B-AE   -> B-SSI   -1.476553
I-AE   -> B-AE    -1.490102
B-AE   -> O       -1.996593
I-AE   -> 

In [87]:
def print_state_features(state_features):
    """Print one 'weight label attribute' line per state feature.

    Args:
        state_features: Iterable of ((attribute, label), weight) pairs.
    """
    for feature, weight in state_features:
        attribute, label = feature
        row = "%0.6f %-8s %s" % (weight, label, attribute)
        print(row)

# Learned state-feature weights, wrapped in a Counter so they can be ranked.
state_weights = Counter(crf.state_features_)

print("Top positive:")
print_state_features(state_weights.most_common(30))

print("\nTop negative:")
print_state_features(state_weights.most_common()[-30:])

Top positive:
2.270345 I-AE     -1word[:2]:to
2.020014 O        word[:2]:to
1.963728 B-AE     tokenizedtext:Insomnia
1.879700 O        word.istitle()
1.769548 O        +1:word.lower():(
1.769548 O        +1:word.stripascii():(
1.681397 I-AE     word.stripascii():I
1.672317 B-AE     tokenizedtext:.
1.652589 B-SSI    word[-2:]:ia
1.632817 O        word[:3]:med
1.572424 O        word.POS1freq():CONJ
1.572424 O        word.POSmin():CONJ
1.572424 O        word.POSfirst():CONJ
1.536526 B-AE     word[-2:]:ia
1.460009 B-AE     tokenizedtext:day.
1.449378 O        tokenizedtext:Some
1.449022 O        word.POS1freq():DET
1.449022 O        word.POSmin():DET
1.443601 B-AE     tokenizedtext:vertigo
1.413958 O        word[:2]:wh
1.403046 B-AE     tokenizedtext:biggest
1.395429 I-AE     -1word[-3:]:n't
1.395429 I-AE     -1word[-2:]:'t
1.389061 I-AE     -1word.POSfirst():DET
1.345375 B-SSI    tokenizedtext:depressed
1.340977 B-SSI    tokenizedtext:intrusive
1.332610 B-SSI    tokenizedtext:depression
1

## Apply to test data

In [91]:
# Load the test review file; the result is unpacked into row ids and texts.
# NOTE(review): read_file is not defined in any cell visible in this notebook.
row_id_finaltext, finaltexts = read_file('./TEST_REVIEW_TEXT.txt')


In [92]:
# Rebuild the token -> {POS tag: count} dictionary.
# NOTE(review): this iterates `texts` (the training reviews), not `finaltexts`,
# so it repeats the earlier dictionary-building cells -- confirm this is intended.
loweredsent = [
    [re.sub('[^A-Za-z0-9]+', ' ', word) for word in sent]
    for sent in texts
]

# Tag each cleaned sentence with the universal tagset and flatten the result.
taggedsent = [nltk.pos_tag(sent, tagset='universal') for sent in loweredsent]
flattaggedsent = list(itertools.chain.from_iterable(taggedsent))

# Frequency distribution of tags per lower-cased token ...
corpusdictfromtraining = dict(
    nltk.ConditionalFreqDist(
        (token.lower(), pos_tag) for token, pos_tag in flattaggedsent
    )
)

# ... converted to plain {tag: count} dicts for the feature functions.
corpusdictfromtrainingnofreq = {
    token: dict(tag_counts)
    for token, tag_counts in corpusdictfromtraining.items()
}




In [93]:
def word2features(word, index, tokenizedtext):
    strippedascii = str(unicodedata.normalize('NFD', word).encode('ascii', 'ignore').decode("utf-8"))
    strippedasciilower = str(unicodedata.normalize('NFD', word).encode('ascii', 'ignore').decode("utf-8"))
    strippedpunc = re.sub('[^A-Za-z0-9\']+', '', strippedascii.lower())

    dictionary = corpusdictfromtrainingnofreq.get(strippedpunc, dict({'None':1}))
    

    features = {
        'tokenizedtext': tokenizedtext,
        'position': index,
        'word.lower()': word.lower(),
        'word.stripascii()': strippedasciilower,
        'word.strippunc()': strippedpunc,
        'word.istitle()': (re.sub('[^A-Za-z0-9\']+', '', word)).istitle(),
        'word.isupper()': (re.sub('[^A-Za-z0-9\']+', '', word)).isupper(),
        'word.isdigit()': (re.sub('[^A-Za-z0-9\']+', '', word)).isdigit(),
        'word[-3:]': strippedpunc[-3:],
        'word[-2:]': strippedpunc[-2:],
        'word[:2]': strippedpunc[:2],
        'word[:3]': strippedpunc[:3],
        'word.POS1freq()': nlargest(100, dictionary, key=dictionary.get)[0] if len(dictionary) > 0 else 'None',
        'word.POS2freq()': nlargest(100, dictionary, key=dictionary.get)[1] if len(dictionary) > 1 else 'None',
        'word.POS3freq()': nlargest(100, dictionary, key=dictionary.get)[2] if len(dictionary) > 2 else 'None',
        'word.POS4freq()': nlargest(100, dictionary, key=dictionary.get)[3] if len(dictionary) > 3 else 'None',
        'word.POS5freq()': nlargest(100, dictionary, key=dictionary.get)[4] if len(dictionary) > 4 else 'None',
        'word.POS6freq()': nlargest(100, dictionary, key=dictionary.get)[5] if len(dictionary) > 5 else 'None',
        'word.POSmin()':max(dictionary.items(), key=operator.itemgetter(1))[0],
        'word.POSfirst()': list(dictionary.items())[0][0],
        'word.POSlast()': list(dictionary.items())[-1][0],
        'sent.sentimentpos()': float(sia.polarity_scores(" ".join(tokenizedtext))['pos']),
        'sent.sentimentneg()': float(sia.polarity_scores(" ".join(tokenizedtext))['neg']),
        'sent.sentimentneu()': float(sia.polarity_scores(" ".join(tokenizedtext))['neu']),

        'word.in.sickness()' : strippedpunc in filtered_sicknesses,
        'word.in.symptoms()' : strippedpunc in filtered_symptoms,
        'word.sentimentpos()': float(sia.polarity_scores(strippedpunc)['pos']),
        'word.sentimentneg()': float(sia.polarity_scores(strippedpunc)['neg']),
        'word.sentimentneu()': float(sia.polarity_scores(strippedpunc)['neu']),

        'word.isdepress()': 'yes' if strippedpunc in ['depression', 'depressed'] else 'no',
        'word.isanxious()': 'yes' if strippedpunc in ['anxiety', 'anxious', 'worry', 'worried'] else 'no',
        'word.issuicide()': 'yes' if strippedpunc in ['suicide', 'suicidal', 'kill'] else 'no',
        'word.isinsomnia()': 'yes' if strippedpunc in ['insomnia', 'insomniac', 'sleep'] else 'no',
        'word.isheadtired()': 'yes' if strippedpunc in ['tired', 'fatigue', 'migraine'] else 'no',
        'word.isdelusion()': 'yes' if strippedpunc in ['delusional', 'delusion'] else 'no',
        'word.isanger()': 'yes' if strippedpunc in ['anger', 'angry', 'fury', 'furious', 'mad'] else 'no',
        'word.isbulimia()': 'yes' if strippedpunc in ['bulimia', 'bulimic'] else 'no',
        'word.isafraid()': 'yes' if strippedpunc in ['fear', 'afraid'] else 'no',
        'word.isdisorder()': 'yes' if strippedpunc in ['disorder'] else 'no',
        'word.isbipolar()': 'yes' if strippedpunc in ['bipolar'] else 'no'




    }

    if index > 0:
        word = tokenizedtext[index-1]
        strippedascii = str(unicodedata.normalize('NFD', word).encode('ascii', 'ignore').decode("utf-8"))
        strippedasciilower = str(unicodedata.normalize('NFD', word).encode('ascii', 'ignore').decode("utf-8"))
        strippedpunc = re.sub('[^A-Za-z0-9\']+', '', strippedascii.lower())
        dictionary = corpusdictfromtrainingnofreq.get(strippedpunc, dict({'None':1}))


        features.update({
            '-1:word.lower()': word.lower(),
            '-1:word.stripascii()': strippedasciilower,
            '-1:word.strippunc()': strippedpunc,
            '-1word.istitle()': (re.sub('[^A-Za-z0-9\']+', '', word)).istitle(),
            '-1word.isupper()': (re.sub('[^A-Za-z0-9\']+', '', word)).isupper(),
            '-1word.isdigit()': (re.sub('[^A-Za-z0-9\']+', '', word)).isdigit(),
            '-1word[-3:]': strippedpunc[-3:],
            '-1word[-2:]': strippedpunc[-2:],
            '-1word[:2]': strippedpunc[:2],
            '-1word[:3]': strippedpunc[:3],
            '-1word.POS1freq()': nlargest(100, dictionary, key=dictionary.get)[0] if len(dictionary) > 0 else 'None',
            '-1word.POS2freq()': nlargest(100, dictionary, key=dictionary.get)[1] if len(dictionary) > 1 else 'None',
            '-1word.POS3freq()': nlargest(100, dictionary, key=dictionary.get)[2] if len(dictionary) > 2 else 'None',
            '-1word.POS4freq()': nlargest(100, dictionary, key=dictionary.get)[3] if len(dictionary) > 3 else 'None',
            '-1word.POS5freq()': nlargest(100, dictionary, key=dictionary.get)[4] if len(dictionary) > 4 else 'None',
            '-1word.POS6freq()': nlargest(100, dictionary, key=dictionary.get)[5] if len(dictionary) > 5 else 'None',
            '-1word.POSmin()':max(dictionary.items(), key=operator.itemgetter(1))[0],
            '-1word.POSfirst()': list(dictionary.items())[0][0],
            '-1word.POSlast()': list(dictionary.items())[-1][0],
            '-1word.in.sickness()' : strippedpunc in filtered_sicknesses,
            '-1word.in.symptoms()' : strippedpunc in filtered_symptoms,
            '-1word.sentimentpos()': float(sia.polarity_scores(strippedpunc)['pos']),
            '-1word.sentimentneg()': float(sia.polarity_scores(strippedpunc)['neg']),
            '-1word.sentimentneu()': float(sia.polarity_scores(strippedpunc)['neu'])
        })

    if index > 1:
        word = tokenizedtext[index-2]
        strippedascii = str(unicodedata.normalize('NFD', word).encode('ascii', 'ignore').decode("utf-8"))
        strippedasciilower = str(unicodedata.normalize('NFD', word).encode('ascii', 'ignore').decode("utf-8"))
        strippedpunc = re.sub('[^A-Za-z0-9\']+', '', strippedascii.lower())
        dictionary = corpusdictfromtrainingnofreq.get(strippedpunc, dict({'None':1}))


        features.update({
            '-2:word.lower()': word.lower(),
            '-2:word.stripascii()': strippedasciilower,
            '-2:word.strippunc()': strippedpunc,
            '-2word.istitle()': (re.sub('[^A-Za-z0-9\']+', '', word)).istitle(),
            '-2word.isupper()': (re.sub('[^A-Za-z0-9\']+', '', word)).isupper(),
            '-2word.isdigit()': (re.sub('[^A-Za-z0-9\']+', '', word)).isdigit(),
            '-2word[-3:]': strippedpunc[-3:],
            '-2word[-2:]': strippedpunc[-2:],
            '-2word[:2]': strippedpunc[:2],
            '-2word[:3]': strippedpunc[:3],
            '-2word.POS1freq()': nlargest(100, dictionary, key=dictionary.get)[0] if len(dictionary) > 0 else 'None',
            '-2word.POS2freq()': nlargest(100, dictionary, key=dictionary.get)[1] if len(dictionary) > 1 else 'None',
            '-2word.POS3freq()': nlargest(100, dictionary, key=dictionary.get)[2] if len(dictionary) > 2 else 'None',
            '-2word.POS4freq()': nlargest(100, dictionary, key=dictionary.get)[3] if len(dictionary) > 3 else 'None',
            '-2word.POS5freq()': nlargest(100, dictionary, key=dictionary.get)[4] if len(dictionary) > 4 else 'None',
            '-2word.POS6freq()': nlargest(100, dictionary, key=dictionary.get)[5] if len(dictionary) > 5 else 'None',
            '-2word.POSmin()':max(dictionary.items(), key=operator.itemgetter(1))[0],
            '-2word.POSfirst()': list(dictionary.items())[0][0],
            '-2word.POSlast()': list(dictionary.items())[-1][0],
            '-2word.in.sickness()' : strippedpunc in filtered_sicknesses,
            '-2word.in.symptoms()' : strippedpunc in filtered_symptoms,
            '-2word.sentimentpos()': float(sia.polarity_scores(strippedpunc)['pos']),
            '-2word.sentimentneg()': float(sia.polarity_scores(strippedpunc)['neg']),
            '-2word.sentimentneu()': float(sia.polarity_scores(strippedpunc)['neu'])
        })
    
    if index < len(tokenizedtext)-1:
        word = tokenizedtext[index+1]
        strippedascii = str(unicodedata.normalize('NFD', word).encode('ascii', 'ignore').decode("utf-8"))
        strippedasciilower = str(unicodedata.normalize('NFD', word).encode('ascii', 'ignore').decode("utf-8"))
        strippedpunc = re.sub('[^A-Za-z0-9\']+', '', strippedascii.lower())
        dictionary = corpusdictfromtrainingnofreq.get(strippedpunc, dict({'None':1}))


        features.update({
            '+1:word.lower()': word.lower(),
            '+1:word.stripascii()': strippedasciilower,
            '+1:word.strippunc()': strippedpunc,
            '+1word.istitle()': (re.sub('[^A-Za-z0-9\']+', '', word)).istitle(),
            '+1word.isupper()': (re.sub('[^A-Za-z0-9\']+', '', word)).isupper(),
            '+1word.isdigit()': (re.sub('[^A-Za-z0-9\']+', '', word)).isdigit(),
            '+1word[-3:]': strippedpunc[-3:],
            '+1word[-2:]': strippedpunc[-2:],
            '+1word[:2]': strippedpunc[:2],
            '+1word[:3]': strippedpunc[:3],
            '+1word.POS1freq()': nlargest(100, dictionary, key=dictionary.get)[0] if len(dictionary) > 0 else 'None',
            '+1word.POS2freq()': nlargest(100, dictionary, key=dictionary.get)[1] if len(dictionary) > 1 else 'None',
            '+1word.POS3freq()': nlargest(100, dictionary, key=dictionary.get)[2] if len(dictionary) > 2 else 'None',
            '+1word.POS4freq()': nlargest(100, dictionary, key=dictionary.get)[3] if len(dictionary) > 3 else 'None',
            '+1word.POS5freq()': nlargest(100, dictionary, key=dictionary.get)[4] if len(dictionary) > 4 else 'None',
            '+1word.POS6freq()': nlargest(100, dictionary, key=dictionary.get)[5] if len(dictionary) > 5 else 'None',
            '+1word.POSmin()':max(dictionary.items(), key=operator.itemgetter(1))[0],
            '+1word.POSfirst()': list(dictionary.items())[0][0],
            '+1word.POSlast()': list(dictionary.items())[-1][0],
            '+1word.in.sickness()' : strippedpunc in filtered_sicknesses,
            '+1word.in.symptoms()' : strippedpunc in filtered_symptoms,
            '+1word.sentimentpos()': float(sia.polarity_scores(strippedpunc)['pos']),
            '+1word.sentimentneg()': float(sia.polarity_scores(strippedpunc)['neg']),
            '+1word.sentimentneu()': float(sia.polarity_scores(strippedpunc)['neu'])
        })
    if index < len(tokenizedtext)-2:
        
        word = tokenizedtext[index+2]
        strippedascii = str(unicodedata.normalize('NFD', word).encode('ascii', 'ignore').decode("utf-8"))
        strippedasciilower = str(unicodedata.normalize('NFD', word).encode('ascii', 'ignore').decode("utf-8"))
        strippedpunc = re.sub('[^A-Za-z0-9\']+', '', strippedascii.lower())
        dictionary = corpusdictfromtrainingnofreq.get(strippedpunc, dict({'None':1}))


        features.update({
            '+2:word.lower()': word.lower(),
            '+2:word.stripascii()': strippedasciilower,
            '+2:word.strippunc()': strippedpunc,
            '+2word.istitle()': (re.sub('[^A-Za-z0-9\']+', '', word)).istitle(),
            '+2word.isupper()': (re.sub('[^A-Za-z0-9\']+', '', word)).isupper(),
            '+2word.isdigit()': (re.sub('[^A-Za-z0-9\']+', '', word)).isdigit(),
            '+2word[-3:]': strippedpunc[-3:],
            '+2word[-2:]': strippedpunc[-2:],
            '+2word[:2]': strippedpunc[:2],
            '+2word[:3]': strippedpunc[:3],
            '+2word.POS1freq()': nlargest(100, dictionary, key=dictionary.get)[0] if len(dictionary) > 0 else 'None',
            '+2word.POS2freq()': nlargest(100, dictionary, key=dictionary.get)[1] if len(dictionary) > 1 else 'None',
            '+2word.POS3freq()': nlargest(100, dictionary, key=dictionary.get)[2] if len(dictionary) > 2 else 'None',
            '+2word.POS4freq()': nlargest(100, dictionary, key=dictionary.get)[3] if len(dictionary) > 3 else 'None',
            '+2word.POS5freq()': nlargest(100, dictionary, key=dictionary.get)[4] if len(dictionary) > 4 else 'None',
            '+2word.POS6freq()': nlargest(100, dictionary, key=dictionary.get)[5] if len(dictionary) > 5 else 'None',
            '+2word.POSmin()':max(dictionary.items(), key=operator.itemgetter(1))[0],
            '+2word.POSfirst()': list(dictionary.items())[0][0],
            '+2word.POSlast()': list(dictionary.items())[-1][0],
            '+2word.in.sickness()' : strippedpunc in filtered_sicknesses,
            '+2word.in.symptoms()' : strippedpunc in filtered_symptoms,
            '+2word.sentimentpos()': float(sia.polarity_scores(strippedpunc)['pos']),
            '+2word.sentimentneg()': float(sia.polarity_scores(strippedpunc)['neg']),
            '+2word.sentimentneu()': float(sia.polarity_scores(strippedpunc)['neu'])
        })

    return features

def text2features(text):
    """Build the list of per-token feature mappings for one tokenized text.

    Parameters
    ----------
    text : iterable
        The tokens of a single text, in order. Any iterable is accepted,
        including one-shot iterators and generators.

    Returns
    -------
    list
        One ``word2features(token, index, tokens)`` result per token, in the
        same order as the input tokens.
    """
    # Materialize once, then iterate over the materialized copy. Iterating the
    # original `text` a second time would yield nothing for a generator or
    # iterator, because list() has already consumed it.
    tokenizedtext = list(text)
    return [
        word2features(token, index, tokenizedtext)
        for index, token in enumerate(tokenizedtext)
    ]







In [94]:
# Featurize every text in `finaltexts`; tqdm wraps the iterable to show progress.
X_test = list(map(text2features, tqdm(finaltexts)))

HBox(children=(FloatProgress(value=0.0, max=1259.0), HTML(value='')))




In [95]:
# Predict one label sequence per featurized text in X_test.
# NOTE(review): `crf` is not defined in this cell — presumably the CRF model
# fitted in an earlier cell; confirm it is trained before running this.
y_test_pred = crf.predict(X_test)

In [96]:
# Pair each row ID with its predicted tags, joined into one space-separated string.
# Index-based lookup is kept so a length mismatch between the IDs and the
# predictions raises IndexError instead of being silently truncated.
foroutput = [
    [row_id, ' '.join(y_test_pred[position])]
    for position, row_id in enumerate(row_id_finaltext)
]

# Header row, followed by one [ID, TAGSEQ] row per text.
a = [['ID', 'TAGSEQ']]
xout = a + foroutput

In [97]:
# Write the header and prediction rows as tab-separated lines.
# The 'utf-8-sig' encoding puts a UTF-8 byte-order mark at the start of the file.
with open('TEST_REVIEW_LABELSEQ_CRF_run3.txt', 'w', encoding='utf-8-sig') as out:
    out.writelines('\t'.join(row) + '\n' for row in xout)