To improve performance and incorporate the context of our dataset, this step generates our own dictionary, which will be fed into an existing lexicon-based sentiment analyser. Here we extract the frequently used terms (unigrams and bigrams) from seminal publications and from our dataset in the RA field. The associated polarity is identified manually. For example, “adverse_reaction” and “inflamedication” are labeled as negative, while “symptom_free” and “work” are marked as positive.

In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 
import numpy as np
import random
import pickle
from collections import Counter
import pandas as pd

import gensim

In [75]:
# Module-level WordNet lemmatizer; create_bigram_lexicon and sample_handling
# read this global rather than taking a lemmatizer as an argument.
lemmatizer = WordNetLemmatizer()

def create_lexicon(data):
    """Return one flat list of lower-cased tokens from every string in *data*.

    Each item of *data* is lower-cased and split with ``word_tokenize``; the
    resulting tokens are concatenated in input order. Duplicates are kept.
    """
    return [
        token
        for line in data
        for token in word_tokenize(line.lower())
    ]

def sent_to_words(sentences):
    """Lazily yield one token list per item of *sentences*.

    Every item is converted with ``str`` and tokenized by
    ``gensim.utils.simple_preprocess``. ``deacc=False`` is passed, so accent
    marks are left on the tokens.
    """
    for raw in sentences:
        text = str(raw)
        tokens = gensim.utils.simple_preprocess(text, deacc=False)
        yield tokens

def create_bigram_lexicon(data_words, threshold=100, min_count=5):
    """Collect the distinct phrase tokens that a gensim Phrases model detects.

    Args:
        data_words: iterable of token lists, one list per document.
        threshold: passed to ``gensim.models.Phrases``; a higher threshold
            yields fewer phrases.
        min_count: passed to ``gensim.models.Phrases``.

    Returns:
        A sorted list of the unique tokens containing "_" that appear after
        the phrase model is applied to the lemmatized documents.
    """
    # Keep alphabetic tokens only and lemmatize them with the module-level
    # lemmatizer. Presumably this also guarantees that any "_" seen later was
    # introduced by the phrase model rather than by the input text.
    documents = [
        [lemmatizer.lemmatize(word) for word in word_list if word.isalpha()]
        for word_list in data_words
    ]

    phrases = gensim.models.Phrases(
        documents, min_count=min_count, threshold=threshold
    )
    # Phraser is the lighter-weight wrapper used to apply a trained model.
    bigram_mod = gensim.models.phrases.Phraser(phrases)

    ngrams = set()
    for document in bigram_mod[documents]:
        for token in document:
            if "_" in token:
                ngrams.add(token.replace(" ", "_"))

    # Sorted so the result is reproducible between runs (set order is not).
    return sorted(ngrams)

In [None]:
# Load ./Temp_csDMARDs.csv (relative to the working directory) into a pandas DataFrame.
data = pd.read_csv('./Temp_csDMARDs.csv')

In [45]:
# NOTE(review): create_lexicon iterates its argument, and iterating a pandas
# DataFrame yields its column labels, not the row text. Presumably a single
# text column (or a list of documents) should be passed here -- confirm.
Lexicons = create_lexicon(data)


In [56]:
# Lemmatize the alphabetic tokens and drop the literal token "br"
# (presumably residue of HTML <br> tags -- confirm against the raw data).
Lexicons = [lemmatizer.lemmatize(i) for i in Lexicons if i.isalpha() and i != "br"]

w_counts = Counter(Lexicons)

# Keep words longer than one character whose frequency lies strictly between
# 50 and 20000. The conditions are joined with the boolean `and`: the bitwise
# `&` binds tighter than the comparisons, so `count > 50 & len(w) > 1` would be
# evaluated as the chain `count > (50 & len(w)) > 1`.
unigram = [
    w
    for w, count in w_counts.items()
    if 50 < count < 20000 and len(w) > 1
]

In [None]:
# Placeholder value; rebound two lines below by create_bigram_lexicon.
bigram = []
# NOTE(review): sent_to_words iterates `data`; if `data` is a pandas DataFrame
# this yields column labels rather than documents -- confirm the intended input.
data_words = list(sent_to_words(data))
bigram = create_bigram_lexicon(data_words)

In [55]:
def sample_handling(sample, lexicon, classification):
    """Turn the lines of a text file into bag-of-words count vectors.

    Args:
        sample: path of the text file to read; each line is one sample.
        lexicon: sequence of words. A word's (first) position in the sequence
            is the index of its count in the feature vector.
        classification: label stored unchanged next to every feature vector
            (e.g. [1, 0] for positive and [0, 1] for negative sentiment).

    Returns:
        A list of ``[features, classification]`` pairs, one per processed
        line, where ``features`` is a list of ``len(lexicon)`` counts.

    Note:
        Only the first ``hm_lines`` lines are processed. ``hm_lines`` is a
        module-level name that must be defined before this function is called.
    """
    # Map every lexicon word to its first position once, so each token costs
    # one dict lookup instead of two linear scans of the lexicon.
    index_of = {}
    for position, entry in enumerate(lexicon):
        index_of.setdefault(entry, position)

    with open(sample, 'r') as f:
        contents = f.readlines()

    featureset = []
    for l in contents[:hm_lines]:
        current_words = word_tokenize(l.lower())
        current_words = [lemmatizer.lemmatize(i) for i in current_words]
        features = np.zeros(len(lexicon))
        for word in current_words:
            index_value = index_of.get(word.lower())
            if index_value is not None:
                features[index_value] += 1
        featureset.append([list(features), classification])

    return featureset


Counter({'content': 16,
         'ra': 19887,
         'knee': 1542,
         'pain': 10434,
         'br': 87281,
         'i': 140077,
         'think': 4932,
         'it': 55952,
         'interesting': 273,
         'that': 33130,
         'you': 32293,
         'were': 2767,
         'diagnosed': 4021,
         'year': 13417,
         'ago': 3501,
         'taking': 5470,
         'plaquenil': 2997,
         'and': 83274,
         'are': 14005,
         'still': 4766,
         'in': 28760,
         'a': 70641,
         'rheumatologist': 2208,
         'told': 2249,
         'me': 21826,
         'only': 5069,
         'work': 8619,
         'of': 39644,
         'case': 965,
         'sound': 828,
         'like': 7692,
         'not': 17692,
         'working': 3988,
         'for': 42350,
         'there': 6854,
         'better': 5115,
         'arthritis': 8274,
         'treatment': 3381,
         'out': 6406,
         'but': 25339,
         'sometimes': 1230,
         'have

In [77]:
# Called without a POS argument, the lemmatizer returns "failed" unchanged
# (see the cell output below).
lemmatizer.lemmatize("failed")

'failed'

In [81]:
# 1. Init Lemmatizer
from nltk.corpus import wordnet
def get_wordnet_pos(word):
    """Return the WordNet POS constant matching the POS tag of *word*.

    The word is tagged on its own with ``nltk.pos_tag`` and the first letter
    of the resulting tag selects adjective, noun, verb or adverb. Any other
    tag falls back to noun.
    """
    tagged = nltk.pos_tag([word])
    initial = tagged[0][1][0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "N":
        return wordnet.NOUN
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    return wordnet.NOUN
# Rebinds the module-level `lemmatizer` name to a fresh WordNetLemmatizer.
lemmatizer = WordNetLemmatizer()

# 2. Lemmatize Single Word with the appropriate POS tag
word = 'feet'
print(lemmatizer.lemmatize(word, get_wordnet_pos(word)))

# 3. Lemmatize a Sentence with the appropriate POS tag
# Each token is passed to get_wordnet_pos individually, so it is tagged
# without the rest of the sentence as context.
sentence = "The striped bats are hanging on their feet for best"
print([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(sentence)])
#> ['The', 'strip', 'bat', 'be', 'hang', 'on', 'their', 'foot', 'for', 'best']

foot
['The', 'strip', 'bat', 'be', 'hang', 'on', 'their', 'foot', 'for', 'best']
