In [1]:
import numpy as np
import pandas as pd
import os
from itertools import groupby
import copy
import flair 
import json

# Relative path to the raw CoNLL-2003 `eng.train` file that the next cell reads.
train_data_path="../data/raw/CoNLL-2003/eng.train"

In [2]:
# Read every line with trailing whitespace stripped and drop the file's first
# line (presumably the -DOCSTART- header -- TODO confirm against the data).
# The context manager guarantees the file handle is closed after reading.
with open(train_data_path) as train_file:
    alist = [line.rstrip() for line in train_file][1:]

In [3]:
# Split the flat list of lines into sentences: empty lines act as separators,
# so only the runs of consecutive non-empty lines are kept.
sents = []
for is_content, line_run in groupby(alist, key=lambda line: line != ''):
    if is_content:
        sents.append(list(line_run))

In [4]:
# Split every line of every sentence on whitespace, so each sentence becomes
# a list of per-token field lists.
data = [list(map(str.split, sent)) for sent in sents]

In [None]:
# Keep only the sentences in which at least one token row ends with the
# 'I-PER' tag (the last field of a row).
named_data = [
    sent for sent in data
    if any(fields[-1] == 'I-PER' for fields in sent)
]

In [None]:
class Word:
    """One token of a sentence together with its annotation fields.

    Attributes:
        txt: Current surface text; other code may overwrite it (e.g. masking).
        origTxt: Surface text as given at construction time.
        pos: Second field of the token row (presumably the part-of-speech tag).
        chunk_tag: Third field of the token row.
        named_entity_tag: Fourth field of the token row.
    """

    def __init__(self, wordArray):
        """Build a word from a token row.

        Args:
            wordArray: Indexable sequence with at least four entries, laid out
                as ``[text, pos, chunk_tag, named_entity_tag]``.
        """
        token_text = wordArray[0]
        self.txt = token_text
        self.origTxt = token_text
        self.pos = wordArray[1]
        self.chunk_tag = wordArray[2]
        self.named_entity_tag = wordArray[3]

    def isPerson(self):
        """Return True when the named-entity tag is exactly ``'I-PER'``."""
        return self.named_entity_tag == 'I-PER'

class Sentence:
    """A sentence as an ordered list of Word objects, with name-masking helpers.

    While the word list is built, a run of consecutive person-tagged tokens is
    reduced to its first token, so a multi-token name occupies a single slot
    and only that first token's text is retained.

    The ``sentiment`` attribute does not exist until ``setSentiment`` is called.
    """

    def __init__(self, sentArray):
        """Create the word list from an iterable of token rows.

        Args:
            sentArray: Iterable of rows, each accepted by ``Word``.
        """
        self.words = []
        for wordArray in sentArray:
            candidate = Word(wordArray)
            # A person token directly after another person token continues
            # the same name and is therefore dropped.
            continues_name = (
                candidate.isPerson()
                and len(self.words) > 0
                and self.words[-1].isPerson()
            )
            if not continues_name:
                self.words.append(candidate)

    def getText(self):
        """Join the words' current text into a single string.

        Words are separated by one space, except that no space is inserted
        before the first word or before a word whose ``pos`` is one of
        ``'POS'``, ``'"'``, ``'.'`` or ``','``.
        """
        no_space_before = ['POS', '"', '.', ',']
        pieces = []
        for position, word in enumerate(self.words):
            glued = word.pos in no_space_before or position == 0
            pieces.append(word.txt if glued else ' ' + word.txt)
        return "".join(pieces)

    def getNameIndices(self):
        """Return the positions in ``self.words`` of all person-tagged words."""
        positions = []
        for position, word in enumerate(self.words):
            if word.isPerson():
                positions.append(position)
        return positions

    def mask(self, index, maskName):
        """Overwrite the current text of the word at ``index`` with ``maskName``.

        Only ``txt`` is changed; the word's ``origTxt`` is left untouched.
        """
        target = self.words[index]
        target.txt = maskName

    def getMaskedInfo(self, maskName):
        """List ``(origTxt, position)`` for each word whose text equals ``maskName``."""
        masked = []
        for position, word in enumerate(self.words):
            if word.txt == maskName:
                masked.append((word.origTxt, position))
        return masked

    def setSentiment(self, label):
        """Store ``label[0].to_dict()`` as ``self.sentiment``.

        Args:
            label: Non-empty sequence whose first element provides ``to_dict()``.
        """
        first_label = label[0]
        self.sentiment = first_label.to_dict()

In [None]:
# Materialise the Sentence objects in a list (the variable name is kept because
# the masking cell below iterates over it). A one-shot generator would be
# exhausted after its first pass, so re-running the masking cell would
# silently produce no sentences.
named_sentence_generator = [Sentence(sentArray) for sentArray in named_data]

In [None]:
# For every person name in every sentence, take an independent deep copy of
# the sentence and replace that single name with the "[NAME]" placeholder.
# Copies whose whole text is nothing but the placeholder are discarded.
masked_sents = []

for sentence in named_sentence_generator:
    for name_position in sentence.getNameIndices():
        masked_copy = copy.deepcopy(sentence)
        masked_copy.mask(name_position, "[NAME]")
        if masked_copy.getText() != '[NAME]':
            masked_sents.append(masked_copy)

In [None]:
# Load flair's 'en-sentiment' text classifier; it is used below to score the masked sentences.
flair_sentiment = flair.models.TextClassifier.load('en-sentiment')

In [None]:
# Wrap the rendered text of each masked sentence in a flair Sentence,
# preserving the order of masked_sents.
flair_sentences = []
for masked_sent in masked_sents:
    flair_sentences.append(flair.data.Sentence(masked_sent.getText()))

In [None]:
# Run the sentiment classifier over all flair sentences; the results are read
# back from each sentence's `.labels` in the following cell.
flair_sentiment.predict(flair_sentences)

In [None]:
# Collect the label list of every flair sentence, in flair_sentences order.
labels = [
    flair_sentence.labels
    for flair_sentence in flair_sentences
]

In [None]:
# Attach each predicted label list to the masked sentence it was computed for.
# flair_sentences (and therefore labels) were built in masked_sents order, so
# the two lists line up element by element. The cells below read
# `sent.sentiment`, which only exists once setSentiment has been called.
assert len(labels) == len(masked_sents), "labels and masked_sents are out of sync"

for masked_sent, sent_labels in zip(masked_sents, labels):
    masked_sent.setSentiment(sent_labels)

In [None]:
%matplotlib inline
# Histogram of the predicted sentiment value over all masked sentences.
pd.Series([sent.sentiment["value"] for sent in masked_sents]).hist()

In [None]:
# One row per masked sentence: predicted sentiment value and its confidence.
df = pd.DataFrame({"value": [sent.sentiment["value"] for sent in masked_sents],
                   "confidence": [sent.sentiment["confidence"] for sent in masked_sents]})

# Replace each value by its integer category code plus uniform jitter in
# [-0.4, 0.4), so points of one class are spread out horizontally instead of
# stacking on a single vertical line. The fixed seed makes the figure
# identical on every re-run.
jitter = np.random.default_rng(0).uniform(low=-0.4, high=0.4, size=len(df))
df["value"] = df["value"].astype("category").cat.codes + jitter

df.plot.scatter(x='value', y='confidence')

In [None]:
# Build one JSON-serialisable record per masked sentence.
json_sents = []
for masked_sent in masked_sents:
    # First (original text, position) pair among the words carrying the placeholder.
    original_name, name_location = masked_sent.getMaskedInfo("[NAME]")[0]
    json_sents.append({
        "text": masked_sent.getText(),
        "sentiment_polarity": masked_sent.sentiment['value'],
        "sentiment_confidence": masked_sent.sentiment['confidence'],
        "original_name": original_name,
        "name_location": name_location,
        "tagged_words": [(word.txt, word.named_entity_tag) for word in masked_sent.words],
    })

In [None]:
# Number of records that the final cell writes to disk.
len(json_sents)

In [None]:

# Persist the list of records as a single JSON array.
with open('../data/processed/masked_sents.json', 'w') as outfile:
    json.dump(json_sents, outfile)