In [1]:
import numpy as np
class InputExample:
    """Container for one example: text, optional second text, labels and image id.

    The constructor stores its four arguments unchanged as attributes of the
    same names; no validation or copying is performed.
    """

    def __init__(self, text_a, text_b=None, label_aspect=None, image_id=None):
        self.text_a, self.text_b = text_a, text_b
        self.label_aspect, self.image_id = label_aspect, image_id

def read_mner(text_path):
    """Parse a CoNLL-style MNER annotation file into ``InputExample`` objects.

    Layout handled by this parser::

        IMGID:<id>
        <token><TAB><label>
        ...
        <blank line>

    Each ``IMGID:`` line sets the image name (``"<id>.jpg"``) attached to the
    sentences that follow it. Labels containing ``OTHER`` are rewritten to the
    ``MISC`` type while keeping their two-character prefix
    (``B-OTHER`` -> ``B-MISC``).

    Compared with a strict "blank line terminates a sentence" reader, this
    parser also emits the final sentence when the file does not end with a
    blank line, never emits an empty sentence for consecutive blank lines, and
    leaves ``image_id`` as ``None`` when no ``IMGID:`` line has been seen yet.

    :param text_path: path of the UTF-8 encoded annotation file.
    :return: list of ``InputExample`` whose ``text_a`` is the space-joined
        tokens, ``label_aspect`` the list of labels and ``image_id`` the
        image file name.
    :raises IndexError: if a token line has no tab-separated label column.
    """
    examples = []
    tokens, labels = [], []
    img_id = None  # stays None until the first IMGID line is read

    def _flush():
        # Emit the buffered sentence (if any) and reset the buffers.
        if not tokens:
            return
        examples.append(
            InputExample(text_a=' '.join(tokens), text_b=None, label_aspect=list(labels), image_id=img_id)
        )
        del tokens[:], labels[:]

    with open(text_path, "r", encoding="utf-8") as f:
        for line in f:
            # Remove only the line terminator; the last line may not have one.
            content = line.rstrip("\n")
            if content.startswith("IMGID:"):
                img_id = content.strip().split('IMGID:')[1] + '.jpg'
                continue
            if not content.strip():
                # Blank line: sentence boundary.
                _flush()
                continue
            fields = content.split('\t')
            label = fields[1]
            if 'OTHER' in label:
                label = label[:2] + 'MISC'
            tokens.append(fields[0])
            labels.append(label)
    # The file may end without a trailing blank line; keep the last sentence.
    _flush()
    return examples

In [2]:
def ts2bio(ts_tag_sequence):
    """Return a new list in which every ``EQ`` tag is replaced by ``O``.

    All other tags (including ``O`` itself) are passed through unchanged.

    :param ts_tag_sequence: sequence of tag strings
    :return: list of tags of the same length
    """
    return ['O' if tag in ('O', 'EQ') else tag for tag in ts_tag_sequence]

def bio2bioes_ts(ts_tag_sequence):
    """Convert a BIO tag sequence into the BIOES scheme.

    Rules applied per tag:

    * ``O`` and ``EQ`` become ``O``.
    * ``B-X`` becomes ``B-X`` when the next tag is an ``I-*`` tag, else ``S-X``.
    * ``I-X`` becomes ``E-X`` only when it closes an entity, i.e. the previous
      tag is a ``B-*``/``I-*`` tag and the next tag is not ``I-*``.
      An orphan ``I-X`` (at the start of the sequence, or right after
      ``O``/``EQ``) stays ``I-X`` wherever it occurs, including the last
      position, e.g. the first ``I`` in ``[O, O, I, O, B, I, I, O, O]``.
    * Any other ``<prefix>-X`` tag is copied unchanged so the output always
      has the same length as the input.

    :param ts_tag_sequence: sequence of tag strings
    :return: list of converted tags, aligned with the input
    :raises ValueError: if a non-``O``/``EQ`` tag does not contain exactly one ``-``
    """
    outside = ('O', 'EQ')

    def _prefix(tag):
        # Position prefix of a tag; 'EQ' is treated exactly like 'O'.
        return 'O' if tag in outside else tag.split('-')[0]

    n_tags = len(ts_tag_sequence)
    new_ts_sequence = []
    for i in range(n_tags):
        cur_ts_tag = ts_tag_sequence[i]
        if cur_ts_tag in outside:
            # when meeting the EQ label, regard it as an O label
            new_ts_sequence.append('O')
            continue
        cur_pos, cur_sentiment = cur_ts_tag.split('-')
        next_is_inside = i + 1 < n_tags and _prefix(ts_tag_sequence[i + 1]) == 'I'
        if cur_pos == 'B':
            if next_is_inside:
                new_ts_sequence.append('B-%s' % cur_sentiment)
            else:
                new_ts_sequence.append('S-%s' % cur_sentiment)
        elif cur_pos == 'I':
            # An I tag only ends an entity if it actually continues one.
            continues_entity = i > 0 and _prefix(ts_tag_sequence[i - 1]) in ('B', 'I')
            if continues_entity and not next_is_inside:
                new_ts_sequence.append('E-%s' % cur_sentiment)
            else:
                new_ts_sequence.append('I-%s' % cur_sentiment)
        else:
            # Unknown prefix: keep the tag so tags stay aligned with tokens.
            new_ts_sequence.append(cur_ts_tag)
    return new_ts_sequence

def tag2ts(ts_tag_sequence):
    """
    Extract typed spans from a BIOES-style tag sequence.

    ``S-X`` yields a single-token span; a ``B-*`` ... ``E-X`` stretch yields a
    span only when every type collected since the ``B`` is identical.
    :param ts_tag_sequence: sequence of tag strings such as ``B-PER``, ``E-PER``, ``S-LOC``, ``O``
    :return: list of ``(begin, end, type)`` tuples with inclusive indices
    """
    spans = []
    seen_types = []
    start = -1
    for idx, tag in enumerate(ts_tag_sequence):
        parts = tag.split('-')
        # Anything that is not exactly "<pos>-<type>" (e.g. O, EQ) counts as outside.
        pos, kind = parts if len(parts) == 2 else ('O', 'O')
        if kind != 'O':
            seen_types.append(kind)
        if pos == 'S':
            # single-token entity
            spans.append((idx, idx, kind))
            seen_types = []
        elif pos == 'B':
            start = idx
            if len(seen_types) > 1:
                # forget types left over from stray tags before this B
                seen_types = seen_types[-1:]
        elif pos == 'E':
            # accept the span only if it has an open begin and one consistent type
            if idx > start > -1 and len(set(seen_types)) == 1:
                spans.append((start, idx, kind))
                seen_types = []
                start = -1
    return spans


In [3]:
# multi entities
import pandas as pd
def data_to_pandas(examples):
    """Tabulate parsed examples into a DataFrame with one row per example.

    For every example the labels are normalised with ``ts2bio``, converted
    with ``bio2bioes_ts`` and turned into spans with ``tag2ts``; each span is
    mapped back to the space-separated tokens of ``text_a``.

    Columns of the result:

    * ``text`` - the example's ``text_a``
    * ``entity_category`` - list of ``(entity, category)`` tuples
    * ``image_id`` - the example's ``image_id``
    * ``person`` / ``location`` / ``organization`` / ``miscellaneous`` -
      lists of entity strings of that category

    The rows are collected first and the frame is built in a single call,
    instead of enlarging the frame cell by cell through ``.loc``.

    Side effect: prints the total number of entities per category.

    :param examples: iterable of objects exposing ``text_a``, ``label_aspect``
        and ``image_id``
    :return: ``pandas.DataFrame`` with the columns listed above
    :raises ValueError: if a span has a type other than MISC, LOC, ORG or PER
    """
    columns = ["text", "entity_category", "image_id", "person", "location", "organization", "miscellaneous"]
    # tag type -> category name (which is also the name of its list column)
    category_names = {
        "MISC": "miscellaneous",
        "LOC": "location",
        "ORG": "organization",
        "PER": "person",
    }
    counts = dict.fromkeys(category_names.values(), 0)

    records = []
    for example in examples:
        text = example.text_a
        tags = bio2bioes_ts(ts2bio(example.label_aspect))
        ts_sequence = tag2ts(ts_tag_sequence=tags)
        text_list = text.split(" ")
        record = {
            "text": text,
            "entity_category": [],
            "image_id": example.image_id,
            "person": [],
            "location": [],
            "organization": [],
            "miscellaneous": [],
        }
        for ner in ts_sequence:
            category = category_names.get(ner[-1])
            if category is None:
                raise ValueError("error category")
            # span indices are inclusive
            entity = " ".join(text_list[ner[0]:ner[1] + 1])
            record[category].append(entity)
            record["entity_category"].append((entity, category))
            counts[category] += 1
        records.append(record)

    print("per_count", counts["person"])
    print("loc_count", counts["location"])
    print("org_count", counts["organization"])
    print("misc_count", counts["miscellaneous"])
    return pd.DataFrame(records, columns=columns)

# Annotation files of the three splits (relative to the notebook's working directory).
train_path = "../dataset/mner/twitter2015/train.txt"
dev_path = "../dataset/mner/twitter2015/dev.txt"
test_path = "../dataset/mner/twitter2015/test.txt"

# Parse all splits first ...
train_examples, dev_examples, test_examples = [
    read_mner(split_path) for split_path in (train_path, dev_path, test_path)
]

# ... then tabulate them; data_to_pandas prints the per-category counts of each split.
pd_train, pd_dev, pd_test = [
    data_to_pandas(examples=split_examples)
    for split_examples in (train_examples, dev_examples, test_examples)
]

print(len(pd_train), len(pd_dev), len(pd_test))

per_count 2218
loc_count 2088
org_count 925
misc_count 931
per_count 552
loc_count 522
org_count 247
misc_count 219
per_count 1825
loc_count 1728
org_count 839
misc_count 724
4000 1000 3257


In [4]:
# Sanity check: inspect the Python type stored in an entity column of the in-memory frame.
type(pd_train.loc[0, "person"])

list

In [5]:
# Write each processed split to a tab-separated file without the index column.
for csv_name, split_frame in (
    ("twitter2015_process_train.csv", pd_train),
    ("twitter2015_process_dev.csv", pd_dev),
    ("twitter2015_process_test.csv", pd_test),
):
    split_frame.to_csv(csv_name, index=False, sep='\t')

In [6]:
# Reload one exported file to check the round trip; note that this reads the *train* split file.
# NOTE(review): the list-valued columns were written as text, so they presumably come back as
# strings here and would need parsing (e.g. ast.literal_eval) before use — confirm.
read_test = pd.read_csv("twitter2015_process_train.csv",sep='\t')

In [7]:
# Preview up to the first 20 rows of the reloaded frame.
read_test.head(20)

Unnamed: 0,text,entity_category,image_id,person,location,organization,miscellaneous
0,RT @JayKenMinaj _ : Me outside of where George...,"[('George Zimmerman', 'person')]",1015799.jpg,['George Zimmerman'],[],[],[]
1,"Swan upping : first stop Hermitage Warf , Towe...","[('Hermitage Warf', 'location'), ('Tower Bridg...",1109405.jpg,[],"['Hermitage Warf', 'Tower Bridge', 'Tower']",[],['Olympic']
2,RT @redbullESPORTS : Smash Shiba is stoked for...,"[('Smash Shiba', 'miscellaneous'), ('Melee Gra...",563049.jpg,[],[],[],"['Smash Shiba', 'Melee Grand Finals']"
3,RT @washingtonpost : Two maps that show the sh...,"[('Baltimore', 'location')]",50447.jpg,[],['Baltimore'],[],[]
4,Rep . Howard Coble mingling ahead of press con...,"[('Howard Coble', 'person')]",418340.jpg,['Howard Coble'],[],[],[]
5,Psychologists explain why Katie Hopkins is jus...,"[('Katie Hopkins', 'person'), ('Katie #Hopkins...",50168.jpg,"['Katie Hopkins', 'Katie #Hopkins']",[],[],[]
6,Do you even meditate tho bro ? I start class a...,"[('LA', 'location')]",684051.jpg,[],['LA'],[],[]
7,RT @cnni : Pope Francis calls for an end to th...,"[('Pope Francis', 'person')]",1351124.jpg,['Pope Francis'],[],[],[]
8,RT @ShervinSinatra : Amsterdam Savage AF . htt...,"[('Amsterdam Savage AF', 'organization')]",94770.jpg,[],[],['Amsterdam Savage AF'],[]
9,RT @jordancornette : This is how Alex Rodrigue...,"[('Alex Rodriguez', 'person'), ('Notre Dame', ...",64931.jpg,"['Alex Rodriguez', 'ARod']",[],['Notre Dame'],[]
