In [1]:
import pandas as pd
import json
import re

In [2]:
with open("../../data/compound_train.json", 'r', encoding='utf-8') as f:
    raw_train = json.load(f)

In [60]:
flatten = lambda x: [j for i in x for j in i if i]

def split_sentence(doc):
    if re.search(r'。', doc):
        return re.findall(r'.*?。', doc)
    
    return [doc]

def get_annotation_dict(annotation_data, attribute):
    if annotation_data.get('entry'):
        annotation_data = annotation_data.get('entry')
        
    return {str(annotation['WikipediaID']): annotation['Attributes'][attribute] for annotation in annotation_data}

def re_isin(patterns):
    escaped_patt = [re.escape(pattern) for pattern in patterns]
    return r'|'.join(escaped_patt)

def labeling(sentence_df, annotation_data, attribute):
    annotation_dict = get_annotation_dict(annotation_data, attribute)
    
    labeled_df = pd.DataFrame()
    for _id, entry in sentence_df.groupby('_id'):
        annotations = annotation_dict.get(str(_id))
        
        if not annotations:
            entry['label'] = [False] * len(entry)
            labeled_df = labeled_df.append(entry)
            continue
        
        annotations = flatten([split_sentence(s) for s in annotations])
        
        entry['label'] = entry.sentence.str.contains(re_isin(annotations))
        labeled_df = labeled_df.append(entry)
        
    return labeled_df.reset_index(drop=True)     

In [63]:
train_df = pd.read_csv("../../data/train.csv")
valid_df = pd.read_csv("../../data/valid.csv")

train_split_df = pd.read_csv("../../data/train_split_words.csv")
valid_split_df = pd.read_csv("../../data/valid_split_words.csv")

In [73]:
production_train_df = labeling(train_split_df, raw_train, '製造方法')
production_valid_df = labeling(valid_split_df, raw_train, '製造方法')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [74]:
print("Train", len(production_train_df))
print("True:", len(production_train_df.loc[production_train_df.label == True]))
print("False:", len(production_train_df.loc[production_train_df.label == False]))

print("Valid", len(production_valid_df))
print("True:", len(production_valid_df.loc[production_valid_df.label == True]))
print("False:", len(production_valid_df.loc[production_valid_df.label == False]))

Train 7435
True: 502
False: 6933
Valid 1564
True: 86
False: 1478


In [79]:
production_train_df.to_csv("../../data/Production_train_split_words.csv", index=False)
production_valid_df.to_csv("../../data/Production_valid_split_words.csv", index=False)