In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

In [49]:
total = pd.read_csv('total.csv')
count = total.groupby('event_subtype')['file'].count()
subtype = total['event_subtype'].unique()

In [158]:
subtype

array(['Transport', 'Elect', 'Start-Position', 'Nominate', 'Attack',
       'End-Position', 'Meet', 'Marry', 'Phone-Write', 'Transfer-Money',
       'Sue', 'Demonstrate', 'End-Org', 'Injure', 'Die', 'Arrest-Jail',
       'Transfer-Ownership', 'Start-Org', 'Execute', 'Trial-Hearing',
       'Sentence', 'Be-Born', 'Charge-Indict', 'Convict',
       'Declare-Bankruptcy', 'Release-Parole', 'Fine', 'Appeal',
       'Merge-Org', 'Extradite', 'Divorce', 'Acquit'], dtype=object)

In [50]:
subtype = np.delete(subtype,np.where(subtype == 'Pardon')[0][0])

In [51]:
np.random.shuffle(subtype)
total_classes = len(subtype)
base_classes = 7
inc_classes = 5
sessions = 1 + (total_classes - base_classes) // inc_classes

In [52]:
first_session_subtype = np.array(count[count.index.isin(count.sort_values().index[-7:])].index)
subtype = np.setdiff1d(subtype,first_session_subtype)
class_split = [(s - 1) * inc_classes + np.arange(inc_classes) for s in range(1, sessions)]
sessions_subtype = [subtype[cs] for cs in class_split]

In [161]:
sessions_subtype

[array(['Acquit', 'Appeal', 'Arrest-Jail', 'Be-Born', 'Charge-Indict'],
       dtype=object),
 array(['Convict', 'Declare-Bankruptcy', 'Demonstrate', 'Divorce',
        'End-Org'], dtype=object),
 array(['Execute', 'Extradite', 'Fine', 'Injure', 'Marry'], dtype=object),
 array(['Merge-Org', 'Nominate', 'Phone-Write', 'Release-Parole',
        'Sentence'], dtype=object),
 array(['Start-Org', 'Start-Position', 'Sue', 'Transfer-Ownership',
        'Trial-Hearing'], dtype=object)]

In [53]:
train = None
test = None
for current_subtype in first_session_subtype:
    session_rows = total[total.event_subtype == current_subtype]
    mask = np.random.rand(len(session_rows)) < 0.8
    train = session_rows[mask] if train is None else pd.concat([train,session_rows[mask]])
    test = session_rows[~mask] if test is None else pd.concat([test,session_rows[~mask]])

train['session'] = 0
test['session'] = 0
train['type'] = 'train'
test['type'] = 'test'
total_labelled = train
total_labelled = total_labelled.append(test)

In [54]:
session = 1
for session_subtype in sessions_subtype:
    for current_subtype in session_subtype:
        session_rows = total[total.event_subtype == current_subtype]
        train = session_rows.sample(5)
        test_index = np.setdiff1d(session_rows.index,train.index)
        test = session_rows.loc[test_index]

        train['session'] = session
        test['session'] = session
        train['type'] = 'train'
        test['type'] = 'test'
        total_labelled = total_labelled.append(train).append(test)
    session+=1

In [175]:
total_labelled.to_csv('total_labelled.csv',index=None)

In [14]:
total_labelled = pd.read_csv('total_labelled.csv')

le = preprocessing.LabelEncoder()
event_subtype = total_labelled['event_subtype'].unique()
event_subtype = np.insert(event_subtype,0,'None')
le = le.fit(event_subtype)
window_size=31

In [55]:
train = total_labelled[total_labelled.type == 'train']
test = total_labelled[total_labelled.type == 'test']
event_subtype

array(['None', 'Attack', 'Die', 'Elect', 'End-Position', 'Meet',
       'Transfer-Money', 'Transport', 'Acquit', 'Appeal', 'Arrest-Jail',
       'Be-Born', 'Charge-Indict', 'Convict', 'Declare-Bankruptcy',
       'Demonstrate', 'Divorce', 'End-Org', 'Execute', 'Extradite',
       'Fine', 'Injure', 'Marry', 'Merge-Org', 'Nominate', 'Phone-Write',
       'Release-Parole', 'Sentence', 'Start-Org', 'Start-Position', 'Sue',
       'Transfer-Ownership', 'Trial-Hearing'], dtype=object)

In [8]:
total_labelled['event_subtype'].unique()

array(['Attack', 'Die', 'Elect', 'End-Position', 'Meet', 'Transfer-Money',
       'Transport', 'Acquit', 'Appeal', 'Arrest-Jail', 'Be-Born',
       'Charge-Indict', 'Convict', 'Declare-Bankruptcy', 'Demonstrate',
       'Divorce', 'End-Org', 'Execute', 'Extradite', 'Fine', 'Injure',
       'Marry', 'Merge-Org', 'Nominate', 'Phone-Write', 'Release-Parole',
       'Sentence', 'Start-Org', 'Start-Position', 'Sue',
       'Transfer-Ownership', 'Trial-Hearing'], dtype=object)

In [21]:
ind

606

In [56]:

for ind, row in train.iterrows():
    trigger_first_tokens = row['trigger'].split()[0]
    trigger_text_first_end = row['trigger_text_start'] + len(trigger_first_tokens)
    
    index = -1
    tokens = []
    token=''
    pre_char = ''
    for count,char in enumerate(row['text']):
        if pre_char != ' ':
            if (char == ' ' or (count+1) == len(row['text'])):
                if (count+1) == len(row['text']):
                    token += char
                tokens += [token]
                if (trigger_first_tokens in token or token in trigger_first_tokens) and abs(count-trigger_text_first_end) < 10:
                    index = len(tokens) - 1
                    break
                token = ''
        if char != ' ':
            token += char
        pre_char = char
    if index == -1:
        print('error')
        break

    tokens = row['text'].split()

    left_index = max(0,index - int((window_size-1)/2))
    right_index = index+int((window_size-1)/2) +1 #超过也没关系
    
    
    text_windows = tokens[left_index:right_index]
    if len(text_windows)!=window_size:#需要padding
        left_pad_num = (window_size -1)/2- index - left_index
        text_windows = ['<pad>'] * int(left_pad_num) + text_windows
        text_windows = text_windows + ['<pad>'] * (window_size - len(text_windows))
    
    if (text_windows[15] in trigger_first_tokens) or (trigger_first_tokens in text_windows[15]):
        pass
    else:
        print('error2')
        break
    
    train.loc[ind,'text'] = ','.join(text_windows)
    #train.loc[ind,'text'] = ','.join([str(TEXT.vocab.stoi[i]) for i in text_windows])
    train.loc[ind,'label'] = str(le.transform([row['event_subtype']]).item())
    #train.loc[ind,'label'] = ','.join([str(le.transform([row['event_subtype']]).item())] * len(row['trigger'].split()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)


In [59]:
for ind, row in test.iterrows():
    trigger_first_tokens = row['trigger'].split()[0]
    trigger_text_first_end = row['trigger_text_start'] + len(trigger_first_tokens)
    index = -1
    tokens = []
    token=''
    pre_char = ''
    for count,char in enumerate(row['text']):
        if pre_char != ' ':
            if (char == ' ' or (count+1) == len(row['text'])):
                if (count+1) == len(row['text']):
                    token += char
                tokens += [token]
                if (trigger_first_tokens in token or token in trigger_first_tokens) and abs(count-trigger_text_first_end) < 10:
                    index = len(tokens) - 1
                    break
                token = ''
        if char != ' ':
            token += char
        pre_char = char
    if index == -1:
        print('error')
        break

    tokens = row['text'].split()

    left_index = max(0,index - int((window_size-1)/2))
    right_index = index+int((window_size-1)/2) +1 #超过也没关系
    
    
    text_windows = tokens[left_index:right_index]
    if len(text_windows)!=window_size:#需要padding
        left_pad_num = (window_size -1)/2- index - left_index
        text_windows = ['<pad>'] * int(left_pad_num) + text_windows
        text_windows = text_windows + ['<pad>'] * (window_size - len(text_windows))
    
    if (text_windows[15] in trigger_first_tokens) or (trigger_first_tokens in text_windows[15]):
        pass
    else:
        print('error2')
        break
    
    test.loc[ind,'text'] = ','.join(text_windows)
    test.loc[ind,'label'] = str(le.transform([row['event_subtype']]).item())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)


In [12]:
train_large = train[train.session == 0]
train_fewshot = train[train.session != 0]
train_large.to_csv('train_large.csv',index=False)
train_fewshot.to_csv('train_fewshot.csv',index=False)

test[['text','label','session']].to_csv('test.csv',index=False)
