In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' #disable all tensorflow logging output
os.environ["CUDA_VISIBLE_DEVICES"]="0" #0,1,2,3 for four gpu

import pickle

import numpy as np
import pandas as pd

import tensorflow as tf
from transformers import *

In [2]:
# Report how many GPUs TensorFlow can see.
visible_gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(visible_gpus))

# CUDA_VISIBLE_DEVICES is a comma-separated list of device ids, so a comma
# means more than one GPU was requested and a mirrored strategy is used.
if ',' in os.environ["CUDA_VISIBLE_DEVICES"]:
    strategy = tf.distribute.MirroredStrategy()
    print('multiple strategy')
else:
    strategy = tf.distribute.get_strategy()
    print('single strategy')

Num GPUs Available:  1
single strategy


In [3]:
# Quick look at the annotation table: one row per annotated discourse span.
train = pd.read_csv('../input/feedback-prize-2021/train.csv')
print('df shape', train.shape)
print('discourse types: ', train['discourse_type'].unique())
# NOTE(review): despite the 'mean len' label this is the mean of the
# `discourse_end` column, not (discourse_end - discourse_start).
print('mean len: ', train['discourse_end'].mean())
train.head()

df shape (144293, 8)
discourse types:  ['Lead' 'Position' 'Evidence' 'Claim' 'Concluding Statement'
 'Counterclaim' 'Rebuttal']
mean len:  1200.791202622442


Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,423A1CA112E2,1622628000000.0,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1622628000000.0,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,423A1CA112E2,1622628000000.0,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3,423A1CA112E2,1622628000000.0,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4,423A1CA112E2,1622628000000.0,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...


In [4]:
# functions for loading and train/val data

def load_train_data(MODEL_NAME="bert-base-cased", MAX_LEN=1024):
    """Tokenize every training essay and build one-hot BIO targets per token.

    Reads ``train.csv`` and the per-essay ``train/<id>.txt`` files from
    ``../input/feedback-prize-2021``.

    A token receives a discourse label only when its character range lies
    entirely inside the annotated span ``[discourse_start, discourse_end]``.
    The first such token of a span gets ``<type>_b`` and the following ones
    ``<type>_i``. Every token without a label (text outside any span,
    zero-length special/padding tokens) is marked ``other``.

    NOTE(review): the previous implementation never compared against
    ``discourse_start`` and consumed one token past each span, so labels were
    shifted (``_b`` on the leading special token, first token of the next span
    left as ``other``, padding labelled as the last span). Weights trained on
    those targets were fit to different labels than the ones produced here.

    Args:
        MODEL_NAME: name or path passed to ``AutoTokenizer.from_pretrained``.
        MAX_LEN: sequence length every essay is padded/truncated to.

    Returns:
        Tuple ``(train_ids, train_attention, train_labels)`` of int32 arrays
        with shapes ``(n_essays, MAX_LEN)``, ``(n_essays, MAX_LEN)`` and
        ``(n_essays, MAX_LEN, 15)``; row order follows ``df.id.unique()``.
    """
    # construct tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    # load csv file
    df = pd.read_csv('../input/feedback-prize-2021/train.csv')
    IDS = df.id.unique()
    # group once instead of re-filtering the whole frame for every essay
    spans_by_id = df.groupby('id', sort=False)
    train_ids = np.zeros((len(IDS), MAX_LEN), dtype='int32')
    train_attention = np.zeros((len(IDS), MAX_LEN), dtype='int32')

    # init labels
    label_to_ind = {
        'Lead_b': 0,
        'Lead_i': 1,
        'Position_b': 2,
        'Position_i': 3,
        'Evidence_b': 4,
        'Evidence_i': 5,
        'Claim_b': 6,
        'Claim_i': 7,
        'Concluding Statement_b': 8,
        'Concluding Statement_i': 9,
        'Counterclaim_b': 10,
        'Counterclaim_i': 11,
        'Rebuttal_b': 12,
        'Rebuttal_i': 13,
        'other': 14
    }
    other_ind = label_to_ind['other']
    train_labels = np.zeros((len(IDS), MAX_LEN, len(label_to_ind)), dtype='int32')

    # form samples
    for i in range(len(IDS)):
        if i % 1000 == 0:
            print(i)
        # read txt file (context manager so the handle is closed promptly)
        filename = '../input/feedback-prize-2021/train/{}.txt'.format(IDS[i])
        with open(filename, 'r') as f:
            txt = f.read()

        # tokenize
        tokens = tokenizer.encode_plus(txt, max_length=MAX_LEN, padding='max_length',
                                       truncation=True, return_offsets_mapping=True)
        train_ids[i, :] = tokens['input_ids']
        train_attention[i, :] = tokens['attention_mask']
        offsets = tokens['offset_mapping']

        # extract labels for each token; the single forward pass over the
        # tokens below relies on the spans being ordered by start position
        curr_df = spans_by_id.get_group(IDS[i]).sort_values('discourse_start')
        offset_ind = 0
        for row in curr_df.itertuples(index=False):
            label = row.discourse_type + '_b'

            w_start = row.discourse_start
            w_end = row.discourse_end

            while offset_ind < len(offsets):
                t_start, t_end = offsets[offset_ind]
                # special and padding tokens carry an empty (0, 0) range
                is_real = t_end > t_start

                # first real token past this span: leave it for the next span
                if is_real and t_start >= w_end:
                    break

                # set label only if the token lies fully inside the span
                if is_real and t_start >= w_start and t_end <= w_end:
                    train_labels[i, offset_ind, label_to_ind[label]] = 1
                    label = row.discourse_type + '_i'

                offset_ind += 1

    # everything that received no discourse label becomes 'other'
    train_labels[:, :, other_ind] = 1 - np.max(train_labels, axis=-1)
    return train_ids, train_attention, train_labels

def load_test_data(MODEL_NAME="bert-base-cased", MAX_LEN=1024):
    """Tokenize every essay found in the competition ``test`` folder.

    Args:
        MODEL_NAME: name or path passed to ``AutoTokenizer.from_pretrained``.
        MAX_LEN: sequence length every essay is padded/truncated to.

    Returns:
        Tuple ``(test_ids, test_attention, IDS)``: two int32 arrays of shape
        ``(n_essays, MAX_LEN)`` and the list of essay ids (file names without
        the ``.txt`` extension, sorted) in the same row order.
    """
    # construct tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    test_dir = '../input/feedback-prize-2021/test'
    # os.listdir order is arbitrary: sort for reproducible row order and
    # ignore anything that is not an essay text file
    IDS = sorted(os.path.splitext(name)[0]
                 for name in os.listdir(test_dir)
                 if name.endswith('.txt'))
    test_ids = np.zeros((len(IDS), MAX_LEN), dtype='int32')
    test_attention = np.zeros((len(IDS), MAX_LEN), dtype='int32')

    # form samples
    for i in range(len(IDS)):
        if i % 1000 == 0:
            print(i)
        # read txt file (context manager so the handle is closed promptly)
        filename = '../input/feedback-prize-2021/test/{}.txt'.format(IDS[i])
        with open(filename, 'r') as f:
            txt = f.read()

        # tokenize; offsets are not needed here, only ids and mask are stored
        tokens = tokenizer.encode_plus(txt, max_length=MAX_LEN, padding='max_length',
                                       truncation=True)
        test_ids[i, :] = tokens['input_ids']
        test_attention[i, :] = tokens['attention_mask']

    return test_ids, test_attention, IDS

def train_val(model, ids, attention, labels, 
              train_size=0.8, 
              epochs=5,
              batch_size=32,
              saved_name='saved_model.h5'
             ):
    """Fit ``model`` on a seeded random train/validation split and save its weights.

    Args:
        model: object exposing Keras-style ``fit`` and ``save_weights``.
        ids: token id array, one row per essay.
        attention: attention mask array aligned with ``ids``.
        labels: target array aligned with ``ids``.
        train_size: fraction of rows used for training; the remainder is the
            validation set. With ``1.0`` no validation is run.
        epochs: number of epochs passed to ``fit``.
        batch_size: batch size passed to ``fit``.
        saved_name: path handed to ``model.save_weights``.

    Returns:
        None. Side effects: seeds NumPy's global RNG with 42, trains the
        model and writes the weights file.
    """
    # TRAIN VALID SPLIT (seeded so the split is the same on every run)
    np.random.seed(42)
    # the number of samples comes from the arrays themselves; no need to
    # re-read train.csv just to count essays
    inds = list(range(len(ids)))
    np.random.shuffle(inds)
    split_point = int(train_size * len(inds))
    train_idx = inds[:split_point]
    val_idx = inds[split_point:]
    print('Train size',len(train_idx),', Valid size',len(val_idx))

    # an empty validation set (train_size=1.0) is replaced by None so that
    # fit() simply skips validation instead of receiving zero-length arrays
    validation_data = None
    if len(val_idx) > 0:
        validation_data = ([ids[val_idx,], attention[val_idx,]],
                           labels[val_idx,])

    print('start training...')
    model.fit(x = [ids[train_idx,], attention[train_idx,]],
              y = labels[train_idx,],
              validation_data = validation_data,
              epochs = epochs,
              batch_size = batch_size,
              verbose = 2)

    # SAVE MODEL WEIGHTS
    model.save_weights(saved_name)
    

In [5]:
# define models
def build_model(MODEL_NAME="bert-base-cased", MAX_LEN=1024, LR=1e-4):
    """Assemble and compile the token classifier on top of a pretrained backbone.

    Args:
        MODEL_NAME: name or path given to ``AutoConfig`` / ``TFAutoModel``.
        MAX_LEN: fixed sequence length of both model inputs.
        LR: learning rate of the Adam optimizer.

    Returns:
        A compiled ``tf.keras.Model`` taking ``[input_ids, attention_mask]``
        and emitting a 15-way softmax for every position.
    """
    layers = tf.keras.layers

    # model inputs: token ids and attention mask, both of length MAX_LEN
    token_input = layers.Input(shape=(MAX_LEN,), name='input_ids', dtype='int32')
    mask_input = layers.Input(shape=(MAX_LEN,), name='attention_mask', dtype='int32')

    # pretrained transformer; it stays trainable (set `backbone.trainable = False`
    # here to freeze it, or call `save_pretrained` on it to export a local copy)
    backbone_config = AutoConfig.from_pretrained(MODEL_NAME)
    backbone = TFAutoModel.from_pretrained(MODEL_NAME, config=backbone_config)

    # classification head applied to the first element of the backbone output
    backbone_out = backbone(token_input, attention_mask=mask_input)
    hidden = layers.Dense(256, activation='relu')(backbone_out[0])
    # float32 is forced on the output layer regardless of the global dtype policy
    class_probs = layers.Dense(15, activation='softmax', dtype='float32')(hidden)

    # wire everything together and compile
    model = tf.keras.Model(inputs=[token_input, mask_input], outputs=class_probs)
    optimizer = tf.keras.optimizers.Adam(learning_rate = LR)
    model.compile(optimizer = optimizer,
                  loss = [tf.keras.losses.CategoricalCrossentropy()],
                  metrics = [tf.keras.metrics.CategoricalAccuracy()])

    return model

In [6]:
# # MODEL_NAME = "bert-base-cased"
# MODEL_NAME = "../input/feedbacksaved/BERT" # load from pretrained.
# MAX_LEN = 512
# LR=0.25e-4

# # # MODEL_NAME = 'allenai/longformer-base-4096'
# # MODEL_NAME = '../input/feedbacksaved/LongFormer'
# # MAX_LEN = 1024
# # LR=0.25e-4

# # # processing data
# # train_ids, train_attention, train_labels = load_train_data(MODEL_NAME=MODEL_NAME, MAX_LEN=MAX_LEN)

# # with open('tokenized_data_longformer.pkl', 'wb') as f:
# #     saved = {
# #         'train_ids': train_ids,
# #         'train_attention': train_attention,
# #         'train_labels': train_labels
# #     }
# #     pickle.dump(saved, f)

# # load saved data and build model
# with open('../input/feedbacksaved/tokenized_data_longformer.pkl', 'rb') as f:
#     saved = pickle.load(f)
#     ids = saved['train_ids'][:, :MAX_LEN]
#     attention = saved['train_attention'][:, :MAX_LEN]
#     labels = saved['train_labels'][:, :MAX_LEN, :]

# print('input seq shape', ids.shape)
# print('attention shape', attention.shape)
# print('labels shape', labels.shape)

# with strategy.scope():
#     model = build_model(MODEL_NAME=MODEL_NAME, MAX_LEN=MAX_LEN, LR=LR)
# model.summary()

In [7]:
# # load trained model if available
# model.load_weights('./BERT_entire.h5')

# # # train-val model
# # train_val(model, ids, attention, labels, 
# #           train_size=0.8, 
# #           epochs=5,
# #           batch_size=16,
# #           saved_name='saved_model.h5')

# # train on entire training set
# train_val(model, ids, attention, labels, 
#           train_size=1.0, 
#           epochs=3,
#           batch_size=4,
#           saved_name='{}_entire_0.h5'.format(MODEL_NAME.split('/')[-1]))

<a href="/kaggle/working/LongFormer_entire.h5"> Download File </a>

**Make Predictions on the Test Set and Post-process**  
The following code is mainly for test-time inference and post-processing.

In [8]:
def word_to_label(folder, pred, IDS, MODEL_NAME = "bert-base-cased", MAX_LEN=1024):
    """Collapse per-token class predictions into one label per whitespace word.

    Each essay is re-tokenized to recover the token character offsets. Tokens
    are grouped into words (a word ends where the token is followed by
    whitespace or by the end of the text) and the word takes the label of its
    longest token. Words are counted the same way ``str.split()`` does, so
    word ``k`` of the result corresponds to ``txt.split()[k]``.

    Args:
        folder: sub-folder of ``../input/feedback-prize-2021`` holding the
            ``<id>.txt`` files (e.g. ``'test'``).
        pred: 2-D array of predicted class indices, indexed ``[essay, token]``.
        IDS: essay ids, aligned with the rows of ``pred``.
        MODEL_NAME: name or path passed to ``AutoTokenizer.from_pretrained``.
        MAX_LEN: sequence length used for tokenization.

    Returns:
        Dict with keys ``'id'`` (list of essay ids) and ``'classes'`` (list of
        per-essay lists of label names such as ``'Lead_b'`` or ``'other'``).
        An essay truncated at ``MAX_LEN`` yields fewer labels than words.
    """
    # init labels
    label_to_ind = {
        'Lead_b': 0,
        'Lead_i': 1,
        'Position_b': 2,
        'Position_i': 3,
        'Evidence_b': 4,
        'Evidence_i': 5,
        'Claim_b': 6,
        'Claim_i': 7,
        'Concluding Statement_b': 8,
        'Concluding Statement_i': 9,
        'Counterclaim_b': 10,
        'Counterclaim_i': 11,
        'Rebuttal_b': 12,
        'Rebuttal_i': 13,
        'other': 14
    }
    ind_to_label = {ind: name for name, ind in label_to_ind.items()}

    def longest_token_label(pieces):
        # label of the longest piece; on ties the last piece wins
        return sorted(pieces, key=lambda x: x[0])[-1][1]

    # construct tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    word_classes = {
        'id': list(),
        'classes': list()
    }
    for i in range(len(IDS)):
        word_classes['id'].append(IDS[i])

        # verbose
        if i % 1000 == 0:
            print(i)

        # read txt file (context manager so the handle is closed promptly)
        filename = '../input/feedback-prize-2021/{}/{}.txt'.format(folder, IDS[i])
        with open(filename, 'r') as f:
            txt = f.read()
        txt_len = len(txt)
        words_num = len(txt.split())

        # tokenize
        tokens = tokenizer.encode_plus(txt, max_length=MAX_LEN, padding='max_length',
                                       truncation=True, return_offsets_mapping=True)
        offsets = tokens['offset_mapping']

        # extract word class
        word_class = list() # 1D array
        curr_words = list() # list of tuple (token_len, label_name)
        for j in range(MAX_LEN):
            # exit condition
            if len(word_class) == words_num:
                break

            # get current token index
            t_start = offsets[j][0]
            t_end = offsets[j][1]

            # special/padding tokens have an empty range and some tokenizers
            # emit whitespace-only tokens; neither belongs to any word
            if not txt[t_start:t_end].strip():
                continue

            token_len = t_end - t_start
            curr_label = ind_to_label[int(pred[i, j])]
            curr_words.append((token_len, curr_label))

            # a word ends at any whitespace (same notion as str.split()),
            # not only at a plain space
            if t_end >= txt_len or txt[t_end].isspace():
                word_class.append(longest_token_label(curr_words))
                curr_words = list()

        # truncation can cut the last word in half: keep what was predicted
        if curr_words and len(word_class) < words_num:
            word_class.append(longest_token_label(curr_words))
        word_classes['classes'].append(word_class)

    return word_classes

def form_raw_df(word_classes):
    """Turn per-word BIO labels into one row per predicted discourse segment.

    A segment opens on a ``<class>_b`` label and is extended by following
    labels of the same class. It is closed by the next ``_b`` label, by a
    label of a different class (which is itself discarded), or by the end of
    the essay. Labels without a suffix (``'other'``) are treated as ``_i``.

    Args:
        word_classes: dict with ``'id'`` (list of essay ids) and ``'classes'``
            (list of per-essay lists of label names).

    Returns:
        ``pandas.DataFrame`` with columns ``id``, ``class`` and
        ``predictionstring`` (space-separated word indices of the segment).
    """
    res = {
        'id': list(),
        'class': list(),
        'predictionstring': list(), # 2D array
    }

    def close_segment(doc_id, seg):
        # write the finished segment (if any) to the result columns
        if not seg:
            return
        res['id'].append(doc_id)
        res['class'].append(seg[-1][0])
        res['predictionstring'].append(' '.join([str(k[1]) for k in seg]))

    for doc_id, labels in zip(word_classes['id'], word_classes['classes']):
        curr_seg = list() # (class, idx)
        for j, label in enumerate(labels):
            curr_class = label.split('_')
            class_name = curr_class[0]
            pos = curr_class[1] if len(curr_class) > 1 else 'i'

            if pos == 'b': # beginning of a segment: close the previous one
                close_segment(doc_id, curr_seg)
                curr_seg = [(class_name, j)]
            elif curr_seg and class_name == curr_seg[-1][0]: # inside the open segment
                curr_seg.append((class_name, j))
            else: # class changed without a '_b', or nothing is open
                close_segment(doc_id, curr_seg)
                curr_seg = list()

        # a segment that runs to the last word used to be dropped silently
        close_segment(doc_id, curr_seg)

    return pd.DataFrame(res)

def post_processing_mode(folder, word_classes): # determine the class of a sentence by mode of label
    """Assign each sentence the most frequent class among its word labels.

    The essay is split into sentences at words ending in ``.``, ``!`` or ``?``.
    Each sentence gets the majority class of its words (``_b``/``_i`` suffixes
    ignored; on a tie the class seen last wins). Consecutive sentences of the
    same essay with the same class are merged into one row.

    Args:
        folder: sub-folder of ``../input/feedback-prize-2021`` holding the
            ``<id>.txt`` files.
        word_classes: dict with ``'id'`` (list of essay ids) and ``'classes'``
            (list of per-essay lists of per-word label names).

    Returns:
        ``pandas.DataFrame`` with columns ``id``, ``class`` and
        ``predictionstring`` (space-separated word indices).
    """
    res = {
        'id': list(),
        'class': list(),
        'predictionstring': list(), # 2D array
    }

    def find_mode_label(arr):
        label_ct = dict()
        for label in arr:
            label_ct[label] = label_ct.get(label, 0) + 1
        # stable sort by count: among equally frequent labels the last one wins
        return sorted(label_ct.items(), key=lambda x: x[1])[-1][0]

    def flush_sentence(doc_id, sentence):
        # emit the sentence, merging it into the previous row when that row
        # belongs to the same essay and has the same class
        label = find_mode_label([k[1].split('_')[0] for k in sentence])
        word_ids = ' '.join([str(k[0]) for k in sentence])

        if len(res['id']) > 0 and doc_id == res['id'][-1] and label == res['class'][-1]:
            res['predictionstring'][-1] += ' ' + word_ids
        else:
            res['id'].append(doc_id)
            res['class'].append(label)
            res['predictionstring'].append(word_ids)

    ending = ['.', '!', '?']
    for i in range(len(word_classes['id'])):
        doc_id = word_classes['id'][i]

        # read txt file (context manager so the handle is closed promptly)
        filename = '../input/feedback-prize-2021/{}/{}.txt'.format(folder, doc_id)
        with open(filename, 'r') as f:
            words = f.read().split()

        curr_sentence = list() # list of tuples: (word_idx, class_name)
        for j in range(len(word_classes['classes'][i])):
            curr_sentence.append((j, word_classes['classes'][i][j]))
            if words[j][-1] in ending:
                flush_sentence(doc_id, curr_sentence)
                curr_sentence = list()

        # words after the last sentence terminator
        if len(curr_sentence) > 0:
            flush_sentence(doc_id, curr_sentence)
    return pd.DataFrame(res)

# =====================================================================
# class index (token label index // 2) -> discourse type; 7 is the 'other' label
target_map_rev = {0: 'Lead', 1: 'Position', 2: 'Evidence', 3: 'Claim', 4: 'Concluding Statement', 5: 'Counterclaim', 6: 'Rebuttal', 7: 'blank'}

def get_preds(dataset = 'train', verbose = True, text_ids = None, preds = None):
    """Decode per-token class indices into word-level segment predictions.

    Token labels are laid out as ``2 * class`` for the beginning of a segment
    and ``2 * class + 1`` for its continuation. A segment is a beginning token
    followed by continuation tokens of the same class; its tokens are mapped
    to whitespace-word indices and only segments covering more than four
    words are kept.

    Reads the notebook globals ``MODEL_NAME`` and ``MAX_LEN``.

    Args:
        dataset: sub-folder of ``../input/feedback-prize-2021`` holding the
            ``<id>.txt`` files.
        verbose: unused, kept for call compatibility.
        text_ids: essay ids, aligned with the rows of ``preds``.
        preds: 2-D array of predicted label indices, indexed ``[essay, token]``.

    Returns:
        ``pandas.DataFrame`` with columns ``id``, ``class`` and
        ``predictionstring``; empty (but with those columns) when nothing is
        predicted.
    """
    # construct tokenizer
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    all_predictions = list()
    for id_num in range(len(preds)):
        n = text_ids[id_num]
        name = f'../input/feedback-prize-2021/{dataset}/{n}.txt'
        # context manager so the handle is closed promptly
        with open(name, 'r') as f:
            txt = f.read()
        tokens = tokenizer.encode_plus(txt, max_length = MAX_LEN, padding = 'max_length', truncation = True, return_offsets_mapping = True)
        off = tokens['offset_mapping']

        # character index at which each word starts; any whitespace separates
        # words so the indices agree with txt.split()
        w = list()
        blank = True
        for i, ch in enumerate(txt):
            if ch.isspace():
                blank = True
            elif blank:
                w.append(i)
                blank = False
        if not w:
            # no words at all: nothing can be predicted for this text
            continue
        w.append(1e6) # sentinel so w[w_i + 1] always exists

        # token position -> word index (-1 for special/padding tokens)
        word_map = -1 * np.ones(MAX_LEN, dtype = 'int32')
        w_i = 0
        for i in range(len(off)):
            if off[i][1] == 0: continue
            while off[i][0] >= w[w_i + 1]: w_i += 1
            word_map[i] = int(w_i)

        # after halving, whole numbers are 'begin' labels (or 'other' at 7.0)
        # and x.5 is the continuation label of class x
        pred = preds[id_num,] / 2.0
        i = 0
        while i < MAX_LEN:
            prediction = []
            start = pred[i]
            if start in [0, 1, 2, 3, 4, 5, 6, 7]:
                prediction.append(word_map[i])
                i += 1
                if i >= MAX_LEN: break
                while pred[i] == start + 0.5:
                    if not word_map[i] in prediction: prediction.append(word_map[i])
                    i += 1
                    if i >= MAX_LEN: break
            else: i += 1
            prediction = [x for x in prediction if x != -1]
            # drop very short segments
            if len(prediction) > 4: 
                all_predictions.append((n, target_map_rev[int(start)], ' '.join([str(x) for x in prediction])))

    # MAKE DATAFRAME (columns given up front so an empty result is still valid)
    df = pd.DataFrame(all_predictions, columns = ['id', 'class', 'predictionstring'])
    return df

In [9]:
# Inference configuration. MODEL_NAME and MAX_LEN are notebook globals that
# get_preds() reads directly, so they must be set before it is called.
# MODEL_NAME = "bert-base-cased"
# MODEL_NAME = "../input/feedbacksaved/BERT" # load from pretrained.
# MAX_LEN = 512

# MODEL_NAME = 'allenai/longformer-base-4096'
MODEL_NAME = '../input/feedbacksaved/LongFormer'
MAX_LEN = 1024

# build and load model
# (the model is created inside the distribution strategy scope chosen earlier)
with strategy.scope():
    model = build_model(MODEL_NAME=MODEL_NAME, MAX_LEN=MAX_LEN)
model.load_weights('../input/feedbacksaved/LongFormer_entire.h5')
print('Model Loading Complete.')

# load test data
test_ids, test_attention, test_IDS = load_test_data(MODEL_NAME=MODEL_NAME, MAX_LEN=MAX_LEN)
print('Test Data Loading Complete.')

# make prediction
# argmax over the last axis turns the per-token class probabilities into
# one class index per token
test_pred = model.predict([test_ids, test_attention], batch_size=4, verbose=2).argmax(axis=-1)
print('Prediction Complete.')

All model checkpoint layers were used when initializing TFLongformerModel.

All the layers of TFLongformerModel were initialized from the model checkpoint at ../input/feedbacksaved/LongFormer.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFLongformerModel for predictions without further training.


Model Loading Complete.
0
Test Data Loading Complete.
2/2 - 16s
Prediction Complete.


In [10]:
# form word-class map
# NOTE(review): word_label_map is only referenced by the commented-out
# post-processing alternatives below; the get_preds() path does not use it.
word_label_map = word_to_label('test', test_pred, test_IDS, MODEL_NAME=MODEL_NAME, MAX_LEN=MAX_LEN)
print('Word-Class Mapping Complete.')

# post processing
# raw_df = form_raw_df(word_label_map)
# test_res_int = raw_df
# test_res_int = post_processing_mode('test', word_label_map)

# # quick check
# print(test_res_int.shape)
# test_res_int.head(10)

0
Word-Class Mapping Complete.


In [11]:
# Decode the token-level predictions into one row per predicted segment.
test_res_int = get_preds(dataset='test', verbose=False, text_ids=test_IDS, preds=test_pred)
# Disabled experiment: drop segments shorter than a per-class minimum word count.
# map_clip = {'Lead':9, 'Position':5, 'Evidence':14, 'Claim':3, 'Concluding Statement':11, 'Counterclaim':6, 'Rebuttal':4}
# def threshold(df):
#     df = df.copy()
#     for key, value in map_clip.items():
#     # if df.loc[df['class']==key,'len'] < value 
#         index = df.loc[df['class']==key].query(f'len<{value}').index
#         df.drop(index, inplace = True)
#     return df

# test_res_int['len'] = test_res_int['predictionstring'].apply(lambda x:len(x.split()))
# test_res_int = threshold(test_res_int)

# quick check
print(test_res_int.shape)
test_res_int.head(10)

(40, 3)


Unnamed: 0,id,class,predictionstring
0,0FB0700DAF44,Lead,0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18...
1,0FB0700DAF44,Position,41 42 43 44 45 46 47
2,0FB0700DAF44,Claim,49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
3,0FB0700DAF44,Claim,65 66 67 68 69 70 71 72 73 74
4,0FB0700DAF44,Evidence,120 121 122 123 124 125 126 127 128 129 130 13...
5,0FB0700DAF44,Claim,314 315 316 317 318 319 320 321 322 323 324 32...
6,0FB0700DAF44,Evidence,342 343 344 345 346 347 348 349 350 351 352 35...
7,0FB0700DAF44,Concluding Statement,560 561 562 563 564 565 566 567 568 569 570 57...
8,D72CB1C11673,Lead,0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18...
9,D72CB1C11673,Position,51 52 53 54 55 56 57 58 59 60


In [12]:
# write to file
# (index=False keeps the DataFrame index out of the CSV, leaving only the
# id / class / predictionstring columns)
test_res_int.to_csv('submission.csv',index=False)