# Notes
## Papers
* [Text Segmentation as a Supervised Learning Task](https://arxiv.org/pdf/1803.09337.pdf)

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        pass
        #print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing and exploring data

In [2]:
pd_annotations = pd.read_csv('/kaggle/input/feedback-prize-2021/train.csv')
document_ids = list(pd_annotations['id'].unique())

num_documents = 100

pd_annotations.head()

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,423A1CA112E2,1622628000000.0,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1622628000000.0,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,423A1CA112E2,1622628000000.0,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3,423A1CA112E2,1622628000000.0,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4,423A1CA112E2,1622628000000.0,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...


In [3]:
# Let's look at txt with id '0000D23A521A'
print(pd_annotations[pd_annotations.id == '0000D23A521A']['predictionstring'].iloc[0])
print(pd_annotations[pd_annotations.id == '0000D23A521A']['predictionstring'])

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
59951    0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18...
59952    34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 4...
59953         69 70 71 72 73 74 75 76 77 78 79 80 81 82 83
59954    84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 9...
59955    117 118 119 120 121 122 123 124 125 126 127 12...
59956    134 135 136 137 138 139 140 141 142 143 144 14...
59957    154 155 156 157 158 159 160 161 162 163 164 16...
59958    186 187 188 189 190 191 192 193 194 195 196 19...
Name: predictionstring, dtype: object


# Segmentation
Let's start with segmenting the text. We'll work on classifying those segments later.

Per Koshorek et al:
> Input \\(x\\) will be represented as a sequence of \\(n\\) sentences \\(s_1,...,s_n\\), and the label \\(y = (y_1,...,y_{n-1})\\) is a segmentation of the document, represented by \\(n-1\\) binary values, where \\(y_i\\) denotes whether \\(s_i\\) ends a segment.

\\(y_i = 0\\) indicates \\(s_i\\) is not a border, while a value of \\(y_i = 1\\) indicates \\(s_i\\) is a border sentence.

In [4]:

WHITE_SPACE_CHARS = [' ', '\t', '\n', '\r', '\x0b', '\x0c']
from transformers import BertTokenizerFast

# This will let us grab text and an associated label. 'O' means "outside" of any relevant segments.
def get_entities(text, doc_id):
    token_array = np.zeros(298)
    char_array = np.zeros((len(text)))
    
    entities = []
    
    pd = pd_annotations[pd_annotations['id'] == doc_id]
    
    start_i = [int(row) for row in list(pd['discourse_start'])]
    end_i = [int(row) for row in list(pd['discourse_end'])]
    annotations = [row for row in list(pd['discourse_type'])]
    
    print(start_i)
    text_i = 0
    for i in range(len(start_i)):
        if text_i < start_i[i]:
            entities.append((text[ text_i : start_i[i] ], 'O'))
            text_i = start_i[i]
        entities.append((text[ start_i[i] : end_i[i] ], annotations[i]))
        text_i = end_i[i]
    return entities

all_entities = {}
for uid in document_ids[0:num_documents]:
    doc = open(f'/kaggle/input/feedback-prize-2021/train/{uid}.txt').read()
    all_entities[uid] = get_entities(doc, uid)

[8, 230, 313, 402, 759, 887, 1151, 1534, 1603, 1891]
[18, 86, 203, 1031]
[36, 513, 594, 727, 1245, 1440, 1682]
[0, 455, 528, 569, 589, 782, 937, 1404, 1507, 2043, 2188, 2878]
[21, 380, 461, 541, 955, 1052, 1530, 1604, 1914]
[52, 200, 245, 295, 418, 516, 590, 1162, 1220, 1718, 1843, 2444]
[24, 95, 295, 546, 682, 1009, 1359, 1480, 1945]
[0, 429, 544, 725, 1333, 1498, 1870, 2535]
[63, 129, 252, 1030, 1161, 1350, 1512, 1859]
[0, 779, 1000, 1118, 1351, 1467]
[0, 223, 295, 319, 693, 785, 937]
[0, 78, 634, 976, 1525, 1637, 2287]
[0, 462, 532, 641, 838, 999, 1348, 1455, 1641]
[0, 68, 108, 150, 199, 274, 1048]
[20, 371, 477, 1322, 1436, 1498, 2131, 2256, 3162, 3218]
[0, 360, 554, 688, 767, 777, 802, 943, 1285, 1390, 1910, 2005, 2510, 2675]
[0, 608, 719, 879, 1284, 1368]
[0, 145, 393, 686, 861, 953, 1425, 1487, 1889]
[32, 544, 637, 696, 726, 763, 872, 1437, 1601, 2161, 2335, 2842]
[20, 418, 484, 997, 1211, 1384, 1724, 1786]
[0, 201, 309, 942, 1299, 1450, 2027, 2155, 2711]
[17, 260, 317, 357, 399

# Text preprocessing
Now we have a bunch of segments. Let's go ahead and get them ready for encoding.
* Lematize
* Tokenize
* word2vec embeddings

In [5]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

lemmatizer = WordNetLemmatizer()

preprocessed_entities = {}

for uid in all_entities:
    preprocessed_entities[uid] = []
    for entity in all_entities[uid]:
        tokenized = word_tokenize(entity[0])
        tokenized = [word for word in tokenized if word not in stop_words]
        lemmatized = [lemmatizer.lemmatize(token) for token in tokenized]
        preprocessed_entities[uid].append(lemmatized)

    

In [6]:
# After all texts preprocessed, we want to build up our dictionary for one hot encoding

index_to_word = []
word_to_index = {}

for uid in preprocessed_entities:
    for entity in preprocessed_entities[uid]:
        for token in entity:
            if token not in index_to_word:
                index_to_word.append(token)
                
for i in range(len(index_to_word)):
    word_to_index[index_to_word[i]] = i

# How many words are there in the corpus?
num_input_tokens = len(index_to_word)
num_documents = len(preprocessed_entities)
max_token_length = max([sum([len(entity) for entity in preprocessed_entities[uid]]) for uid in preprocessed_entities])


In [8]:
DISCOURSE_TYPES = {'O': 0, 'Lead': 2, 'Position': 4, 'Claim': 6, 'Counterclaim': 8, 'Rebuttal': 10, 'Evidence': 12, 'Concluding Statement': 14}
IBO_LABELS = {'O': 0, 'LB': 1, 'LI': 2, 'PB': 3, 'PI': 4, 'CB': 5, 'CI': 6, 'XB': 7, 'XI': 8, 'RB': 9, 'RI': 10, 'EB':11, 'EI': 12, 'SB': 13, 'SI': 14}

# Build the target array
targets = {}
for uid in preprocessed_entities:
    targets[uid] = []
    for i in range(len(preprocessed_entities[uid])): # For each entity
        for j in range(len(preprocessed_entities[uid][i])): # For each token
            annotation = all_entities[uid][i][1]
            label = DISCOURSE_TYPES[annotation]
            
            if j == 0 and label > 0:
                label -= 1
                
            targets[uid].append(label)

In [9]:
# One hot encoding for input and output



one_hot_input = np.zeros((num_documents, max_token_length, num_input_tokens))
y = np.zeros((num_documents, max_token_length, len(IBO_LABELS)))

for i, uid in enumerate(preprocessed_entities):
    k = 0
    print(uid)
    for entity in preprocessed_entities[uid]:
        for token in entity:
            one_hot_input[i][k][word_to_index[token]] = 1
            y[i][k][targets[uid][k]] = 1
            k += 1

423A1CA112E2
A8445CABFECE
6B4F7A0165B9
E05C7F5C1156
50B3435E475B
DBF7EB6A9E02
810B70E80E1D
CE98789F502B
A97DE0D49AEA
48D3F4243F0F
AC594194F01C
4F0E197053FF
C3811E7F1750
86C1ED49C35F
019328A0D7A3
4B6C254FEE39
F054050F442F
4C30EEDA3A8F
20FD63F49519
21730F71662E
A783D3241786
DE628D1F2F9D
1B1FA8C3F4F9
2E98ECF2CA61
D0CBBD43827C
E34D7384EE70
CC296299ABA4
62644C50869C
D7D83D1EBFDB
7E29804EE125
1CCC2C060AA2
0421128DEE6C
42048FB6EC2B
7FF6281EC288
5B8AD3907163
89808E74DDC5
77FADB16D0F4
2F2607C7D8F7
B7F586D0260C
326B272D36A0
4B30291A725D
4662057D0A77
E527586F851C
5F1CF4B91975
F425F44374DD
5613F9FB2154
D14A82EE41BF
8BFC5B17C5AF
077DD935321C
AA994A6CAF65
6A2F708CAA8B
D59E1F10092B
354946A1CA46
F3E71A1A4F8B
DA1C845AB04A
743904BAD7E5
3828201E7783
C30B52D6E340
57E2E8E20B45
1D35A6980E7F
41EF348E3016
9F5A37599E7B
40CC76613B2D
4000B8222A07
2022539CFE3E
15128715053E
E3830AB95CD7
E92185894096
9B45D0A9E4C0
3DCD0C034E88
C7A316555DF7
C161EEA83234
6B2C2AFDFC90
2C42788D171D
D5D31918A943
B0E93CC3E195
66BD5DA864C8

If all has gone well, targets and inputs should have the same number of tokens encoded.

# Model building

In [10]:
from tensorflow import keras
from keras.layers import Input, LSTM, Bidirectional, Dense
from keras.models import Model

# Create the input layer
encoder_inputs = Input(shape=(None, num_input_tokens))

# Create the LSTM layer:
encoder_lstm = LSTM(256, return_sequences=True, return_state=True)
encoder_bi = Bidirectional(encoder_lstm)

# Retrieve the outputs and states:
encoder_outputs, forward_h, forward_c, backward_h, backward_c = encoder_bi(encoder_inputs)

# Per Barrow et al. "the maximum and mean of all hidden 
#   states are concatenated with the final hidden states,
#   and this is used as the sentence encoding."
# For now... we'll just concatenate the final hidden states.
encoder_states = [forward_h, backward_h]

test_dense = Dense(len(IBO_LABELS), activation='softmax')
final_output = test_dense(encoder_outputs)

test_dense = Dense(len(IBO_LABELS), activation='softmax')
final_output = test_dense(encoder_outputs)

2022-01-04 04:10:49.179810: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-04 04:10:49.180889: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-04 04:10:49.181590: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-04 04:10:49.183251: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

In [11]:
model = Model(encoder_inputs, final_output)
model.summary()
model.compile(optimizer='rmsprop', 
              loss='categorical_crossentropy',
              metrics=['accuracy'])

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None, 3115)]      0         
_________________________________________________________________
bidirectional (Bidirectional [(None, None, 512), (None 6905856   
_________________________________________________________________
dense (Dense)                (None, None, 15)          7695      
Total params: 6,913,551
Trainable params: 6,913,551
Non-trainable params: 0
_________________________________________________________________


In [12]:
batch_size = 50
epochs = 50

model.fit(one_hot_input, y, batch_size=batch_size, epochs=epochs, validation_split=0.2)

2022-01-04 04:10:55.826221: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/50


2022-01-04 04:10:59.568740: I tensorflow/stream_executor/cuda/cuda_dnn.cc:369] Loaded cuDNN version 8005


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x7efbf9f7ec10>

In [13]:
# Predicting segment bounds

# Create the input layer
seg_inputs = Input(shape=(None, num_input_tokens))

# Create the LSTM layer:
seg_lstm = LSTM(256, return_sequences=True, return_state=True)
seg_bi = Bidirectional(encoder_lstm)

# Retrieve the outputs and states:
seg_outputs, forward_h, forward_c, backward_h, backward_c = sentence_encoder_bi(sentence_encoder_inputs)


NameError: name 'sentence_encoder_bi' is not defined