In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, save_model, load_model
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D ,LSTM ,TimeDistributed
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping , ModelCheckpoint
from keras.layers import Dropout
import re
import os

In [None]:
# Mount Google Drive at /content/drive; the data paths used in later cells live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

# Location of the training CSV on the mounted Drive.
TRAIN_CSV_PATH = "/content/drive/MyDrive/Colab Notebooks/ori_data/train.csv"
train = pd.read_csv(TRAIN_CSV_PATH)


# Load Data From train.csv

In [None]:
# Show column names, dtypes and non-null counts of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144293 entries, 0 to 144292
Data columns (total 8 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  144293 non-null  object 
 1   discourse_id        144293 non-null  float64
 2   discourse_start     144293 non-null  float64
 3   discourse_end       144293 non-null  float64
 4   discourse_text      144293 non-null  object 
 5   discourse_type      144293 non-null  object 
 6   discourse_type_num  144293 non-null  object 
 7   predictionstring    144293 non-null  object 
dtypes: float64(3), object(5)
memory usage: 8.8+ MB


In [None]:
# Number of training rows per discourse_type label (class balance check).
train.discourse_type.value_counts()

Claim                   50208
Evidence                45702
Position                15419
Concluding Statement    13505
Lead                     9305
Counterclaim             5817
Rebuttal                 4337
Name: discourse_type, dtype: int64

In [None]:
import re

# English stop words that clean_text() removes from every text.
STOPWORDS = [ "a", "about", "above", "after", "again", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]
train = train.reset_index(drop=True)
# Punctuation that is replaced by a space. Written as a raw string so that
# `\[`, `\]` and `\|` reach the regex engine verbatim instead of being
# (invalid) Python string escapes; the compiled pattern is unchanged.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Every character that is NOT a digit, a lowercase ASCII letter, a space,
# '#', '+' or '_' is deleted.
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')


def clean_text(text):
    """
        Normalise one text before tokenisation.

        Steps, in order: lowercase the text; replace the punctuation matched
        by REPLACE_BY_SPACE_RE with a space; delete every character matched
        by BAD_SYMBOLS_RE; drop the words listed in STOPWORDS; re-join the
        remaining words with single spaces.

        text: a string

        return: modified initial string
    """
    text = text.lower()
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    # The previous version also ran text.replace('x', ''), which deleted every
    # letter "x" and corrupted ordinary words ("example" -> "eample"). That
    # step has been removed.
    # NOTE(review): apostrophes are already deleted by BAD_SYMBOLS_RE at this
    # point, so contracted stop words such as "he's" can no longer match.
    stopwords = set(STOPWORDS)  # set membership instead of a list scan per word
    return ' '.join(word for word in text.split() if word not in stopwords)
# Normalise every training text with clean_text().
train['discourse_text'] = train['discourse_text'].apply(clean_text)
# Remove digit runs. regex=True is explicit because pandas >= 2.0 treats the
# pattern as a literal string by default, which would leave all digits in place.
train['discourse_text'] = train['discourse_text'].str.replace(r'\d+', '', regex=True)



In [None]:
# Word count of every cleaned training text.
lens_list = [len(sentence.split()) for sentence in train['discourse_text']]
# First position holding the largest word count, together with that count.
max_ind, max_length = max(enumerate(lens_list), key=lambda pair: pair[1])
print('index of maximum lenght(longer sentence): ' ,max_ind )
print('maximum lenght is : ', max_length) 

index of maximum lenght(longer sentence):  57667
maximum lenght is :  377


In [None]:
from keras.preprocessing.text import Tokenizer

# Vocabulary limit handed to the tokenizer as num_words (most frequent words).
MAX_NB_WORDS = 10000
# Fixed sequence length: the word count of the longest cleaned training text.
MAX_SEQUENCE_LENGTH = max_length
# Size of each word-embedding vector.
EMBEDDING_DIM = 300
# The backslash in the filter string is escaped explicitly; the bare `\]` used
# before was an invalid string escape that only worked by accident.
# The resulting filter characters are identical.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(train['discourse_text'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 65442 unique tokens.


In [None]:
from keras.preprocessing.sequence import pad_sequences

# Encode each cleaned training text as word indices, then bring every
# sequence to MAX_SEQUENCE_LENGTH entries.
train_sequences = tokenizer.texts_to_sequences(train['discourse_text'].values)
X = pad_sequences(train_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (144293, 377)


In [None]:
# One indicator column per discourse_type value; Y is the same data as an array.
y_label = pd.get_dummies(train['discourse_type'])
Y = y_label.to_numpy()
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (144293, 7)


In [None]:
# Column order of the one-hot label frame; used later to turn prediction columns back into label names.
label_names = y_label.columns
label_names

Index(['Claim', 'Concluding Statement', 'Counterclaim', 'Evidence', 'Lead',
       'Position', 'Rebuttal'],
      dtype='object')

# Build LSTM with keras

In [None]:
# Training hyper-parameters consumed by the model.fit() cell.
epochs = 5
batch_size = 64

# Embedding -> spatial dropout -> one LSTM layer -> 7-way softmax.
model = Sequential([
    Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X.shape[1]),
    SpatialDropout1D(0.4),
    LSTM(10, dropout=0.4, recurrent_dropout=0.4),
    Dense(7, activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])



In [None]:
# Write the model to best_model.h5, keeping only the best epoch (save_best_only).
# NOTE(review): no visible cell loads best_model.h5 again; later cells use `model` as left after the final epoch.
checkpoint = ModelCheckpoint('best_model.h5', save_best_only = True)
model.fit(
    X,
    Y,
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.1,
    callbacks=[checkpoint],
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f148dd35710>

In [None]:
test_dir = "/content/drive/MyDrive/Colab Notebooks/ori_data/test"
# Full path of every entry found in the test directory.
test_files = [str(test_dir) + "/" + str(entry) for entry in os.listdir(test_dir)]

print("Total number of test files = " , len(test_files))

Total number of test files =  5


# Preprocess Test Data Texts

In [None]:
import os

# Read every file in test_dir: the file name without ".txt" becomes the essay
# id and the file contents become the essay text.
test_names, test_bodies = [], []
for f in os.listdir(test_dir):
    test_names.append(f.replace('.txt', ''))
    # `with` closes each file handle; the previous open(...).read() left them open.
    # assumes the essays are UTF-8 encoded — TODO confirm
    with open(test_dir + '/' + f, 'r', encoding='utf-8') as handle:
        test_bodies.append(handle.read())

test_texts = pd.DataFrame({'id': test_names, 'text': test_bodies})
test_texts

Unnamed: 0,id,text
0,D46BCB48440A,"When people ask for advice,they sometimes talk..."
1,18409261F5C2,80% of Americans believe seeking multiple opin...
2,D72CB1C11673,Making choices in life can be very difficult. ...
3,0FB0700DAF44,"During a group project, have you ever asked a ..."
4,DF920E0A7337,Have you ever asked more than one person for h...


In [None]:
def _split_paragraph(para):
    """Split one paragraph into the text pieces that become discourse segments.

    A paragraph with at most two "word." matches is split on every
    period-plus-whitespace. A longer paragraph is cut in two right after a
    middle sentence end, using the same match index the original code chose.

    Differences from the original implementation:
    * the sentence-ending word stays in the first half (it used to be the
      first word of the second half);
    * the cut is located by character offset of the chosen match, so it can
      no longer raise ValueError when that word is not a standalone
      space-delimited token, nor land on an earlier identical word.
    """
    sentence_ends = list(re.finditer(r'\w+[.]', para))
    if len(sentence_ends) <= 2:
        return re.split(r'[.]\s', para)

    # Same index as before: n/2 for an even count, (n+1)/2 for an odd count.
    middle = (len(sentence_ends) + 1) // 2
    cut = sentence_ends[middle].end()
    # Never cut inside a whitespace-delimited token (e.g. `end."`), so the
    # total word count of the two halves equals that of the paragraph.
    while cut < len(para) and not para[cut].isspace():
        cut += 1
    return [para[:cut], para[cut:]]


def _segment_essay(essay_id, doc):
    """Return one record (dict) per non-empty segment of a single essay.

    Word positions are counted with str.split() and restart for every essay.
    NOTE(review): positions are 1-based here (the first word of an essay is
    1); confirm this matches the indexing expected for `predictionstring`.
    """
    records = []
    start = 0
    for para in re.split(r'[.]\n', doc):
        for segment in _split_paragraph(para):
            n_words = len(segment.split())
            if n_words == 0:
                # Skip empty pieces (e.g. after a trailing ".\n"); they used
                # to produce rows with an empty predictionstring.
                continue
            end = start + n_words
            records.append({
                'discourse_id': essay_id,
                'discourse_text': segment.strip(),
                'discourse_start': start + 1,
                'discourse_end': end,
                'predictionstring': ' '.join(str(k) for k in range(start + 1, end + 1)),
            })
            start = end
    return records


segment_records = [
    record
    for essay_id, doc in zip(test_texts['id'], test_texts['text'])
    for record in _segment_essay(essay_id, doc)
]

testing_data = pd.DataFrame(
    segment_records,
    columns=['discourse_id', 'discourse_text', 'discourse_start',
             'discourse_end', 'predictionstring'],
)

# The per-column lists of the original cell are kept as module-level names.
discourse_id = testing_data['discourse_id'].tolist()
discourse_text = testing_data['discourse_text'].tolist()
discourse_start = testing_data['discourse_start'].tolist()
discourse_end = testing_data['discourse_end'].tolist()
predictionstring = testing_data['predictionstring'].tolist()

testing_data.head()

Unnamed: 0,discourse_id,discourse_text,discourse_start,discourse_end,predictionstring
0,D46BCB48440A,"When people ask for advice,they sometimes talk...",1,27,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,D46BCB48440A,choices. Some reasons I think why advises help...,28,46,28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 4...
2,D46BCB48440A,\nThe reason I think advises help is keeps you...,47,84,47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 6...
3,D46BCB48440A,party. I was so ready to go but i didnt know i...,85,139,85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 1...
4,D46BCB48440A,\nAnother reason I think Advises help is make ...,140,178,140 141 142 143 144 145 146 147 148 149 150 15...


In [None]:
# Total number of segments extracted from the test essays.
len(testing_data)

55

In [None]:
# Apply the same normalisation used for the training texts.
testing_data['test_sentences'] = testing_data['discourse_text'].apply(clean_text)
# Strip digit runs from the CLEANED column. The previous line read from
# 'discourse_text' again and so overwrote the cleaned text with the raw text.
# regex=True is explicit because pandas >= 2.0 defaults to literal matching.
testing_data['test_sentences'] = testing_data['test_sentences'].str.replace(r'\d+', '', regex=True)

  


In [None]:
# Preview the segments together with the new test_sentences column.
testing_data.head()

Unnamed: 0,discourse_id,discourse_text,discourse_start,discourse_end,predictionstring,test_sentences
0,D46BCB48440A,"When people ask for advice,they sometimes talk...",1,27,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...,"When people ask for advice,they sometimes talk..."
1,D46BCB48440A,choices. Some reasons I think why advises help...,28,46,28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 4...,choices. Some reasons I think why advises help...
2,D46BCB48440A,\nThe reason I think advises help is keeps you...,47,84,47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 6...,\nThe reason I think advises help is keeps you...
3,D46BCB48440A,party. I was so ready to go but i didnt know i...,85,139,85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 1...,party. I was so ready to go but i didnt know i...
4,D46BCB48440A,\nAnother reason I think Advises help is make ...,140,178,140 141 142 143 144 145 146 147 148 149 150 15...,\nAnother reason I think Advises help is make ...


In [None]:
# Encode the test segments with the tokenizer fitted on the training texts,
# then bring every sequence to MAX_SEQUENCE_LENGTH entries.
test_sequences = tokenizer.texts_to_sequences(testing_data['test_sentences'].values)
X_test = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X_test.shape)

Shape of data tensor: (55, 377)


In [None]:
# Model scores for every test segment; columns follow the order of label_names.
y_pred = model.predict(X_test)
# Name of the highest-scoring label in each row.
output = list(pd.DataFrame(y_pred, columns=label_names).idxmax(axis=1))

# The former int -> label `mapping` / replace() step was removed: 'class'
# already holds label strings, so its integer keys never matched anything
# (and its ordering did not agree with label_names).
submission_df = pd.DataFrame({
    'id': testing_data['discourse_id'],
    'class': output,
    'predictionstring': testing_data['predictionstring'],
})
submission_df

Unnamed: 0,id,class,predictionstring
0,D46BCB48440A,Evidence,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,D46BCB48440A,Claim,28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 4...
2,D46BCB48440A,Evidence,47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 6...
3,D46BCB48440A,Evidence,85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 1...
4,D46BCB48440A,Evidence,140 141 142 143 144 145 146 147 148 149 150 15...
5,D46BCB48440A,Evidence,179 180 181 182 183 184 185 186 187 188 189 19...
6,D46BCB48440A,Evidence,214 215 216 217 218 219 220 221 222 223 224 22...
7,D46BCB48440A,Evidence,268 269 270 271 272 273 274 275 276 277 278 27...
8,D46BCB48440A,Evidence,307 308 309 310 311 312 313 314 315 316 317 31...
9,D46BCB48440A,Claim,350 351 352 353 354 355 356 357 358 359 360 36...


In [None]:
# Distinct labels that were actually predicted for the test segments.
submission_df['class'].unique()

array(['Evidence', 'Claim', 'Position', 'Rebuttal',
       'Concluding Statement'], dtype=object)

In [None]:
# Write the submission to submission.csv in the working directory, without the index column.
submission_df.to_csv("submission.csv", index=False)

In [None]:
MODEL_NAME = "LSTM_32_5.h5"
MODEL_SAVE = '/content/drive/MyDrive/Model/LSTM_LYC/'+ MODEL_NAME
# Create the target folder first so that saving does not fail when the
# directory does not exist yet on the mounted Drive.
os.makedirs(os.path.dirname(MODEL_SAVE), exist_ok=True)
model.save(MODEL_SAVE)

# Load Model

In [None]:
# Reload from the full path the model was saved to. MODEL_NAME alone is a bare
# file name and would be resolved against the current working directory.
model = load_model(MODEL_SAVE)

