In [None]:
import numpy as np
import pandas as pd
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
import re
import tensorflow as tf
from sklearn.model_selection import train_test_split
tf.test.is_gpu_available(
    cuda_only=False, min_cuda_compute_capability=None
)
print(tf.test.gpu_device_name())

In [2]:
df_train = pd.read_csv('train.csv')
df_val = pd.read_csv("val.csv")
text_data = pd.concat([df_train,df_val])
text_data

Unnamed: 0,text,augmented_text
0,barbaric and shocking use of force against,barbaric and shocking use of ofcre agaistn
1,pm says energy policy has almost universal sup...,pm assy energy policy has laomst universal sup...
2,coronavirus tasmania peter gutwein economy impact,coronavirus tasmania peter gutwein econIjy JJpact
3,tamworth police continue hunt for missing man,ta<wortj (olicr continue hunt for miEsiMg man
4,help us secure borders iraq says,help us secure boGcers iraq sz^s
...,...,...
153723,retailers fume over lost easter business,retailers fume over lost easFef bJsijess
153724,malcolm turnbull should put his authority on t...,amclolm turnbull should put his authority on t...
153725,miner considers early bronzewing reopening,Kine$ considers farIy bronzewing reopening
153726,rann government curbs internet debate,raMG government curbs in^e5net debate


1. Data Cleaning and Processing

In [3]:
def clean_data(text):
    text = text.lower()
    text = re.sub(r"[-=+*\"#@!$%^&()`<>\[\]]","",text)
    text = re.sub(r"i'm","i am",text)
    text = re.sub(r"he's","he is",text)
    text = re.sub(r"she's","she is",text)
    text = re.sub(r"it's","it is",text)
    text = re.sub(r"they're","they are",text)
    text = re.sub(r"there're","there are",text)
    text = re.sub(r"there's","there is",text)
    text = re.sub(r"how're","how are",text)
    text = re.sub(r"what're","what are",text)
    text = re.sub(r"where're","where ",text)
    text = re.sub(r"who're","who are",text)
    text = re.sub(r"that're","that are",text)
    text = re.sub(r"when're","when are",text)
    text = re.sub(r"how's","how is",text)
    text = re.sub(r"what's","what is",text)
    text = re.sub(r"where's","where is",text)
    text = re.sub(r"who's","who is",text)
    text = re.sub(r"that's","that is",text)
    text = re.sub(r"when's","when is",text)
    text = re.sub(r"won't","would not",text)
    text = re.sub(r"nt't","can not",text)
    text = re.sub(r"\'bout'","about",text)
    text = re.sub(r"\'till'","untill",text)
    text = re.sub(r"\'ll","will",text)
    text = re.sub(r"\'ve","have",text)
    text = re.sub(r"\'re","are",text)
    text = re.sub(r"\'d","would",text)
    text = re.sub(r"\."," ",text)
    text = re.sub(r"\,"," ",text)
    text = re.sub(r"\!"," ",text)
    text = re.sub(r"\?"," ",text)
    text = re.sub(r"\;"," ",text)
    text = re.sub(r"\:"," ",text)
    return text

    
text_data['text'] = text_data['text'].apply(clean_data)
text_data['augmented_text'] = text_data['augmented_text'].apply(clean_data)




In [4]:
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(text_data['text'])
word_index = tokenizer.word_index


train_sequences = tokenizer.texts_to_sequences(text_data['augmented_text'])
train_padded = pad_sequences(train_sequences, maxlen=20, padding='post',truncating='post')

test_sequences = tokenizer.texts_to_sequences(text_data['text'])
test_padded = pad_sequences(test_sequences, maxlen=20, padding='post',truncating='post')


2 Model Building

In [9]:
vocab_size = len(word_index)+1
embedding_dim = 128
max_length = 20

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim,input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(50)),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(max_length,activation='softmax')
])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=["accuracy"])


In [6]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 20, 128)           12670080  
                                                                 
 bidirectional (Bidirectiona  (None, 20, 200)          183200    
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 100)              100400    
 nal)                                                            
                                                                 
 dropout (Dropout)           (None, 100)               0         
                                                                 
 dense (Dense)               (None, 20)                2020      
                                                                 
Total params: 12,955,700
Trainable params: 12,955,700
No

In [7]:
train_padded[0]

array([19181,    14,  6782,   410,     5,     1,     1,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0])

In [11]:
try:
    with tf.device('/device:GPU:0'):
        model.fit(train_padded, test_padded,epochs=100,verbose=5)
except RuntimeError as e:
  print(e)

Epoch 1/100


KeyboardInterrupt: 