In [1]:
%tensorflow_version 2.x
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


In [2]:
import nltk
# Fetch the NLTK stop-word corpus; it is read later through `stopwords.words('english')`.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
import re

import numpy as np
import pandas as pd
import spacy
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.text import hashing_trick
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM, Dense, Dropout, Activation
from tensorflow.keras.callbacks import EarlyStopping

In [6]:
# Load the training CSV into a DataFrame; later cells read its 'keyword', 'text'
# and 'target' columns.
train = pd.read_csv('/content/train.csv')

In [7]:
# Replace missing keywords with '' so the concatenation below never produces NaN.
# The result is assigned back instead of calling `fillna(..., inplace=True)` on the
# column selection: that form acts on an intermediate object, raises a FutureWarning
# in pandas 2.x and leaves `train` unchanged once Copy-on-Write is enabled.
train['keyword'] = train['keyword'].fillna('')

# Append the keyword to the tweet text so it is tokenised together with the text.
# NOTE: this overwrites 'text', so re-running the cell appends the keyword again.
train['text'] = train['text'] + ' ' + train['keyword']

In [8]:
# Built once per kernel instead of once per token: the original looked up
# `stopwords.words('english')` and constructed a new PorterStemmer for every word,
# which made membership a linear scan over a freshly built list each time.
_STOP_WORDS = frozenset(stopwords.words('english'))
_STEMMER = PorterStemmer()


def preprocess(text):
    """Normalise a piece of text for the downstream encoder.

    Steps: replace every non-ASCII-letter character with a space, lowercase,
    split on whitespace, drop English stop words, Porter-stem what is left and
    re-join with single spaces.

    Parameters
    ----------
    text : str
        Raw input text.

    Returns
    -------
    str
        Space-separated stemmed tokens; an empty string when nothing survives
        the filtering.
    """
    tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    return ' '.join(_STEMMER.stem(token) for token in tokens if token not in _STOP_WORDS)

In [9]:
# Run every entry of the 'text' column through `preprocess`, overwriting the column.
train['text'] = train['text'].apply(preprocess)


In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Hyper-parameters shared by the encoder below and the Embedding layer of the model.
voc_size = 50000       # size of the hashed index space for words
sent_length = 40       # `maxlen` passed to pad_sequences
embedding_dim = 300    # width of each embedding vector

# `one_hot` hashes words with Python's built-in `hash`, which is salted per process,
# so the same word would get a different index after every kernel restart. Hashing
# with md5 keeps the word -> index mapping stable across runs.
X_train = [hashing_trick(words, voc_size, hash_function='md5') for words in train['text']]
X_train = pad_sequences(X_train, padding='pre', maxlen=sent_length)
Y_train = train['target']

In [12]:
# Binary text classifier: embedding -> two stacked LSTMs -> two dense layers -> sigmoid.
model = Sequential()
model.add(Embedding(voc_size, embedding_dim, input_length=sent_length))
model.add(Dropout(0.2))

# return_sequences=True so the second LSTM receives one vector per timestep.
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.2))

model.add(LSTM(64))
model.add(Dropout(0.1))

model.add(Dense(64, activation='relu'))
model.add(Dense(64, activation='relu'))
# The original inserted Activation('softmax') here. Softmax over the 64 hidden units
# forces them to sum to 1 right before the output layer, which throttles the signal
# reaching the sigmoid; the single sigmoid unit is the only output activation needed.
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# validation_split holds out 20% of the samples for per-epoch validation metrics.
model_history = model.fit(X_train, Y_train, validation_split=0.2, batch_size=64, epochs=20, shuffle=True)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
