### This is from a Kaggle competition: https://www.kaggle.com/c/nlp-getting-started/data?select=train.csv

In [1]:
import pandas as pd

df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [2]:
df_train = df_train[:10]

In [3]:
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
import sidetable
df_train.stb.missing(style=True)

Unnamed: 0,missing,total,percent
keyword,10,10,100.00%
location,10,10,100.00%
id,0,10,0.00%
text,0,10,0.00%
target,0,10,0.00%


In [5]:
len(max(df_train.text))

79

In [6]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
import numpy as np
def get_pad_sq(df):
    text = np.array(df.text)
    tokenizer = Tokenizer(num_words=len(df.text.unique()))
    tokenizer.fit_on_texts(text)
    sequences = tokenizer.texts_to_sequences(text)
    # Find the max length of rows in sequences
    max_length = 0
    for sentence in sequences:
        max_length = max(max_length, len(sentence))
    df = sequence.pad_sequences(sequences, maxlen=max_length)
    return (df, max_length)

In [7]:
test_id = df_test.id

In [8]:
train, _ = get_pad_sq(df_train)
test, max_review_length = get_pad_sq(df_test)
train_target = np.array(df_train.target)

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(train, train_target, stratify=train_target, test_size=0.2, random_state=1)

Credit : https://medium.com/@mrunal68/text-sentiments-classification-with-cnn-and-lstm-f92652bc29fd

In [10]:
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers import Dense,Conv1D,MaxPooling1D
from tensorflow.keras import regularizers
from keras.layers import LSTM,Dropout
from keras.callbacks import ModelCheckpoint, EarlyStopping

# create the model
embedding_vector_length = 128
top_words = 10000
model = Sequential()
model.add(Embedding(top_words, embedding_vector_length, input_length=max_review_length))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid', kernel_regularizer=regularizers.l1_l2(l1=1e-5, l2=1e-4)))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())
checkpoint = ModelCheckpoint('best_model.h5', monitor='val_accuracy',verbose=0, save_best_only=True)
es = EarlyStopping(monitor='val_accuracy', verbose=0, patience=1)
callbacks_list = [checkpoint, es]
model.fit(X_train, y_train, epochs=4000, batch_size=32,verbose = 1, callbacks = callbacks_list, validation_data=(X_val,y_val))

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 30, 128)           1280000   
_________________________________________________________________
conv1d (Conv1D)              (None, 30, 32)            12320     
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 15, 32)            0         
_________________________________________________________________
lstm (LSTM)                  (None, 100)               53200     
_________________________________________________________________
dense (Dense)                (None, 1)                 101       
Total params: 1,345,621
Trainable params: 1,345,621
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/4000
Epoch 2/4000


<tensorflow.python.keras.callbacks.History at 0x1c9a6728c48>

In [18]:
# Load the model and predict
from keras.models import load_model
model = load_model('best_model.h5')
y_pred = model.predict(test)

In [24]:
y_pred = pd.DataFrame([1 if x >= 0.5 else 0 for x in y_pred], columns=['target'])

In [26]:
y_pred = pd.concat([df_test.id, y_pred], axis=1)

In [27]:
y_pred

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1
