In [10]:
# import libraries
import numpy as np
import pandas as pd

from nltk.util import ngrams
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense


import warnings
warnings.filterwarnings('ignore')

In [11]:
train_df = pd.read_csv('./preprocessing/train.csv')
test_df = pd.read_csv('./preprocessing/test.csv')

In [12]:
# lets use only tweet text to build the model
X = train_df.text
y = train_df.target

test_id = test_df.id
test_df.drop(["id","location","keyword"],1,inplace = True)

In [13]:
# Test train split 
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [14]:
# Bag of Words model
from keras.preprocessing.text import Tokenizer

# fit a tokenizer
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [15]:
# create and apply tokenizer on the training dataset
tokenizer = create_tokenizer(X_train)
X_train_set = tokenizer.texts_to_matrix(X_train, mode = 'freq')

In [16]:
# define the model
def define_model(n_words):
    # define network
    model = Sequential()
    model.add(Dense(128, input_shape=(n_words,), activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # compile network
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics = ['accuracy'])
    # summarize defined model
    model.summary()
    return model

In [17]:
# create the model
n_words = X_train_set.shape[1]
model = define_model(n_words)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               1521408   
                                                                 
 dense_1 (Dense)             (None, 1)                 129       
                                                                 
Total params: 1,521,537
Trainable params: 1,521,537
Non-trainable params: 0
_________________________________________________________________


In [18]:
#fit network
model.fit(X_train_set,y_train,epochs=10,verbose=2)

Epoch 1/10
191/191 - 3s - loss: 0.6272 - accuracy: 0.6680 - 3s/epoch - 16ms/step
Epoch 2/10
191/191 - 3s - loss: 0.4335 - accuracy: 0.8394 - 3s/epoch - 16ms/step
Epoch 3/10
191/191 - 3s - loss: 0.3128 - accuracy: 0.8860 - 3s/epoch - 13ms/step
Epoch 4/10
191/191 - 3s - loss: 0.2396 - accuracy: 0.9184 - 3s/epoch - 14ms/step
Epoch 5/10
191/191 - 3s - loss: 0.1874 - accuracy: 0.9407 - 3s/epoch - 14ms/step
Epoch 6/10
191/191 - 3s - loss: 0.1496 - accuracy: 0.9552 - 3s/epoch - 14ms/step
Epoch 7/10
191/191 - 2s - loss: 0.1215 - accuracy: 0.9635 - 2s/epoch - 13ms/step
Epoch 8/10
191/191 - 2s - loss: 0.1010 - accuracy: 0.9727 - 2s/epoch - 13ms/step
Epoch 9/10
191/191 - 2s - loss: 0.0849 - accuracy: 0.9749 - 2s/epoch - 13ms/step
Epoch 10/10
191/191 - 3s - loss: 0.0738 - accuracy: 0.9782 - 3s/epoch - 13ms/step


<keras.callbacks.History at 0x1e7687b18a0>

In [19]:
# prediction on the test dataset
X_test = test_df.text
X_test_set = tokenizer.texts_to_matrix(X_test, mode = 'freq')
y_pred = model.predict(X_test_set)
y_pred = list(map(lambda x: 1 if x>=0.5 else 0, y_pred))



In [20]:
from sklearn.metrics import accuracy_score

ans = pd.read_csv('./dataset/ans.csv')['target'].values
accuracy_score(y_pred= y_pred, y_true= ans)

0.7640208397180509