In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


# 1. Load Data

In [2]:
import os
print(os.getcwd())

/kaggle/working


In [3]:
sample = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')

In [4]:
sample.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


In [5]:
train = pd.read_csv('../input/nlp-getting-started/train.csv')
test = pd.read_csv('../input/nlp-getting-started/test.csv')

# 2. Exploratory Data Analysis

In [6]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [7]:
print(len(train))
print(len(test))
print(len(sample))
print(train.index, test.index)
print(train.columns, test.columns)

7613
3263
3263
RangeIndex(start=0, stop=7613, step=1) RangeIndex(start=0, stop=3263, step=1)
Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object') Index(['id', 'keyword', 'location', 'text'], dtype='object')


In [8]:
print(train['text'].str.len().max())
print(train['text'].str.len().min())
print(train['text'].str.len().value_counts())


157
7
text
136    401
139    275
138    256
140    218
137    210
      ... 
150      2
149      1
157      1
151      1
152      1
Name: count, Length: 147, dtype: int64


In [9]:
print(train['keyword'].isna().count())

7613


In [10]:
train_x = np.array(train['text'])
train_y = np.array(train['target'])
test_x = np.array(test['text'])

In [11]:
print(train_x.shape)
print(train_y.shape)
print(test_x.shape)

(7613,)
(7613,)
(3263,)


# 3. Model Development 

In [12]:
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers, losses, metrics
from tensorflow.keras.preprocessing import text, sequence

In [13]:
tokenizer = text.Tokenizer(num_words = 10000, oov_token = '<oov>')
tokenizer.fit_on_texts(train_x)
train_sequences = tokenizer.texts_to_sequences(train_x)
train_pad_x = sequence.pad_sequences(train_sequences, maxlen= 20, truncating = 'post', padding = 'post')

In [14]:
test_sequences = tokenizer.texts_to_sequences(test_x)
test_pad_x = sequence.pad_sequences(test_sequences, maxlen = 20, padding = 'post', truncating = 'post')

In [15]:
from tensorflow.keras import regularizers, callbacks

In [16]:
def Smodel(vocab_size, d_model):
    inputs = layers.Input(shape= (None,))
    x = layers.Embedding(vocab_size, d_model)(inputs)
    x, fh, fc, bh, bc = layers.Bidirectional(layers.LSTM(16, return_sequences = True, return_state = True))(x)
    h = layers.Concatenate()([fh, bh])
    c = layers.Concatenate()([fc, bc])
    x, h1, c1 = layers.LSTM(32, kernel_regularizer=regularizers.l2(0.1), return_state = True, return_sequences = True,
                    bias_regularizer= regularizers.l2(0.2))(x, initial_state= [h, c]) 
    h11 = layers.Concatenate()([h, h1])
    c11 = layers.Concatenate()([c, c1])
    x = layers.LSTM(64, kernel_regularizer=regularizers.l2(0.1), bias_regularizer= regularizers.l2(0.2))(x, initial_state= [h11, c11]) 
    x = layers.Dense(64, activation= 'relu', kernel_regularizer=regularizers.l2(0.1), bias_regularizer= regularizers.l2(0.2))(x)
    x = layers.Dropout(0.4)(x)
    outputs = layers.Dense(1, activation = 'sigmoid')(x)
    return models.Model(inputs = inputs, outputs = outputs)

In [17]:
vocab_size = len(tokenizer.word_index) + 1
d_model = 512

In [18]:
model = Smodel(vocab_size, d_model)

In [19]:
model.compile(optimizer = optimizers.Adam(learning_rate = 0.00005),
             loss = losses.BinaryCrossentropy(),
             metrics = ['accuracy'])

In [20]:
model.fit(train_pad_x, train_y, validation_split= 0.15, epochs = 4, shuffle = True)

Epoch 1/4
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 149ms/step - accuracy: 0.5517 - loss: 36.2214 - val_accuracy: 0.5342 - val_loss: 33.9419
Epoch 2/4
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 145ms/step - accuracy: 0.5958 - loss: 33.2320 - val_accuracy: 0.6349 - val_loss: 31.2522
Epoch 3/4
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 145ms/step - accuracy: 0.7325 - loss: 30.5961 - val_accuracy: 0.7531 - val_loss: 28.8811
Epoch 4/4
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 146ms/step - accuracy: 0.8472 - loss: 28.2454 - val_accuracy: 0.7750 - val_loss: 26.8006


<keras.src.callbacks.history.History at 0x7a709fc71720>

# 4. Inference 

In [21]:
predict = model.predict(test_pad_x)

[1m102/102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step


In [22]:
predict = (predict > 0.5).astype('int')

In [23]:
sample

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0
...,...,...
3258,10861,0
3259,10865,0
3260,10868,0
3261,10874,0


In [24]:
len(predict)

3263

In [25]:
test

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan
...,...,...,...,...
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...


# 5. Submission

In [26]:
import copy
test1 = copy.deepcopy(test)

In [27]:
test1['target'] = predict


In [28]:
test1.head()

Unnamed: 0,id,keyword,location,text,target
0,0,,,Just happened a terrible car crash,1
1,2,,,"Heard about #earthquake is different cities, s...",0
2,3,,,"there is a forest fire at spot pond, geese are...",1
3,9,,,Apocalypse lighting. #Spokane #wildfires,0
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,1


In [29]:
test1.drop(columns = ['keyword', 'location', 'text'], inplace = True)
test1.head()

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,0
4,11,1


In [30]:
test1 = test1[['id', 'target']]

In [31]:
test1.to_csv('results.csv', index = False)