In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Echo the full path of every file found anywhere beneath the input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


# Dataset

In [4]:
# Read both competition CSVs, pinning compact integer dtypes for the id and label columns.
train_df = pd.read_csv(
    "/kaggle/input/nlp-getting-started/train.csv",
    encoding="latin-1",
    dtype={'id': np.int16, 'target': np.int8},
)
test_df = pd.read_csv(
    "/kaggle/input/nlp-getting-started/test.csv",
    encoding="latin-1",
    dtype={'id': np.int16},
)

# Only the number of rows (first entry of .shape) is reported.
train_total_row = train_df.shape[0]
test_total_row = test_df.shape[0]
print(f"Training dataset shape is: {train_total_row}")
print(f"Test dataset shape is is {test_total_row}")

Training dataset shape is: 7613
Test dataset shape is is 3263


In [6]:
# Count the training rows whose location / keyword / text entry is null.
missed_loc = int(train_df['location'].isnull().sum())
missed_kw = int(train_df['keyword'].isnull().sum())
missed_text = int(train_df['text'].isnull().sum())

print(f"Missing location: {missed_loc}")
print(f"Missing keywords: {missed_kw}")
print(f"Missing text: {missed_text}")


Missing location: 2533
Missing keywords: 61
Missing text: 0


### Class distribution of the training set

In [7]:
# Tally the training rows per label value (target == 1 vs. target == 0).
labels = train_df['target']
num_pos = int((labels == 1).sum())
num_neg = int((labels == 0).sum())

print(f"Number of positive sample: {num_pos}")
print(f"Number of negative sample: {num_neg}")

Number of positive sample: 3271
Number of negative sample: 4342


### Loading dataset

In [27]:
class Dictionary:
    """Two-way vocabulary: word -> integer id, and integer id -> word.

    Ids are handed out consecutively, starting at 0, in order of first insertion.
    """

    def __init__(self):
        self.word2idx = {}  # word -> id
        self.idx2word = []  # position i holds the word whose id is i

    def add_word(self, word):
        """Register ``word`` if it has not been seen yet and return its id."""
        known_id = self.word2idx.get(word)
        if known_id is not None:
            return known_id
        # The next free id is the current vocabulary size.
        new_id = len(self.idx2word)
        self.idx2word.append(word)
        self.word2idx[word] = new_id
        return new_id

    def __len__(self):
        """Number of distinct words registered so far."""
        return len(self.idx2word)

class Corpus:
    """Vocabulary plus the token-id encoding of a DataFrame's ``text`` column.

    Attributes:
        dictionary: The ``Dictionary`` populated while tokenizing.
        data: 1-D ``torch.long`` tensor holding the ids of every token of every
            row, in row order, with an ``'<eos>'`` id appended after each row.
    """

    def __init__(self, DATA_DIR, df):
        self.dictionary = Dictionary()
        self.data = self.tokenize(DATA_DIR, df)

    def tokenize(self, DATA_DIR, df):
        """Build the vocabulary from ``df['text']`` and return the encoded tokens.

        Args:
            DATA_DIR: Accepted for backward compatibility and ignored. The
                previous version re-read this file line by line and looked each
                whitespace-split word up in the vocabulary built from ``df``,
                which raises ``KeyError`` as soon as the raw lines contain a
                word that the ``text`` column does not.
            df: DataFrame with a ``text`` column. Null entries are treated as
                empty strings, so they contribute only an ``'<eos>'`` token.

        Returns:
            A 1-D ``torch.long`` tensor of token ids (empty if ``df`` has no rows).
        """
        ids = []
        # Iterating a DataFrame directly yields its column *names*, so walk the
        # text column explicitly.
        for text in df['text'].fillna(''):
            words = str(text).split() + ['<eos>']
            # add_word returns the id whether or not the word is new, so the
            # vocabulary is built and the text encoded in a single pass.
            ids.extend(self.dictionary.add_word(word) for word in words)

        return torch.tensor(ids, dtype=torch.long)

In [21]:
from torch.utils.data import Dataset, DataLoader


class DisasterDataset(Dataset):
    """Map-style dataset over the rows of the disaster-tweet training frame.

    Args:
        train_df: DataFrame with ``text``, ``target``, ``location`` and
            ``keyword`` columns.

    Each item is a ``(text, label)`` pair: the raw text of the row and its
    target as a 0-d float tensor (cast with ``.long()`` if the loss expects
    class indices). ``location`` and ``kw`` are kept as per-row lists on the
    instance but are not part of the returned item.
    """
    def __init__(self, train_df):
        # Iterating a DataFrame directly yields its column *names*, not rows,
        # so each column is pulled out explicitly.
        self.text = train_df['text'].tolist()
        self.label = train_df['target'].tolist()
        self.location = train_df['location'].tolist()
        self.kw = train_df['keyword'].tolist()

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.text)

    def __getitem__(self, idx):
        """Return ``(text, label)`` for row ``idx``."""
        return self.text[idx], torch.tensor(float(self.label[idx]))

    
# Wrap the training frame in the dataset class and batch it in shuffled order.
batch_size = 32
dataset = DisasterDataset(train_df)  # was NLPDataset, a name that is not defined anywhere
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)


# Model

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class LSTMClassifier(nn.Module):
    """Embedding -> LSTM -> linear classifier producing two logits per sequence.

    Args:
        batch_size: Stored on the instance for callers; the forward pass takes
            the batch size from its input, so a smaller final batch works.
        hidden_dim: Width of both the embedding vectors and the LSTM hidden state.
        lstm_layers: Number of stacked LSTM layers.
        input_size: Number of rows in the embedding table, i.e. the largest
            token id plus one.
    """

    def __init__(self, batch_size, hidden_dim, lstm_layers, input_size):
        super(LSTMClassifier, self).__init__()

        self.batch_size = batch_size
        self.hidden_dim = hidden_dim
        self.LSTM_layers = lstm_layers
        self.input_size = input_size  # number of embeddings (vocabulary size)

        self.dropout = nn.Dropout(0.5)
        self.embedding = nn.Embedding(self.input_size, self.hidden_dim)
        self.lstm = nn.LSTM(input_size=self.hidden_dim, hidden_size=self.hidden_dim, num_layers=self.LSTM_layers, batch_first=True)
        self.fc = nn.Linear(in_features=self.hidden_dim, out_features=2)

    def forward(self, sentence):
        """Compute class logits for a batch of token-id sequences.

        Args:
            sentence: Integer tensor of shape ``(batch, seq_len)``; batch comes
                first because the LSTM is built with ``batch_first=True``.

        Returns:
            Tensor of shape ``(batch, 2)`` with unnormalised class scores.
        """
        embeds = self.embedding(sentence)  # (batch, seq_len, hidden_dim)
        # No initial state is passed, so the LSTM starts every batch from zeros
        # instead of reading a hidden state that was never initialised.
        lstm_out, _ = self.lstm(embeds)
        # With batch_first=True the time axis is dim 1, so the final time step
        # is lstm_out[:, -1]; lstm_out[-1] would pick the last *sample* instead.
        last_step = lstm_out[:, -1, :]
        # Dropout is only active in training mode (model.train()).
        return self.fc(self.dropout(last_step))