In [1]:
import collections, itertools, re
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import metrics
from nltk.tokenize import word_tokenize
import torch
from torch import nn
from skorch import NeuralNetClassifier

# Data

https://www.kaggle.com/competitions/nlp-getting-started/data

In [2]:
Data = pd.read_csv('/home/utk.edu/cliu89/Data/DisasterTweets/train.csv', index_col='id')
Data

Unnamed: 0_level_0,keyword,location,text,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,,,Our Deeds are the Reason of this #earthquake M...,1
4,,,Forest fire near La Ronge Sask. Canada,1
5,,,All residents asked to 'shelter in place' are ...,1
6,,,"13,000 people receive #wildfires evacuation or...",1
7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...
10869,,,Two giant cranes holding a bridge collapse int...,1
10870,,,@aria_ahrary @TheTawniest The out of control w...,1
10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
10872,,,Police investigating after an e-bike collided ...,1


In [3]:
Data['keyword'] = Data.keyword.str.strip()
Data['location'] = Data.location.str.strip()

# Keyword

In [4]:
keyword_counts = Data.keyword.value_counts(dropna=False)
keyword_counts

NaN                      61
fatalities               45
deluge                   42
armageddon               42
sinking                  41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: keyword, Length: 222, dtype: int64

In [5]:
keyword_index = pd.Series(range(len(keyword_counts)), index=keyword_counts.index)
keyword_index

NaN                        0
fatalities                 1
deluge                     2
armageddon                 3
sinking                    4
                        ... 
forest%20fire            217
epicentre                218
threat                   219
inundation               220
radiation%20emergency    221
Length: 222, dtype: int64

In [6]:
keyword_encoded = Data.keyword.map(keyword_index)
keyword_encoded

id
1        0
4        0
5        0
6        0
7        0
        ..
10869    0
10870    0
10871    0
10872    0
10873    0
Name: keyword, Length: 7613, dtype: int64

In [7]:
keyword_encoded.value_counts()

0      61
1      45
2      42
3      42
4      41
       ..
217    19
218    12
219    11
220    10
221     9
Name: keyword, Length: 222, dtype: int64

# Location

In [8]:
location_counts = Data.location.value_counts(dropna=False)
location_counts

NaN                               2533
USA                                105
New York                            73
United States                       50
London                              46
                                  ... 
Layang-Layang, Perak                 1
somewhere in Indiana                 1
Inside your webcam. Stop that.       1
R'lyeh, South Pacific                1
Lincoln                              1
Name: location, Length: 3280, dtype: int64

In [9]:
location_counts = location_counts[location_counts>=10]
location_counts

NaN                  2533
USA                   105
New York               73
United States          50
London                 46
Nigeria                32
Canada                 30
UK                     27
Los Angeles, CA        26
India                  24
Mumbai                 22
Washington, DC         21
Kenya                  20
California             20
Worldwide              19
Chicago, IL            19
Australia              19
New York, NY           16
Everywhere             15
California, USA        15
Los Angeles            15
Washington, D.C.       14
Indonesia              14
United Kingdom         14
San Francisco          14
Florida                14
NYC                    12
Earth                  12
Toronto                12
Ireland                12
Chicago                11
Texas                  11
San Francisco, CA      11
Seattle                11
London, UK             10
London, England        10
Dallas, TX             10
Sacramento, CA         10
ss          

In [10]:
location_index = pd.Series(range(len(location_counts)), index=location_counts.index)
location_index

NaN                   0
USA                   1
New York              2
United States         3
London                4
Nigeria               5
Canada                6
UK                    7
Los Angeles, CA       8
India                 9
Mumbai               10
Washington, DC       11
Kenya                12
California           13
Worldwide            14
Chicago, IL          15
Australia            16
New York, NY         17
Everywhere           18
California, USA      19
Los Angeles          20
Washington, D.C.     21
Indonesia            22
United Kingdom       23
San Francisco        24
Florida              25
NYC                  26
Earth                27
Toronto              28
Ireland              29
Chicago              30
Texas                31
San Francisco, CA    32
Seattle              33
London, UK           34
London, England      35
Dallas, TX           36
Sacramento, CA       37
ss                   38
Atlanta, GA          39
New York City        40
dtype: int64

In [11]:
location_encoded = Data.location.map(location_index).fillna(0).astype(int)
location_encoded

id
1        0
4        0
5        0
6        0
7        0
        ..
10869    0
10870    0
10871    0
10872    0
10873    0
Name: location, Length: 7613, dtype: int64

In [12]:
location_encoded.value_counts()

0     6767
1      105
2       73
3       50
4       46
5       32
6       30
7       27
8       26
9       24
10      22
11      21
12      20
13      20
15      19
14      19
16      19
17      16
19      15
18      15
20      15
21      14
22      14
24      14
23      14
25      14
28      12
26      12
29      12
27      12
33      11
32      11
30      11
31      11
39      10
36      10
38      10
34      10
40      10
37      10
35      10
Name: location, dtype: int64

# Text

In [13]:
def text_tokenize(text):
    text = text.lower()
    text = word_tokenize(text)
    return text

text_tokenized = Data.text.map(text_tokenize)
text_tokenized

id
1        [our, deeds, are, the, reason, of, this, #, ea...
4         [forest, fire, near, la, ronge, sask, ., canada]
5        [all, residents, asked, to, 'shelter, in, plac...
6        [13,000, people, receive, #, wildfires, evacua...
7        [just, got, sent, this, photo, from, ruby, #, ...
                               ...                        
10869    [two, giant, cranes, holding, a, bridge, colla...
10870    [@, aria_ahrary, @, thetawniest, the, out, of,...
10871    [m1.94, [, 01:04, utc, ], ?, 5km, s, of, volca...
10872    [police, investigating, after, an, e-bike, col...
10873    [the, latest, :, more, homes, razed, by, north...
Name: text, Length: 7613, dtype: object

In [14]:
token_counts = collections.Counter(itertools.chain(*text_tokenized))
token_counts = pd.Series(token_counts)
token_counts = token_counts.sort_values()
token_counts

//t.co/lbmekyphm5.pls       1
cristiano                   1
keen                        1
talisman                    1
//t.co/y8xknpqmnj           1
                         ... 
?                        3126
the                      3233
#                        3403
http                     4307
:                        6706
Length: 23068, dtype: int64

In [15]:
token_counts = token_counts[token_counts>=10]
token_counts

handbags      10
hi            10
hospital      10
avoid         10
woke          10
            ... 
?           3126
the         3233
#           3403
http        4307
:           6706
Length: 1589, dtype: int64

In [16]:
token_index = pd.Series(range(1, 1+len(token_counts)), index=token_counts.index)
token_index

handbags       1
hi             2
hospital       3
avoid          4
woke           5
            ... 
?           1585
the         1586
#           1587
http        1588
:           1589
Length: 1589, dtype: int64

In [17]:
def text_index(tokens):
    return [token_index[token] for token in tokens if token in token_index]

text_encoded = text_tokenized.map(text_index)
text_encoded

id
1        [1454, 1556, 1586, 711, 1579, 1560, 1587, 1308...
4                       [1386, 1529, 1339, 761, 1584, 328]
5        [1533, 1580, 1581, 904, 1567, 1556, 1444, 1561...
6                  [1516, 1587, 85, 1315, 168, 1581, 1481]
7        [1542, 1488, 355, 1560, 1232, 1557, 1587, 1546...
                               ...                        
10869    [1455, 748, 276, 505, 1582, 1217, 1415, 1507, ...
10870    [1583, 1583, 1586, 1538, 1579, 850, 1307, 1459...
10871    [1496, 10, 1495, 1585, 25, 1299, 1579, 993, 22...
10872    [1499, 499, 1534, 1530, 1250, 1563, 1582, 1442...
10873    [1586, 1365, 1589, 1519, 1372, 1042, 1561, 138...
Name: text, Length: 7613, dtype: object

In [18]:
text_encoded.map(len).describe()

count    7613.000000
mean       14.359123
std         6.061323
min         0.000000
25%        10.000000
50%        14.000000
75%        18.000000
max        67.000000
Name: text, dtype: float64

In [19]:
def text_pad(tokens, size=67, idx=0):
    return tokens + [idx] * (size - len(tokens))

text_encoded = text_encoded.map(text_pad)
text_encoded

id
1        [1454, 1556, 1586, 711, 1579, 1560, 1587, 1308...
4        [1386, 1529, 1339, 761, 1584, 328, 0, 0, 0, 0,...
5        [1533, 1580, 1581, 904, 1567, 1556, 1444, 1561...
6        [1516, 1587, 85, 1315, 168, 1581, 1481, 0, 0, ...
7        [1542, 1488, 355, 1560, 1232, 1557, 1587, 1546...
                               ...                        
10869    [1455, 748, 276, 505, 1582, 1217, 1415, 1507, ...
10870    [1583, 1583, 1586, 1538, 1579, 850, 1307, 1459...
10871    [1496, 10, 1495, 1585, 25, 1299, 1579, 993, 22...
10872    [1499, 499, 1534, 1530, 1250, 1563, 1582, 1442...
10873    [1586, 1365, 1589, 1519, 1372, 1042, 1561, 138...
Name: text, Length: 7613, dtype: object

In [20]:
text_encoded = pd.DataFrame(text_encoded.to_list(), index=text_encoded.index)
text_encoded

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,57,58,59,60,61,62,63,64,65,66
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1454,1556,1586,711,1579,1560,1587,1308,1435,1487,...,0,0,0,0,0,0,0,0,0,0
4,1386,1529,1339,761,1584,328,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1533,1580,1581,904,1567,1556,1444,1561,1584,1532,...,0,0,0,0,0,0,0,0,0,0
6,1516,1587,85,1315,168,1581,1481,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1542,1488,355,1560,1232,1557,1587,1546,1309,1557,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10869,1455,748,276,505,1582,1217,1415,1507,917,1372,...,0,0,0,0,0,0,0,0,0,0
10870,1583,1583,1586,1538,1579,850,1307,1459,1581,1481,...,0,0,0,0,0,0,0,0,0,0
10871,1496,10,1495,1585,25,1299,1579,993,22,1584,...,0,0,0,0,0,0,0,0,0,0
10872,1499,499,1534,1530,1250,1563,1582,1442,1581,1295,...,0,0,0,0,0,0,0,0,0,0


# Processed Data

In [21]:
X = pd.DataFrame({
    'keyword': keyword_encoded,
    'location': location_encoded,
})
X = pd.concat([X, text_encoded], axis=1)
X

Unnamed: 0_level_0,keyword,location,0,1,2,3,4,5,6,7,...,57,58,59,60,61,62,63,64,65,66
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,1454,1556,1586,711,1579,1560,1587,1308,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1386,1529,1339,761,1584,328,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,1533,1580,1581,904,1567,1556,1444,1561,...,0,0,0,0,0,0,0,0,0,0
6,0,0,1516,1587,85,1315,168,1581,1481,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,1542,1488,355,1560,1232,1557,1587,1546,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10869,0,0,1455,748,276,505,1582,1217,1415,1507,...,0,0,0,0,0,0,0,0,0,0
10870,0,0,1583,1583,1586,1538,1579,850,1307,1459,...,0,0,0,0,0,0,0,0,0,0
10871,0,0,1496,10,1495,1585,25,1299,1579,993,...,0,0,0,0,0,0,0,0,0,0
10872,0,0,1499,499,1534,1530,1250,1563,1582,1442,...,0,0,0,0,0,0,0,0,0,0


In [22]:
Y = Data.target
Y.value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [23]:
X = X.values
Y = Y.values
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(X, Y)
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((5709, 69), (1904, 69), (5709,), (1904,))

# Model

In [24]:
class Model(nn.Module):
    def __init__(self, num_keywords, num_locations, num_tokens, embedding_dim, out_channels):
        super().__init__()
        self.keyword_embedding = nn.Embedding(num_embeddings=num_keywords, embedding_dim=embedding_dim)
        self.location_embedding = nn.Embedding(num_embeddings=num_locations, embedding_dim=embedding_dim)
        self.text_embedding = nn.Embedding(num_embeddings=1+num_tokens, embedding_dim=embedding_dim, padding_idx=0)
        self.conv3 = nn.Conv1d(in_channels=embedding_dim, out_channels=out_channels, kernel_size=3, stride=2)
        self.conv5 = nn.Conv1d(in_channels=embedding_dim, out_channels=out_channels, kernel_size=5, stride=2)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.act = nn.ReLU()
        self.fc = nn.Linear(embedding_dim+embedding_dim+out_channels+out_channels, 2)
        self.out = nn.Softmax(dim=1)
        self.dropout = nn.Dropout(0.25)
    def forward(self, x):
        keyword_embedding = self.keyword_embedding(x[:, 0])
        location_embedding = self.location_embedding(x[:, 1])
        text_embedding = self.text_embedding(x[:, 2:])
        text_embedding = text_embedding.permute(0,2,1)
        text_conv3 = self.pool(self.act(self.conv3(text_embedding))).squeeze()
        text_conv5 = self.pool(self.act(self.conv5(text_embedding))).squeeze()
        x = torch.cat([keyword_embedding, location_embedding, text_conv3, text_conv5], dim=1)
        x = self.fc(x)
        x = self.dropout(x)
        return self.out(x)

In [25]:
model = Model(
    num_keywords = len(keyword_index),
    num_locations = len(location_index),
    num_tokens = len(token_index),
    embedding_dim = 50,
    out_channels = 32,
)
model

Model(
  (keyword_embedding): Embedding(222, 50)
  (location_embedding): Embedding(41, 50)
  (text_embedding): Embedding(1590, 50, padding_idx=0)
  (conv3): Conv1d(50, 32, kernel_size=(3,), stride=(2,))
  (conv5): Conv1d(50, 32, kernel_size=(5,), stride=(2,))
  (pool): AdaptiveMaxPool1d(output_size=1)
  (act): ReLU()
  (fc): Linear(in_features=164, out_features=2, bias=True)
  (out): Softmax(dim=1)
  (dropout): Dropout(p=0.25, inplace=False)
)

In [26]:
classifier = NeuralNetClassifier(
    model,
    max_epochs=100,
    lr=0.05,
    iterator_train__shuffle=True,
)

In [27]:
classifier.fit(X_train, Y_train)

  epoch    train_loss    valid_acc    valid_loss     dur
-------  ------------  -----------  ------------  ------
      1        [36m0.7441[0m       [32m0.6182[0m        [35m0.6557[0m  0.2074
      2        [36m0.6521[0m       [32m0.6690[0m        [35m0.6205[0m  0.1755
      3        [36m0.6312[0m       0.5981        0.6940  0.2099
      4        [36m0.6220[0m       0.6576        0.6282  0.2075
      5        0.6275       0.6594        [35m0.6179[0m  0.2157
      6        [36m0.5858[0m       [32m0.6804[0m        [35m0.5941[0m  0.2110
      7        [36m0.5687[0m       0.6743        0.6255  0.1854
      8        [36m0.5518[0m       [32m0.6953[0m        [35m0.5816[0m  0.2107
      9        [36m0.5286[0m       0.6926        [35m0.5756[0m  0.2209
     10        [36m0.5171[0m       0.6751        0.5953  0.1800
     11        [36m0.4870[0m       [32m0.7093[0m        [35m0.5752[0m  0.1685
     12        [36m0.4711[0m       0.6961        0.5854  0.

<class 'skorch.classifier.NeuralNetClassifier'>[initialized](
  module_=Model(
    (keyword_embedding): Embedding(222, 50)
    (location_embedding): Embedding(41, 50)
    (text_embedding): Embedding(1590, 50, padding_idx=0)
    (conv3): Conv1d(50, 32, kernel_size=(3,), stride=(2,))
    (conv5): Conv1d(50, 32, kernel_size=(5,), stride=(2,))
    (pool): AdaptiveMaxPool1d(output_size=1)
    (act): ReLU()
    (fc): Linear(in_features=164, out_features=2, bias=True)
    (out): Softmax(dim=1)
    (dropout): Dropout(p=0.25, inplace=False)
  ),
)

In [28]:
print(metrics.classification_report(Y_train, classifier.predict(X_train)))

              precision    recall  f1-score   support

           0       0.94      0.94      0.94      3270
           1       0.92      0.91      0.92      2439

    accuracy                           0.93      5709
   macro avg       0.93      0.93      0.93      5709
weighted avg       0.93      0.93      0.93      5709



In [29]:
print(metrics.classification_report(Y_test, classifier.predict(X_test)))

              precision    recall  f1-score   support

           0       0.73      0.74      0.73      1072
           1       0.66      0.64      0.65       832

    accuracy                           0.70      1904
   macro avg       0.69      0.69      0.69      1904
weighted avg       0.70      0.70      0.70      1904



# Remarks

This is just a proof of concept, and the performance is not surprising: with limited data, language embedding models are very difficult to train. You can easily improve the models with many `pretrained` models, e.g., word2vec.