In [1]:
import numpy as np
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [2]:
df_train = pd.read_csv('subword_train.csv')
df_test = pd.read_csv('subword_test.csv')

In [3]:
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [5]:
df_train.tail()

Unnamed: 0,id,keyword,location,text,target
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1
7612,10873,,,The Latest: More Homes Razed by Northern Calif...,1


In [6]:
# 전처리

In [7]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [8]:
def preprocess(sentences):
    sentences = sentences.lower() # 소문자 처리
    sentences = sentences.translate(str.maketrans(' ', ' ', punctuations)) # 구두점 제거
    sentences = word_tokenize(sentences)
    sentences = [s for s in sentences if s not in stops] # 불용어 제거
    return ' '.join(sentences) # 문자열로 합하여 반환

In [9]:
def make_feature(keyword, location, text):
    if pd.isnull(keyword):
        keyword = ''
    if pd.isnull(location):
        location = ''

    try:
        return keyword + ' ' + location + ' ' + text
    except:
        print('keyword :', keyword)
        print('location :', location)
        print('text : ', text)

In [10]:
# 불용어 처리
# 특수문자 제거
punctuations = string.punctuation
punctuations = punctuations.translate({ord('@'): None}) # 구두점 목록에서 @를 제거
punctuations = punctuations.translate({ord('#'): None}) # 구두점 목록에서 #을 제거
stops = stopwords.words('english') 
stemmer = PorterStemmer()

In [11]:
# train 데이터에 적용
feature = [preprocess(make_feature(k, l, t)) for k, l, t in zip(df_train['keyword'], df_train['location'], df_train['text'])]

In [12]:
feature[0]

'deeds reason # earthquake may allah forgive us'

In [13]:
feature[4]

'got sent photo ruby # alaska smoke # wildfires pours school'

In [14]:
test_feature = [preprocess(make_feature(k, l, t)) for k, l, t in zip(df_test['keyword'], df_test['location'], df_test['text'])]

In [15]:
test_feature[0]

'happened terrible car crash'

In [16]:
with open('subword_input.txt', 'w', encoding='utf-8') as f:
    for word in feature:
        f.write(word)

In [17]:
from tokenizers import BertWordPieceTokenizer

tokenizer = BertWordPieceTokenizer(lowercase=True, strip_accents=True)

data_file = 'subword_input.txt'
vocab_size = 30000
limit_alphabet = 6000
min_frequency = 5

In [18]:
tokenizer.train(files=data_file, vocab_size=vocab_size, limit_alphabet=limit_alphabet, min_frequency=min_frequency)






In [19]:
# 트윗의 텍스트(키워드, 위치) target : 실제 재난에 관한 내용인지 확인
X_tokens = [tokenizer.encode(tweet).tokens for tweet in feature]
print(X_tokens[0])

['de', '##eds', 'reason', '#', 'earthquake', 'may', 'allah', 'forg', '##ive', 'us']


In [20]:
X_ids = [tokenizer.encode(tweet).ids for tweet in feature]
X_ids[0]

[198, 704, 3149, 5, 1540, 533, 4908, 4931, 279, 208]

In [21]:
max([len(s) for s in X_ids])

59

In [22]:
X_test_tokens = [tokenizer.encode(tweet).tokens for tweet in test_feature]
X_test_ids = [tokenizer.encode(tweet).ids for tweet in test_feature]

In [23]:
tokenizer.token_to_id('hello')

4940

In [24]:
tokenizer.id_to_token(123)

'httptco'

In [25]:
word_to_index = dict()

for sent in X_tokens[0]:
    print(sent)
    for w in sent:
        print(w)

de
d
e
##eds
#
#
e
d
s
reason
r
e
a
s
o
n
#
#
earthquake
e
a
r
t
h
q
u
a
k
e
may
m
a
y
allah
a
l
l
a
h
forg
f
o
r
g
##ive
#
#
i
v
e
us
u
s


In [26]:
word_to_index = dict()

for sent in X_tokens[:10]:
    for w in sent:
        if w not in word_to_index:
            word_to_index[w] = tokenizer.token_to_id(w)

In [27]:
print(word_to_index)

{'de': 198, '##eds': 704, 'reason': 3149, '#': 5, 'earthquake': 1540, 'may': 533, 'allah': 4908, 'forg': 4931, '##ive': 279, 'us': 208, 'forest': 1143, 'fire': 318, 'near': 681, 'la': 944, 'ron': 5438, '##ge': 376, 'sa': 458, '##s': 63, '##k': 77, 'canada': 820, 'residents': 6357, 'asked': 5289, 'shelter': 7137, 'place': 2537, 'not': 1085, '##ified': 2849, 'officers': 5694, 'evacuation': 1490, 'orders': 4746, 'expected': 3820, '13': 2654, '##000': 1120, 'people': 435, 'rece': 7587, 'wildfires': 4942, 'california': 430, 'got': 584, 'sent': 2999, 'photo': 1664, 'rub': 1697, '##y': 79, 'alaska': 4733, 'smoke': 1512, 'po': 1061, '##urs': 1651, 'school': 1054, 'rocky': 3541, '##fire': 298, 'update': 1890, 'hwy': 4772, '20': 489, 'closed': 3180, 'direction': 2794, 'due': 2235, 'lake': 2582, 'county': 1191, 'ca': 301, 'flood': 486, 'disaster': 511, 'heavy': 2975, 'rain': 796, 'causes': 4205, 'flash': 2773, 'flooding': 1468, 'streets': 5293, 'man': 319, '##ito': 4877, '##u': 66, 'colorado': 15

In [28]:
from tensorflow.keras import preprocessing
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout

In [29]:
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical

In [30]:
vocab_size = tokenizer.get_vocab_size()
vocab_size

8233

In [31]:
X_padded = pad_sequences(X_ids, maxlen=60, padding='post', value=vocab_size)
X_test_padded = pad_sequences(X_test_ids, maxlen=60, padding='post', value=0)

In [32]:
X_padded[:3]

array([[ 198,  704, 3149,    5, 1540,  533, 4908, 4931,  279,  208, 8233,
        8233, 8233, 8233, 8233, 8233, 8233, 8233, 8233, 8233, 8233, 8233,
        8233, 8233, 8233, 8233, 8233, 8233, 8233, 8233, 8233, 8233, 8233,
        8233, 8233, 8233, 8233, 8233, 8233, 8233, 8233, 8233, 8233, 8233,
        8233, 8233, 8233, 8233, 8233, 8233, 8233, 8233, 8233, 8233, 8233,
        8233, 8233, 8233, 8233, 8233],
       [1143,  318,  681,  944, 5438,  376,  458,   63,   77,  820, 8233,
        8233, 8233, 8233, 8233, 8233, 8233, 8233, 8233, 8233, 8233, 8233,
        8233, 8233, 8233, 8233, 8233, 8233, 8233, 8233, 8233, 8233, 8233,
        8233, 8233, 8233, 8233, 8233, 8233, 8233, 8233, 8233, 8233, 8233,
        8233, 8233, 8233, 8233, 8233, 8233, 8233, 8233, 8233, 8233, 8233,
        8233, 8233, 8233, 8233, 8233],
       [6357, 5289, 7137, 2537, 1085, 2849, 5694, 1490, 7137, 2537, 4746,
        3820, 8233, 8233, 8233, 8233, 8233, 8233, 8233, 8233, 8233, 8233,
        8233, 8233, 8233, 8233, 82

In [33]:
X_padded.shape, X_test_padded.shape

((7613, 60), (3263, 60))

In [34]:
y = df_train['target']

In [35]:
print(y[:100])

0     1
1     1
2     1
3     1
4     1
     ..
95    1
96    0
97    1
98    0
99    1
Name: target, Length: 100, dtype: int64


In [36]:
vocab_size = tokenizer.get_vocab_size()+1
vocab_size

8234

In [37]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, GRU, Embedding
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [38]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense

embedding_dim = 64
hidden_unit = 64

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim))
model.add(GRU(hidden_unit))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 64)          526976    
                                                                 
 gru (GRU)                   (None, 64)                24960     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 552,001
Trainable params: 552,001
Non-trainable params: 0
_________________________________________________________________


2024-07-11 20:12:42.587518: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-07-11 20:12:42.588384: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-07-11 20:12:42.588940: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [39]:
es = EarlyStopping(monitor = 'val_loss', mode='min', verbose=1, patience=3)
mc = ModelCheckpoint('best_model.keras', monitor = 'val_acc', mode='max', verbose=1, save_best_only=True)

In [40]:
history = model.fit(X_padded, y, epochs=15, callbacks=[es, mc], batch_size=64, validation_split=0.1)

Epoch 1/15


2024-07-11 20:12:42.655416: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2024-07-11 20:12:42.791456: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-07-11 20:12:42.792533: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-07-11 20:12:42.793170: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG IN

Epoch 1: val_acc improved from -inf to 0.53412, saving model to best_model.keras


2024-07-11 20:12:46.658304: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-07-11 20:12:46.659100: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-07-11 20:12:46.659634: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

Epoch 2/15
Epoch 2: val_acc did not improve from 0.53412
Epoch 3/15
Epoch 3: val_acc did not improve from 0.53412
Epoch 4/15
Epoch 4: val_acc did not improve from 0.53412
Epoch 4: early stopping


In [41]:
print('train acc : ', model.evaluate(X_padded, y)[1])

train acc :  0.5703402161598206


In [42]:
from tensorflow.keras.layers import Bidirectional, LSTM, Dense

In [43]:
model2 = Sequential()
model2.add(Embedding(vocab_size, embedding_dim))
model2.add(Bidirectional(LSTM(hidden_unit)))
model2.add(Dense(1, activation='sigmoid'))

2024-07-11 20:12:57.423954: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-07-11 20:12:57.424761: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-07-11 20:12:57.425332: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [44]:
model2.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

In [45]:
history2 = model2.fit(X_padded, y, epochs=15, callbacks=[es, mc], batch_size=64, validation_split=0.1)

Epoch 1/15


2024-07-11 20:12:57.660901: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-07-11 20:12:57.661932: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-07-11 20:12:57.662537: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus



2024-07-11 20:13:02.810760: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2024-07-11 20:13:02.811796: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2024-07-11 20:13:02.812598: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus


Epoch 1: val_acc improved from 0.53412 to 0.68766, saving model to best_model.keras
Epoch 2/15
Epoch 2: val_acc improved from 0.68766 to 0.78346, saving model to best_model.keras
Epoch 3/15
Epoch 3: val_acc did not improve from 0.78346
Epoch 4/15
Epoch 4: val_acc did not improve from 0.78346
Epoch 5/15
Epoch 5: val_acc did not improve from 0.78346
Epoch 5: early stopping


In [46]:
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [47]:
print('train acc : ', model2.evaluate(X_padded, y)[1])

train acc :  0.9365558624267578


In [48]:
from gensim.models import Word2Vec
w2v_model = Word2Vec(sentences=X_tokens, vector_size=60, window=5, min_count=1, workers=4)

In [49]:
embedding_matrix = np.zeros((vocab_size, 60))

for word, i in word_to_index.items():
    embedding_vector = w2v_model.wv[word]
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [51]:
y = df_train['target']
y = to_categorical(y)
y.shape

(7613, 2)

In [53]:
model3 = Sequential()
model3.add(Embedding(vocab_size, embedding_dim))
# model3.add(Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=60))
model3.add(Bidirectional(LSTM(32, activation='relu')))
model3.add(Dropout(0.2))
model3.add(Dense(32, activation='relu'))
model3.add(Dropout(0.2))
model3.add(Dense(16, activation='relu'))
model3.add(Dense(2, activation='softmax'))

model3.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

In [54]:
history3 = model3.fit(X_padded, y, epochs=15, batch_size=64, validation_split=0.1, callbacks=[es,mc])

Epoch 1/15
Epoch 1: val_acc did not improve from 0.78346
Epoch 2/15
Epoch 2: val_acc did not improve from 0.78346
Epoch 3/15
Epoch 3: val_acc did not improve from 0.78346
Epoch 4/15
Epoch 4: val_acc did not improve from 0.78346
Epoch 4: early stopping


In [55]:
print('test acc : ', model3.evaluate(X_padded, y)[1])

test acc :  0.9273610711097717


In [56]:
model.add(Embedding(vocab_size, embedding_dim))

In [57]:
y_pred2 = model3.predict(X_test_padded)



In [None]:
y_pred2 = [np.argmax(p) for p in y_pred2]

In [None]:
print(y_pred2)

In [None]:
result_df = pd.DataFrame({'id':df_test['id'], 'target':y_pred})

In [None]:
result_df.to_csv('submission.csv', index=False)