참고 : https://www.kaggle.com/code/songseungwon/nlp-quick-start-for-newbie-with-9steps/notebook

# 데이터셋 불러오기

In [1]:
import pandas as pd 
import numpy as np 

In [2]:
# Load the train and test splits from CSV files in the working directory.
TRAIN_CSV, TEST_CSV = 'train.csv', 'test.csv'
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

In [3]:
# Preview the first rows of the training frame (id, keyword, location, text, target).
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# Print per-column dtypes and non-null counts for the training frame.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [5]:
# Count the distinct values in each training column.
train_df.nunique()

id          7613
keyword      221
location    3341
text        7503
target         2
dtype: int64

In [6]:
# Preview the first rows of the test frame (same columns as train, minus target).
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [7]:
# Print per-column dtypes and non-null counts for the test frame.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [8]:
# Count the distinct values in each test column.
test_df.nunique()

id          3263
keyword      221
location    1602
text        3243
dtype: int64

# 데이터 전처리

## Drop Columns

In [9]:
# Reduce the training frame to the `text` and `target` columns.
# `columns=` already names the axis, so no `axis` argument is needed.
# Reassigning with errors='ignore' keeps the cell re-runnable: a second
# execution finds the columns already gone and is a no-op instead of raising
# KeyError.
train_df = train_df.drop(columns=['id', 'keyword', 'location'], errors='ignore')

In [10]:
# Drop the unused metadata columns from the test frame; `id` stays because the
# submission cell writes it out next to the prediction.
# errors='ignore' keeps the cell re-runnable after the columns are gone.
test_df = test_df.drop(columns=['keyword', 'location'], errors='ignore')

In [11]:
# Report (rows, columns) for both frames after the column drop.
train_shape, test_shape = train_df.shape, test_df.shape
print(train_shape, test_shape)

(7613, 2) (3263, 2)


## Train/Validation Split & Tokenizer

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation; the fixed random_state
# makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_df['text'],
    train_df['target'],
    test_size=0.2,
    random_state=111,
)
print(*(part.shape for part in (X_train, y_train, X_valid, y_valid)))

(6090,) (6090,) (1523,) (1523,)


In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Tokenizer settings: word-count limit and the placeholder for unseen words.
vocab_size = 1000
oov_token = "<OOV>"

# The word index is learned from the training split only, so the validation
# text is encoded with a vocabulary it did not contribute to.
tokenizer = Tokenizer(oov_token=oov_token, num_words=vocab_size)
tokenizer.fit_on_texts(X_train)

# NOTE: X_train / X_valid are rebound from raw text to integer sequences here,
# so re-running this cell without re-running the split cell would fit the
# tokenizer on sequences instead of text.
X_train, X_valid = [
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_valid)
]

## Pad Sequences

In [14]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Every sequence is forced to exactly `max_length` tokens; 'post' means both
# truncation and padding happen at the end of the sequence.
max_length = 120
trunc_type = 'post'
pad_type = 'post'

X_train_padded = pad_sequences(X_train, maxlen=max_length, truncating=trunc_type, padding=pad_type)
X_valid_padded = pad_sequences(X_valid, maxlen=max_length, truncating=trunc_type, padding=pad_type)

# Both arrays should come out as (n_samples, max_length).
print(X_train_padded.shape, X_valid_padded.shape)

(6090, 120) (1523, 120)


## Match Data Types to numpy.ndarray

In [15]:
# Compare the container types of the padded inputs (first line) and the labels
# (second line) before converting the labels.
print(*(type(features) for features in (X_train_padded, X_valid_padded)))
print(*(type(labels) for labels in (y_train, y_valid)))

<class 'numpy.ndarray'> <class 'numpy.ndarray'>
<class 'pandas.core.series.Series'> <class 'pandas.core.series.Series'>


In [16]:
# Convert both label containers to numpy arrays so they match the padded inputs.
y_train, y_valid = [np.array(labels) for labels in (y_train, y_valid)]

In [17]:
# Re-check the container types after the label conversion: inputs on the first
# line, labels on the second.
print(*(type(features) for features in (X_train_padded, X_valid_padded)))
print(*(type(labels) for labels in (y_train, y_valid)))

<class 'numpy.ndarray'> <class 'numpy.ndarray'>
<class 'numpy.ndarray'> <class 'numpy.ndarray'>


---

# Modeling

In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Flatten

In [19]:
# Width of each learned word vector.
embedding_dim = 16

# `vocab_size` and `max_length` are reused from the tokenizer and padding cells
# rather than re-declared here, so the embedding table size and input length
# cannot drift away from how the data was actually encoded.
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    # The first two BiLSTMs return the full sequence so the next recurrent
    # layer still receives one vector per time step.
    Bidirectional(LSTM(64, return_sequences=True)),
    Bidirectional(LSTM(64, return_sequences=True)),
    # The last BiLSTM collapses the sequence to a single vector for the dense head.
    Bidirectional(LSTM(64, dropout=0.5)),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    # Single sigmoid unit: one probability per input for the binary target.
    Dense(1, activation='sigmoid')
])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 120, 16)           16000     
                                                                 
 bidirectional (Bidirectiona  (None, 120, 128)         41472     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 120, 128)         98816     
 nal)                                                            
                                                                 
 bidirectional_2 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                                                 
 dense (Dense)               (None, 32)                4128      
                                                        

---

# Model Compile

In [20]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# under the short name 'acc'.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

---

# Callbacks

In [21]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# Checkpoint target; reused later to restore the best weights.
filepath = 'my_checkpoint.ckpt'

# Stop training once val_loss has gone 5 epochs without improving.
ep = EarlyStopping(monitor='val_loss', patience=5)

# Write weights only, and only on epochs where val_loss improves.
cp = ModelCheckpoint(
    filepath=filepath,
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

---

# Model Fit 

In [22]:
# Upper bound on epochs; the EarlyStopping callback may end training sooner.
epochs = 30

fit_options = dict(
    validation_data=(X_valid_padded, y_valid),
    callbacks=[cp, ep],
    epochs=epochs,
)
model.fit(X_train_padded, y_train, **fit_options)

Epoch 1/30
Epoch 1: val_loss improved from inf to 0.48924, saving model to my_checkpoint.ckpt
Epoch 2/30
Epoch 2: val_loss improved from 0.48924 to 0.45939, saving model to my_checkpoint.ckpt
Epoch 3/30
Epoch 3: val_loss improved from 0.45939 to 0.44572, saving model to my_checkpoint.ckpt
Epoch 4/30
Epoch 4: val_loss did not improve from 0.44572
Epoch 5/30
Epoch 5: val_loss did not improve from 0.44572
Epoch 6/30
Epoch 6: val_loss did not improve from 0.44572
Epoch 7/30
Epoch 7: val_loss did not improve from 0.44572
Epoch 8/30
Epoch 8: val_loss did not improve from 0.44572


<keras.callbacks.History at 0x26e3729b7c0>

---

# Model Evaluate & Save

In [23]:
# Restore the checkpointed weights. The checkpoint callback only saved on
# val_loss improvement, while training ran several more epochs afterwards, so
# the in-memory weights are not the best ones.
model.load_weights(filepath)

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x26e4930c9d0>

In [24]:
# Score the restored weights on the validation split; the result is
# [loss, acc], matching the loss and metric given to compile().
model.evaluate(X_valid_padded, y_valid)



[0.44572386145591736, 0.7938279509544373]

In [25]:
# Save the model (with the restored best weights) to ./model/basic_nlp.h5.
model.save('./model/basic_nlp.h5')

---

# Reload Model

In [26]:
import tensorflow as tf

# Load the saved file back as a round-trip check and print its architecture.
# NOTE(review): `mymodel` is not used afterwards; the prediction cell calls
# `model.predict`, i.e. the in-memory model rather than this reloaded copy.
mymodel = tf.keras.models.load_model('./model/basic_nlp.h5')
mymodel.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 120, 16)           16000     
                                                                 
 bidirectional (Bidirectiona  (None, 120, 128)         41472     
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 120, 128)         98816     
 nal)                                                            
                                                                 
 bidirectional_2 (Bidirectio  (None, 128)              98816     
 nal)                                                            
                                                                 
 dense (Dense)               (None, 32)                4128      
                                                        

---

# Predict Test Data

In [27]:
# Encode the test text with the tokenizer fitted on the training split, then
# pad/truncate with the same settings used for the training data.
test_texts = test_df['text']
X_test = tokenizer.texts_to_sequences(test_texts)

pad_options = dict(maxlen=max_length, truncating=trunc_type, padding=pad_type)
X_test_padded = pad_sequences(X_test, **pad_options)

In [28]:
# One sigmoid output per test row (the model ends in a single-unit Dense layer).
y_test_raw = model.predict(X_test_padded)

# Threshold every probability in one vectorized step. ravel() flattens the
# (n_rows, 1) prediction array to 1-D so it can be assigned as a column, and
# strictly-greater-than 0.5 maps to class 1, everything else to class 0.
y_test = (y_test_raw.ravel() > 0.5).astype('int64')

test_df['predict'] = y_test
test_df



Unnamed: 0,id,text,predict
0,0,Just happened a terrible car crash,1
1,2,"Heard about #earthquake is different cities, s...",1
2,3,"there is a forest fire at spot pond, geese are...",1
3,9,Apocalypse lighting. #Spokane #wildfires,0
4,11,Typhoon Soudelor kills 28 in China and Taiwan,1
...,...,...,...
3258,10861,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,1
3259,10865,Storm in RI worse than last hurricane. My city...,1
3260,10868,Green Line derailment in Chicago http://t.co/U...,1
3261,10874,MEG issues Hazardous Weather Outlook (HWO) htt...,1


In [29]:
# Show only the rows predicted as class 1.
test_df.loc[test_df['predict'].eq(1)]

Unnamed: 0,id,text,predict
0,0,Just happened a terrible car crash,1
1,2,"Heard about #earthquake is different cities, s...",1
2,3,"there is a forest fire at spot pond, geese are...",1
4,11,Typhoon Soudelor kills 28 in China and Taiwan,1
5,12,We're shaking...It's an earthquake,1
...,...,...,...
3257,10858,The death toll in a #IS-suicide car bombing on...,1
3258,10861,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...,1
3259,10865,Storm in RI worse than last hurricane. My city...,1
3260,10868,Green Line derailment in Chicago http://t.co/U...,1


In [30]:
# Build the two-column submission frame (id, target) from the predictions and
# write it without the index column.
submission = test_df[['id', 'predict']].rename(columns={'predict': 'target'})

submission.to_csv('./sample_submission.csv', index=False)