<a href="https://colab.research.google.com/github/yuyangweng/NLP/blob/main/NLP_RNN_8_keras_tokenizer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import keras
from collections import Counter
import re
 
# nltk
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
 
# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder



---


`keras.preprocessing.text.Tokenizer()` converts text into sequences of integer word indices, the same kind of data that the following call returns:

(input_train, y_train), (input_test, y_test) = imdb.load_data(num_words=max_features)

In [None]:
# Toy corpus: two short sentences that share the words "is" and "a".
text0 = [
    'this is a book',
    'today is a sunny day',
]

# Fit a fresh tokenizer on the corpus, then print the learned word -> index
# mapping followed by each sentence encoded with that mapping.
tok = keras.preprocessing.text.Tokenizer()
tok.fit_on_texts(text0)
print(tok.word_index, tok.texts_to_sequences(text0), sep='\n')

{'is': 1, 'a': 2, 'this': 3, 'book': 4, 'today': 5, 'sunny': 6, 'day': 7}
[[3, 1, 2, 4], [5, 1, 2, 6, 7]]




---

Sentiment classification

In [None]:
# Mount Google Drive at /content/drive; later cells read their data files from this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the labelled tweets from the mounted Drive folder; the file is decoded as latin1.
tweets = pd.read_csv('/content/drive/MyDrive/00NLP/twitter_sentiment.csv', encoding='latin1')

In [None]:
# Preview the first rows of the table as loaded.
tweets.head()

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my APL frie...
1,2,0,I missed the New Moon trail...
2,3,1,omg its already 7:30 :O
3,4,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,i think mi bf is cheating on me!!! ...


In [None]:
# Remove the ItemID column. errors='ignore' keeps this cell idempotent:
# re-running it after the column is already gone no longer raises KeyError.
tweets = tweets.drop(columns=['ItemID'], errors='ignore')

In [None]:
# Preview the table again now that ItemID has been dropped.
tweets.head()

Unnamed: 0,Sentiment,SentimentText
0,0,is so sad for my APL frie...
1,0,I missed the New Moon trail...
2,1,omg its already 7:30 :O
3,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,0,i think mi bf is cheating on me!!! ...


In [None]:
# Tally how many tweets carry each sentiment label, then show the labels and their counts.
target_cnt = Counter(tweets["Sentiment"])

print(target_cnt.keys(), target_cnt.values())

dict_keys([0, 1]) dict_values([43532, 56457])


In [None]:
# The stop-word corpus must be downloaded before it can be read below.
nltk.download('stopwords')

# English Snowball stemmer and English stop-word list used by preprocess().
stemmer = SnowballStemmer("english")
stop_words = stopwords.words("english")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def preprocessor(text):
    """Normalize a raw text string while preserving simple emoticons.

    Steps:
      1. Remove anything that looks like an HTML/XML tag (``<...>``).
      2. Collect emoticons such as ``:-)``, ``;-(``, ``=-D``, ``:-P``, ``:D``
         before punctuation is stripped.
      3. Lowercase the text and collapse every run of non-word characters
         into a single space.
      4. Append the collected emoticons (with the ``-`` nose removed),
         separated from the text and from each other by single spaces.

    Args:
        text: The input string.

    Returns:
        The cleaned string. Leading/trailing spaces are not stripped here.
    """
    # Raw strings keep the regex escapes (\W, \), \() from being treated as
    # invalid Python string escapes.
    text = re.sub(r'<[^>]*>', '', text)
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    cleaned = re.sub(r'[\W]+', ' ', text.lower())
    suffix = ' '.join(emoticons).replace('-', '')
    # Without a separator the first emoticon would be glued onto the last word
    # whenever the cleaned text ends in a word character
    # ("great :) day" -> "great day:)"). Add one only in that case, so text
    # that already ends in a space is joined exactly as-is.
    if suffix and not cleaned.endswith(' '):
        cleaned += ' '
    return cleaned + suffix
 
def preprocess(text, stem=False):
    """Clean a value with ``preprocessor`` and drop English stop words.

    Args:
        text: Value to clean; it is converted with ``str()`` first.
        stem: When true, every kept token is passed through ``stemmer.stem``.

    Returns:
        The kept tokens joined by single spaces.
    """
    cleaned = preprocessor(str(text)).strip()
    # Stop words are filtered on the unstemmed token; stemming happens afterwards.
    kept = [word for word in cleaned.split() if word not in stop_words]
    if stem:
        kept = [stemmer.stem(word) for word in kept]
    return " ".join(kept)

In [None]:
# Replace each tweet's text with its cleaned, stop-word-free version (no stemming).
tweets["SentimentText"] = tweets["SentimentText"].apply(preprocess)

In [None]:
# Preview the first rows after text preprocessing.
tweets.head()

Unnamed: 0,Sentiment,SentimentText
0,0,sad apl friend
1,0,missed new moon trailer
2,1,omg already 7 30
3,0,omgaga im sooo im gunna cry dentist since 11 s...
4,0,think mi bf cheating t_t


In [None]:
# Learn the word -> index vocabulary from the cleaned tweets.
tokenizer = keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(tweets["SentimentText"])

# One more than the number of distinct words found by the tokenizer.
vocab_size = 1 + len(tokenizer.word_index)
print("Total words", vocab_size)

Total words 103102


In [None]:
# Start an empty frame; the following cells add the encoded text and the labels to it.
df = pd.DataFrame()

In [None]:
# Encode every cleaned tweet as a list of integer word indices.
df["SentimentText"] = tokenizer.texts_to_sequences(tweets["SentimentText"])

In [None]:
# Copy the sentiment labels next to the encoded sequences.
df['Sentiment'] = tweets['Sentiment']

In [None]:
# Show the last ten encoded rows.
df[-10:]

Unnamed: 0,SentimentText,Sentiment
99979,"[103094, 313, 11, 62, 313, 458, 87]",1
99980,"[103095, 654, 2785, 1005, 87]",1
99981,"[1086, 398, 716, 1846, 23, 58, 3, 692, 138, 14...",0
99982,"[1086, 3509, 5068, 48, 1722, 14, 6670, 1847, 9...",1
99983,"[103097, 421, 6580, 115, 5, 476, 3815, 40, 23,...",0
99984,"[2644, 331, 3, 6789, 356, 25, 312, 105, 94]",0
99985,"[2644, 103098, 1238, 611, 256, 17, 13, 10298, ...",1
99986,"[2644, 103100, 97, 107]",0
99987,"[2644, 103101, 41, 41, 103, 48]",1
99988,"[2644, 7707, 22, 41]",1


In [None]:
# Hold out 10% of the rows for testing; random_state is fixed at 1.
df_train, df_test = train_test_split(df, random_state=1, test_size=0.1)

print("TRAIN size:", df_train.shape[0])
print("TEST size:", df_test.shape[0])

TRAIN size: 89990
TEST size: 9999


In [None]:
# Bring every index sequence to a fixed length of 300 so the inputs form a 2-D array.
x_train, x_test = (
    keras.preprocessing.sequence.pad_sequences(part.SentimentText.values, maxlen=300)
    for part in (df_train, df_test)
)

In [None]:
# Inspect the padded training inputs.
x_train

array([[   0,    0,    0, ...,  450, 3315,  256],
       [   0,    0,    0, ...,  162,    8,  895],
       [   0,    0,    0, ...,   50,  215,   76],
       ...,
       [   0,    0,    0, ..., 4062,  391, 2484],
       [   0,    0,    0, ...,  369,   79,   91],
       [   0,    0,    0, ...,   14,  408, 9405]], dtype=int32)

In [None]:
# Labels reshaped into column vectors, one row per sample.
y_train, y_test = (
    part.Sentiment.values.reshape(-1, 1) for part in (df_train, df_test)
)

print("y_train", y_train.shape)
print("y_test", y_test.shape)

y_train (89990, 1)
y_test (9999, 1)


In [None]:
# Inspect the test labels.
y_test

array([[1],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [None]:
maxlen = 300  # the part of a sentence beyond 300 words is dropped
batch_size = 16
max_features = vocab_size  # number of words to consider as features

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, Embedding

# Stacked-LSTM binary classifier.
# Input: label-encoded word indices shaped (sample size, time steps).
model = Sequential([
    # Embedding arguments: (input features, output features), with time steps as input_length.
    Embedding(max_features, 4, input_length=maxlen),
    # Receives the embedding output: (sample size, time steps, output features).
    Dropout(0.5),
    # LSTM input: (sample size, time steps, input features).
    LSTM(8, return_sequences=True),
    Dropout(0.5),
    LSTM(4),
    Dropout(0.5),
    # Single sigmoid unit producing the binary sentiment prediction.
    Dense(1, activation='sigmoid'),
])

model.summary()

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for 4 epochs, holding out 20% of the training data for validation.
history = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=4,
    validation_split=0.2,
)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 4)            412408    
_________________________________________________________________
dropout (Dropout)            (None, 300, 4)            0         
_________________________________________________________________
lstm (LSTM)                  (None, 300, 8)            416       
_________________________________________________________________
dropout_1 (Dropout)          (None, 300, 8)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 4)                 208       
_________________________________________________________________
dropout_2 (Dropout)          (None, 4)                 0         
_________________________________________________________________
dense (Dense)                (None, 1)                 5

In [None]:
# Evaluate on the held-out test set; the result lists the loss followed by the accuracy metric.
model.evaluate(x_test, y_test, batch_size=16)



[0.5364326238632202, 0.7322732210159302]



---

加入 word2vec 預訓練 embedding weights

In [None]:
# Load pretrained word2vec vectors (binary format) from Drive.
# limit=300000 caps how many vectors are read from the file.
from gensim.models.keyedvectors import KeyedVectors
gensim_model = KeyedVectors.load_word2vec_format('/content/drive/MyDrive/00NLP/GoogleNews-vectors-negative300.bin.gz', binary=True, limit=300000)

In [None]:
# Fit a tokenizer on the cleaned tweets again and report the vocabulary size.
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(tweets["SentimentText"])

vocab_size = 1 + len(tokenizer.word_index)
print("Total words", vocab_size)

# Peek at the first 100 vocabulary entries: the words, then their indices.
print(list(tokenizer.word_index.keys())[:100])
print(list(tokenizer.word_index.values())[:100])

Total words 103102
['quot', 'good', 'like', 'lol', 'get', 'u', 'know', 'love', 'thanks', 'one', 'go', 'day', 'see', 'amp', 'well', 'http', 'time', 'got', 'im', 'oh', 'think', 'haha', 'really', 'going', 'hope', 'work', 'sorry', 'back', 'still', 'yeah', 'com', 'would', 'want', 'today', 'much', '2', 'great', 'miss', 'need', 'right', 'yes', 'new', 'twitter', '3', 'night', 'though', 'come', 'fun', 'hey', 'make', 'last', 'better', 'thank', 'sad', 'wish', 'feel', 'nice', 'bad', 'lt', 'could', 'way', 'home', 'happy', 'morning', 'awesome', 'never', 'ur', 'sure', 'bit', 'say', 'even', 'always', 'dont', 'people', 'wait', 'us', 'ok', 'soon', 'take', 'tomorrow', 'week', 'next', 'let', 'gonna', 'cool', 'show', 'x', 'please', 'thing', 'follow', 'look', '4', 'guys', 'something', 'tonight', 'twitpic', 'ya', 'getting', 'hear', 'tell']
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,

In [None]:
# Build the (vocab_size, embedding_dim) matrix that seeds the Embedding layer.
# The width is taken from the word2vec model itself rather than from `maxlen`
# (the sequence length), which only matched because both happen to be 300.
embedding_weights = np.zeros((vocab_size, gensim_model.vector_size))
for word, index in tokenizer.word_index.items():
    try:
        # Index the KeyedVectors object directly: its `.wv` alias is deprecated
        # in gensim 3.x and is not available in gensim 4.
        embedding_weights[index, :] = gensim_model[word]
    except KeyError:
        # Word is missing from the pretrained vocabulary: keep its all-zero row.
        continue

  after removing the cwd from sys.path.


In [None]:
# Check the matrix dimensions: (vocabulary size, embedding width).
embedding_weights.shape

(103102, 300)

In [None]:
maxlen = 300  # the part of a sentence beyond 300 words is dropped
batch_size = 16
max_features = vocab_size  # number of words to consider as features

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout, Embedding

# Stacked-LSTM binary classifier whose embedding layer starts from the
# pretrained matrix in `embedding_weights`.
# Input: label-encoded word indices shaped (sample size, time steps).
model = Sequential([
    # weights=[embedding_weights] supplies the initial embedding matrix; the
    # output width is read from the matrix's second dimension.
    Embedding(
        max_features,
        embedding_weights.shape[1],
        input_length=maxlen,
        weights=[embedding_weights],
    ),
    # Receives the embedding output: (sample size, time steps, output features).
    Dropout(0.5),
    # LSTM input: (sample size, time steps, input features).
    LSTM(8, return_sequences=True),
    Dropout(0.5),
    LSTM(4),
    Dropout(0.5),
    # Single sigmoid unit producing the binary sentiment prediction.
    Dense(1, activation='sigmoid'),
])

model.summary()

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for 1 epoch, holding out 20% of the training data for validation.
history = model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=1,
    validation_split=0.2,
)

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 300, 300)          30930600  
_________________________________________________________________
dropout_3 (Dropout)          (None, 300, 300)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 300, 8)            9888      
_________________________________________________________________
dropout_4 (Dropout)          (None, 300, 8)            0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 4)                 208       
_________________________________________________________________
dropout_5 (Dropout)          (None, 4)                 0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                

In [None]:
# Evaluate on the held-out test set; the result lists the loss followed by the accuracy metric.
model.evaluate(x_test, y_test, batch_size=16)



[0.5069248080253601, 0.7532753348350525]



---
callbacks


In [None]:
# Write the model to Drive, keeping only the version with the best validation accuracy.
checkpoint = keras.callbacks.ModelCheckpoint(
    '/content/drive/MyDrive/00NLP/tok.h5',
    monitor='val_accuracy',
    mode="max",
    save_best_only=True,
    verbose=1,
)

# Stop training once validation accuracy has gone 5 epochs without improving.
earlystopping = keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    mode="max",
    patience=5,
    verbose=1,
)

# Scale the learning rate by 0.1 when validation loss has stalled for 2 epochs.
rlr = keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    patience=2,
    min_delta=0.0001,
    mode='auto',
    verbose=1,
)

# Continue fitting the current model for up to 10 epochs with all three callbacks attached.
model.fit(
    x_train,
    y_train,
    batch_size=16,
    epochs=10,
    validation_split=0.2,
    callbacks=[checkpoint, earlystopping, rlr],
)