In [81]:
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import keras 
from tensorflow.keras import layers

from keras_preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, GlobalMaxPooling1D, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.utils import plot_model
from tensorflow.keras.utils import to_categorical

In [82]:
import tensorflow as tf
import os
import numpy as np
import random

# Single source of truth for the global seed. The notebook has always been
# seeded with 42 (the value previously passed as a literal to set_seeds).
SEED = 42


def set_seeds(seed=SEED):
    """Seed the global random generators of Python, NumPy and TensorFlow.

    Args:
        seed: Integer applied to every generator. Defaults to ``SEED``.
    """
    # PYTHONHASHSEED is read by the interpreter at start-up, so exporting it
    # here only takes effect in processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)


set_seeds()

 
 

In [83]:
class TransformerBlock(layers.Layer):
    """Transformer encoder block.

    Applies multi-head self-attention followed by a two-layer feed-forward
    network. Each of the two sub-layers is followed by dropout, a residual
    connection and layer normalization.

    Args:
        embed_dim: Width of the last axis of the input. Also used as the
            `key_dim` of the attention layer and as the output width of the
            feed-forward network, so the residual additions line up.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden (ReLU) layer of the feed-forward network.
        rate: Dropout rate applied to the output of both sub-layers.
        **kwargs: Forwarded to `layers.Layer` (for example `name`).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Constructor arguments are kept so get_config() can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim)]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Run the block on `inputs`.

        Args:
            inputs: Tensor used as both query and value of the self-attention.
            training: Forwarded to the dropout layers. `None` (the default)
                lets Keras decide from the calling context, so the layer can
                be called without passing the flag explicitly.

        Returns:
            The layer-normalized output of the feed-forward sub-layer.
        """
        # Self-attention sub-layer: query and value are the same tensor.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)

        # Position-wise feed-forward sub-layer.
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config

In [84]:
# Read the raw train / test CSV files.
data = pd.read_csv('/home/u108029050/m/train.csv')
testdata = pd.read_csv('/home/u108029050/m/test.csv')

# Both files get the same three column names.
data.columns = ['ClassIndex', 'Title', 'Description']
testdata.columns = ['ClassIndex', 'Title', 'Description']

# Merge title and description into a single 'summary' text column and drop
# the two source columns.
data = (
    data
    .assign(summary=data['Title'] + ' ' + data['Description'])
    .drop(columns=['Title', 'Description'])
)
testdata = (
    testdata
    .assign(summary=testdata['Title'] + ' ' + testdata['Description'])
    .drop(columns=['Title', 'Description'])
)

# Features are the merged text; labels are shifted by -1 so that class ids
# start at 0.
X_train = data['summary']
y_train = (data['ClassIndex'] - 1).values
x_test = testdata['summary']
y_test = (testdata['ClassIndex'] - 1).values

# Longest training summary, counted in whitespace-separated tokens.
maxlen = X_train.map(lambda text: len(text.split())).max()
data.head()


Unnamed: 0,ClassIndex,summary
0,3,Wall St. Bears Claw Back Into the Black (Reute...
1,3,Carlyle Looks Toward Commercial Aerospace (Reu...
2,3,Oil and Economy Cloud Stocks' Outlook (Reuters...
3,3,Iraq Halts Oil Exports from Main Southern Pipe...
4,3,"Oil prices soar to all-time record, posing new..."


In [85]:
# Sanity check: (rows, columns) of the train and test frames after preprocessing.
data.shape, testdata.shape

((120000, 2), (7600, 2))

In [86]:
# One-hot encode the integer class ids.
NUM_CLASSES = 4

# Only encode while the labels are still a 1-D vector of class ids. Without
# this guard, re-running the cell would one-hot encode the already-encoded
# matrix a second time and produce a 3-D array.
if y_train.ndim == 1:
    y_train = to_categorical(y_train, NUM_CLASSES)
if y_test.ndim == 1:
    y_test = to_categorical(y_test, NUM_CLASSES)

In [87]:
max_words = 10000  # the tokenizer keeps only the most frequent words, capped by this value
maxlen = 100  # every sequence is padded / truncated to exactly this many tokens

# Build the word index from the training texts only.
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train.values)

# Texts -> lists of integer word ids -> fixed-length integer arrays.
# pad_sequences is called with its default padding / truncating modes.
X_train = pad_sequences(tok.texts_to_sequences(X_train), maxlen=maxlen)
x_test = pad_sequences(tok.texts_to_sequences(x_test), maxlen=maxlen)

# Word -> integer id dictionary (covers every word seen, not just max_words).
word_index = tok.word_index
print('Found %s unique tokens' % len(word_index))
# print(len(X_train), "Training sequences")
# print(len(x_test), "Validation sequences")

Found 70337 unique tokens


In [88]:
import os
# Parse the pretrained word vectors into {word: float32 vector}.
# Each line is split on whitespace: first field is the word, the remaining
# fields are the vector components.
embedding_index = {}

# `with` closes the file even if a line fails to parse, and the explicit
# encoding removes the dependency on the platform's default text encoding.
with open('wiki.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line: there is no word field, values[0] would raise IndexError.
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

print('Found %s word vectors' % len(embedding_index))
print(embedding_index["google"])

Found 107803 word vectors
[-0.81419706 -0.8000132   2.0608513  -1.007846   -0.7216044  -0.8566915
  0.7966929   3.7051861  -0.46912885 -0.68854475 -1.3169966  -0.50396204
 -1.2225083   0.9640229  -0.946806    0.18398409 -0.948219    1.6240733
  0.06061047 -0.76218134  0.10111515 -0.41588673  0.32345408 -0.19489264
  0.4045541  -0.18221259  0.48027515 -0.84279644  0.3509806   2.6030517
  1.8096178   0.473035   -0.8081082   0.58772033 -0.8069067   0.23847212
  0.7584653   0.24641363 -0.85608065 -1.5835495   0.19089963 -0.3917458
 -2.0202892   0.2806195   0.21958712 -1.1981602  -0.45732456 -2.6616156
  0.42792758  1.4582756  -0.7322122   0.11539538  1.5570602   0.11495335
  0.74106705  0.83484536  0.42752406  1.6985508  -1.4286654   1.9803507
  0.39201847 -1.3146726  -1.061342    1.49797     0.8115893  -0.33302578
  3.2861745  -1.6020184  -1.0791559   0.08012582 -1.0471542   1.683749
 -0.4148051   0.39994523 -0.33512256  3.4928787  -0.60213983 -0.72856414
 -0.5789077   1.5414579  -0.72842

In [89]:
embedding_dim = 100

# Row i of the matrix holds the pretrained vector of the word whose tokenizer
# id is i. Rows start as zeros and stay zero for words that have no entry in
# embedding_index.
embedding_matrix = np.zeros((max_words, embedding_dim))
for token, token_id in word_index.items():
    if token_id >= max_words:
        # Ids at or above max_words have no row in the matrix.
        continue
    pretrained = embedding_index.get(token)
    if pretrained is None:
        continue
    embedding_matrix[token_id] = pretrained

In [90]:
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a frozen, pretrained token embedding and a learned position embedding.

    Args:
        maxlen: Number of positions held by the position-embedding table.
        max_words: Number of rows (token ids) of the token-embedding table.
        embed_dim: Width of both embedding tables.
        pretrained_weights: Optional array used as the initial (non-trainable)
            token-embedding table. When omitted, the notebook-level
            `embedding_matrix` is used, which is what the layer did before
            this argument existed.
        **kwargs: Forwarded to `layers.Layer` (for example `name`).
    """

    def __init__(self, maxlen, max_words, embed_dim, pretrained_weights=None, **kwargs):
        super().__init__(**kwargs)
        if pretrained_weights is None:
            # Backward-compatible default: fall back to the global matrix.
            pretrained_weights = embedding_matrix

        # Constructor arguments are kept so get_config() can report them.
        self.maxlen = maxlen
        self.max_words = max_words
        self.embed_dim = embed_dim

        self.token_emb = layers.Embedding(
            input_dim=max_words,
            output_dim=embed_dim,
            weights=[pretrained_weights],
            trainable=False,
        )
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed token ids `x` and add the embedding of each position index."""
        # Sequence length is read from the input at call time; a separate name
        # avoids shadowing the constructor's `maxlen`.
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

    def get_config(self):
        """Return the scalar constructor arguments for serialization.

        The pretrained weight array is not part of the config.
        """
        config = super().get_config()
        config.update(
            {
                "maxlen": self.maxlen,
                "max_words": self.max_words,
                "embed_dim": self.embed_dim,
            }
        )
        return config

In [91]:
# Width of the token / position embeddings. Tied to `embedding_dim` (the
# width of the pretrained embedding matrix) so the two cannot drift apart;
# a mismatch would make the pretrained matrix incompatible with the layer.
embed_dim = embedding_dim
num_heads = 2  # Number of attention heads
ff_dim = 100  # Hidden layer size in feed forward network inside transformer

In [92]:
# Input: one fixed-length sequence of token ids per sample.
inputs = layers.Input(shape=(maxlen,))

# Token + position embedding, then a single transformer encoder block.
embedding_layer = TokenAndPositionEmbedding(maxlen, max_words, embed_dim)
hidden = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
hidden = transformer_block(hidden)

# Average over the sequence axis, then a small dense classification head.
hidden = layers.GlobalAveragePooling1D()(hidden)
hidden = layers.Dropout(0.1)(hidden)
hidden = layers.Dense(20, activation="relu")(hidden)
hidden = layers.Dropout(0.1)(hidden)
outputs = layers.Dense(4, activation="softmax")(hidden)

model = keras.Model(inputs=inputs, outputs=outputs)
model.summary()

Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 100)]             0         
                                                                 
 token_and_position_embeddin  (None, 100, 100)         1010000   
 g_4 (TokenAndPositionEmbedd                                     
 ing)                                                            
                                                                 
 transformer_block_4 (Transf  (None, 100, 100)         101300    
 ormerBlock)                                                     
                                                                 
 global_average_pooling1d_4   (None, 100)              0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dropout_18 (Dropout)        (None, 100)               0   

In [93]:
# Shapes of the arrays fed to fit() / evaluate(), one per line.
for array in (X_train, y_train, y_test, x_test):
    print(array.shape)

(120000, 100)
(120000, 4)
(7600, 4)
(7600, 100)


In [94]:
# Shuffle the training samples and their labels together.
seed = 1337
rng = np.random.RandomState(seed)

# One permutation index is applied to both arrays, so samples and labels stay
# aligned by construction instead of relying on two identically seeded
# generators producing the same in-place shuffle.
assert len(X_train) == len(y_train), "samples and labels must have the same length"
shuffled_idx = rng.permutation(len(X_train))
X_train = X_train[shuffled_idx]
y_train = y_train[shuffled_idx]

In [95]:
# Labels are one-hot encoded, hence categorical (not sparse) cross-entropy.
model.compile(
    optimizer="adam",
    loss='categorical_crossentropy',
    metrics=["accuracy"],
)

# Train, holding out 20% of the training arrays for validation.
history = model.fit(
    X_train,
    y_train,
    batch_size=512,
    epochs=10,
    validation_split=0.2,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [96]:
# Evaluate on the test set. With the single "accuracy" metric passed to
# compile(), index 1 of the returned list / of metrics_names is the accuracy.
scores = model.evaluate(x_test, y_test)
print("\n%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))


accuracy: 88.39%


In [97]:
prediction = model.predict(x_test)
labels = ['World News', 'Sports News', 'Business News', 'Science-Technology News']

# Print every 4th test article from position 10 up to (not including) 40:
# the first 50 characters of its text, then its true and predicted category.
for idx in range(10, 40, 4):
    snippet = testdata['summary'].iloc[idx][:50]
    actual = labels[np.argmax(y_test[idx])]
    predicted = labels[np.argmax(prediction[idx])]
    print(snippet, "...")
    print("Actual category: ", actual)
    print("predicted category: ", predicted)

Group to Propose New High-Speed Wireless Format  L ...
Actual category:  Science-Technology News
predicted category:  Science-Technology News
Socialites unite dolphin groups Dolphin groups, or ...
Actual category:  Science-Technology News
predicted category:  Science-Technology News
Rocking the Cradle of Life When did life begin? On ...
Actual category:  Science-Technology News
predicted category:  Science-Technology News
IBM Chips May Someday Heal Themselves New technolo ...
Actual category:  Science-Technology News
predicted category:  Science-Technology News
Giddy Phelps Touches Gold for First Time Michael P ...
Actual category:  Sports News
predicted category:  Sports News
They've caught his eye In  quot;helping themselves ...
Actual category:  Sports News
predicted category:  Sports News
Explosions Echo Throughout Najaf NAJAF, Iraq - Exp ...
Actual category:  World News
predicted category:  World News
Iran Warns Its Missiles Can Hit Anywhere in Israel ...
Actual category:  World N

In [98]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    precision_score,
    recall_score,
)
from tensorflow.keras.utils import plot_model

# Collapse one-hot labels and predicted class probabilities to class ids.
y_test_arg = y_test.argmax(axis=1)
Y_pred = prediction.argmax(axis=1)

# Confusion matrix: rows are the actual classes, columns the predicted ones.
print(confusion_matrix(y_test_arg, Y_pred))
print(classification_report(y_test_arg, Y_pred))

[[1636   81  127   56]
 [  22 1838   28   12]
 [  61   31 1626  182]
 [  65   26  191 1618]]
              precision    recall  f1-score   support

           0       0.92      0.86      0.89      1900
           1       0.93      0.97      0.95      1900
           2       0.82      0.86      0.84      1900
           3       0.87      0.85      0.86      1900

    accuracy                           0.88      7600
   macro avg       0.88      0.88      0.88      7600
weighted avg       0.88      0.88      0.88      7600

