In [1]:
!nvidia-smi

Sat Dec 18 16:23:44 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   31C    P0    22W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D, SpatialDropout1D, Conv1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [3]:
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip
!unzip NewsAggregatorDataset.zip

--2021-12-18 16:23:54--  https://archive.ics.uci.edu/ml/machine-learning-databases/00359/NewsAggregatorDataset.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 29224203 (28M) [application/x-httpd-php]
Saving to: ‘NewsAggregatorDataset.zip’


2021-12-18 16:23:57 (11.6 MB/s) - ‘NewsAggregatorDataset.zip’ saved [29224203/29224203]

Archive:  NewsAggregatorDataset.zip
  inflating: 2pageSessions.csv       
   creating: __MACOSX/
  inflating: __MACOSX/._2pageSessions.csv  
  inflating: newsCorpora.csv         
  inflating: __MACOSX/._newsCorpora.csv  
  inflating: readme.txt              
  inflating: __MACOSX/._readme.txt   


In [4]:
# 読込時のエラー回避のためダブルクォーテーションをシングルクォーテーションに置換
!sed -e 's/"/'\''/g' ./newsCorpora.csv > ./newsCorpora_re.csv

In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

# データの読込
df = pd.read_csv('./newsCorpora_re.csv', header=None, sep='\t', names=['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP'])

# データの抽出
df = df.loc[df['PUBLISHER'].isin(['Reuters', 'Huffington Post', 'Businessweek', 'Contactmusic.com', 'Daily Mail']), ['TITLE', 'CATEGORY']]

# データの分割
train, valid_test = train_test_split(df, test_size=0.2, shuffle=True, random_state=123, stratify=df['CATEGORY'])
valid, test = train_test_split(valid_test, test_size=0.5, shuffle=True, random_state=123, stratify=valid_test['CATEGORY'])
train.reset_index(drop=True, inplace=True)
valid.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)

print(train.head())

                                               TITLE CATEGORY
0  REFILE-UPDATE 1-European car sales up for sixt...        b
1  Amazon Plans to Fight FTC Over Mobile-App Purc...        t
2  Kids Still Get Codeine In Emergency Rooms Desp...        m
3  What On Earth Happened Between Solange And Jay...        e
4  NATO Missile Defense Is Flight Tested Over Hawaii        b


In [6]:
# FastTextダウンロード
!pip install fasttext > /dev/null
import fasttext

In [7]:
embed_size = 300 # how big is each word vector
max_features = 14724 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100 # max number of words in a comment to use

In [8]:
label_to_num = {"b": 0, "e": 1, "m": 2, "t": 3}

In [9]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
!gzip -d cc.en.300.bin.gz

--2021-12-18 16:24:43--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.75.142, 172.67.9.4, 104.22.74.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.75.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4503593528 (4.2G) [application/octet-stream]
Saving to: ‘cc.en.300.bin.gz’


2021-12-18 16:30:48 (11.8 MB/s) - ‘cc.en.300.bin.gz’ saved [4503593528/4503593528]



In [10]:
FASTTEXT_MODEL_BIN = "cc.en.300.bin"
#this works
ft_model = fasttext.load_model(FASTTEXT_MODEL_BIN)
ft_model.get_word_vector("additional").shape



(300,)

In [11]:
list_sentences_train = train["TITLE"].values
list_sentences_valid = valid["TITLE"].values
list_sentences_test = test["TITLE"].values
ohe = OneHotEncoder()
y_tr = ohe.fit_transform(np.array(train["CATEGORY"]).reshape(-1,1)).toarray()

In [12]:
y_tr

array([[1., 0., 0., 0.],
       [0., 0., 0., 1.],
       [0., 0., 1., 0.],
       ...,
       [1., 0., 0., 0.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.]])

In [13]:
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_valid = tokenizer.texts_to_sequences(list_sentences_valid)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)
X_tr = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_vl = pad_sequences(list_tokenized_valid, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)

In [14]:
ft_model

<fasttext.FastText._FastText at 0x7f8024d65a90>

In [15]:
def get_coefs(word): 
  return word, ft_model.get_word_vector(word)
embeddings_index = dict(get_coefs(word) for word in ft_model.words)

In [16]:
'''
Use these vectors to create our embedding matrix, 
with random initialization for words that aren't in GloVe. 
We'll use the same mean and stdev of embeddings the GloVe has 
when generating the random init.
'''
all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()
emb_mean,emb_std

  if self.run_code(code, result):


(0.0019600347, 0.07324928)

In [17]:
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [18]:

embed_size=300
inp = Input(shape=(maxlen, ), name="text")
x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
x = SpatialDropout1D(0.2)(x)
z = GlobalMaxPool1D()(x)
x = GlobalMaxPool1D()(Conv1D(embed_size, 4, activation="relu")(x))
x = Dropout(0.3)(x)
x = Dense(4, activation="sigmoid")(x)
model = Model(inputs=[inp], outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [19]:
model.fit(X_tr, y_tr, batch_size=32, epochs=2, validation_split=0)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f800d378390>

In [20]:
y_te = ohe.transform(np.array(test["CATEGORY"]).reshape(-1,1)).toarray()
results = model.evaluate(X_te, y_te, batch_size=128)



In [21]:
y_te = y_te.argmax(axis=1)

In [22]:
from sklearn.metrics import roc_auc_score, log_loss, accuracy_score
y_pred = model.predict(X_te)
y_pred_argmax = y_pred.argmax(axis=1)
print('{:.4f}'.format(accuracy_score(y_te, y_pred_argmax)))
y_te = ohe.transform(np.array(test["CATEGORY"]).reshape(-1,1)).toarray()
print('{:.4f}'.format(roc_auc_score(y_te, y_pred, multi_class='ovo')))
print('{:.4f}'.format(log_loss(y_te, y_pred)))

0.9319
0.9822
0.2207


In [23]:
y_vl = ohe.transform(np.array(valid["CATEGORY"]).reshape(-1,1)).toarray()
y_vl = y_vl.argmax(axis=1)
y_pred = model.predict(X_vl)
y_pred_argmax = y_pred.argmax(axis=1)
print('{:.4f}'.format(accuracy_score(y_vl, y_pred_argmax)))
y_vl = ohe.transform(np.array(valid["CATEGORY"]).reshape(-1,1)).toarray()
print('{:.4f}'.format(roc_auc_score(y_vl, y_pred, multi_class='ovo')))
print('{:.4f}'.format(log_loss(y_vl, y_pred)))

0.9132
0.9800
0.2684
