In [3]:
import tensorflow as tf
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import numpy as np

df = pd.read_csv('data/df_article_text.csv', sep=',')

In [2]:
print(tf.__version__)

2.1.0


In [41]:
X = df['article_text']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [42]:
X_train=X_train.reset_index(drop=True)

In [43]:
X_train

0       The CIA is the second Western intelligence age...
1       President Trump's recent decision to leave the...
2       A Michigan woman shocked the Internets on Satu...
3       UK Ministry of Defense 2010: “Choosing to be d...
4       Michael Snyder  Economic Collapse  May 7, 2020...
5         Edward Meyer School victory garden on First ...
6       COVID Ethics: It’s Immoral to Confine Innocent...
7       Dave Hodges and JR Nyquist warn of China’s pla...
8       Representational image | Photo Credit: Twitter...
9       California Gov. Gavin Newsom says the state wi...
10      Hundreds of thousands of Indian contract worke...
11      Good Wednesday morning.  Here is what’s on the...
12      This post was originally published on this sit...
13      This post was originally published on this sit...
14      India backed calls this week for an independen...
15      Summary:  Hong Kong reports no new local coron...
16      All across the U.S., Democrats have been relea...
17      The “G

In [62]:
vocab_size = 9000
embedding_dim = 16
max_length = 120
trunc_type = 'post'
oov_tok = '<OOV>'

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [63]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token = oov_tok)
tokenizer.fit_on_texts(X_train)

In [64]:
word_index = tokenizer.word_index

In [65]:
sequences = tokenizer.texts_to_sequences(X_train)

In [66]:
padded = pad_sequences(sequences, maxlen = max_length, truncating=trunc_type)

In [67]:
testing_sequences = tokenizer.texts_to_sequences(X_test)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, truncating=trunc_type)

In [68]:
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

def decode_review(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])

print(decode_review(padded[3]))
print(X_train[3])

uk ministry of defense 2010 <OOV> to be <OOV> may be considered suspicious ” <OOV> general robert s <OOV> said in a 2019 interview that humans will not be able to opt out of the <OOV> 5g system 12 years ago old thinker news asked the question <OOV> dissidents potentially be <OOV> <OOV> and denied access to <OOV> payment systems and transit systems as if they were a banned web page in the <OOV> of <OOV> a <OOV> reality has emerged since this report was published the infrastructure for this system of control has been built the 5g network will enable the rollout of a vast command and control grid that will monitor people places and things <OOV> in real
UK Ministry of Defense 2010: “Choosing to be disconnected may be considered suspicious.”  Brigadier General Robert S. Spalding said in a 2019 interview that humans will not be able to opt out of the pervasive 5G system:  12 years ago Old-Thinker News asked the question: “Could dissidents potentially be electronically blacklisted and denied 

In [50]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(6, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 120, 16)           160000    
_________________________________________________________________
flatten (Flatten)            (None, 1920)              0         
_________________________________________________________________
dense (Dense)                (None, 6)                 11526     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 7         
Total params: 171,533
Trainable params: 171,533
Non-trainable params: 0
_________________________________________________________________


In [51]:
num_epochs = 10
model.fit(padded, y_train, epochs=num_epochs, validation_data=(testing_padded, y_test))

Train on 6528 samples, validate on 2176 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fdb7c3db4e0>

In [52]:
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape) # shape: (vocab_size, embedding_dim)

(10000, 16)


In [53]:
import io

out_v = io.open('vecs.tsv', 'w', encoding='utf-8')
out_m = io.open('meta.tsv', 'w', encoding='utf-8')

for word_num in range(1, vocab_size):
  word = reverse_word_index[word_num]
  embeddings = weights[word_num]
  out_m.write(word + "\n")
  out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")
    
out_v.close()
out_m.close()

In [70]:
reverse_word_index.nrow

AttributeError: 'dict' object has no attribute 'nrow'

In [75]:
len(word_index.keys())

143016

In [76]:
len(X_train)

6528

In [77]:
len(y_train)

6528