In [2]:
from collections import Counter
import numpy as np
import nltk
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf

In [3]:
dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                             remove=('headers', 'footers', 'quotes'))

In [4]:
num_of_features = 5000
num_of_examples = 10000
corpus = dataset.data[:num_of_examples]
corpus = [doc.lower() for doc in corpus]
targets = dataset.target[:num_of_examples]

In [5]:
class TransformTokenizer:
    def __init__(self, transform):
        self.transform = transform
        
    def __call__(self, doc):
        return [self.transform(word) for word in nltk.word_tokenize(doc)]

In [6]:
stemmer = nltk.stem.PorterStemmer()
tfidf_vectorizer_stemming = TfidfVectorizer(tokenizer=TransformTokenizer(stemmer.stem), max_df=0.95, min_df=2, max_features=num_of_features, stop_words='english')
bows = tfidf_vectorizer_stemming.fit_transform(corpus)

  'stop_words.' % sorted(inconsistent))


In [8]:
X_train, X_test, y_train, y_test = train_test_split(bows, targets, test_size=0.1, shuffle=False)

In [9]:
y_train

array([17,  0, 17, ..., 11,  4,  5])

In [16]:
X_train[1]

array([0.13471192, 0.        , 0.        , ..., 0.        , 0.        ,
       0.        ])

In [10]:
np.unique(y_train)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [11]:
num_of_classes = np.unique(y_train).shape[0]
num_of_classes

20

In [18]:
# we need to encode y_train and y_test to categorical encoding
print(y_train[:10])
y_train = tf.keras.utils.to_categorical(y_train)
print(y_train[:10])
y_test = tf.keras.utils.to_categorical(y_test)

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]]
[[[1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0.]
  [0. 1.]
  [1. 0.]
  [1. 0.]]

 [[0. 1.]
  [1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0.]
  [1. 0

In [20]:
# the problem: we have sparse arrays, but neural networks need dense arrays!
# the solution will be word embeddings, here we just convert to dense arrays
X_train = X_train.toarray()
X_test = X_test.toarray()

AttributeError: 'numpy.ndarray' object has no attribute 'toarray'

### softmax and cross-entropy


$\sigma (\mathbf {z} )_{i}={\frac {e^{z_{i}}}{\sum _{j=1}^{K}e^{z_{j}}}}$
${\text{ for }}i=1,\dotsc ,K{\text{ and }}$
$\mathbf {z} =(z_{1},\dotsc ,z_{K})\in \mathbb {R} ^{K}$

$H(p,q)=-\sum _{x\in {\mathcal {X}}}p(x)\,\log q(x)$

$H(y, \hat{y}) = - \sum_{i=1}^{K} y_i * log(\hat{y}_i)$

In [None]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(20, activation=tf.keras.activations.relu),
    tf.keras.layers.Dense(num_of_classes, activation=tf.keras.activations.softmax)
])

In [None]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
model.fit(X_train, y_train, epochs=10)

In [None]:
# the test accuracy is less => we are overfitting
model.evaluate(X_test, y_test)

## Word embeddings
They are dense, usually 300 dimensional vectors. There is a vector for each word in the vocabulary.

### Words to numbers

In [None]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=num_of_features)
tokenizer.fit_on_texts(corpus)

In [None]:
sequences = tokenizer.texts_to_sequences(corpus)

In [None]:
print(sequences[0])

### Padding

In [None]:
[len(sequence) for sequence in sequences[:10]]

In [None]:
sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen=1000)

In [None]:
[len(sequence) for sequence in sequences[:10]]

In [None]:
print(sequences[0])

### Training

In [None]:
X_train, X_test, y_train, y_test = train_test_split(sequences, targets, test_size=0.1, shuffle=False)

In [None]:
print(y_train[:10])
y_train = tf.keras.utils.to_categorical(y_train)
print(y_train[:10])
y_test = tf.keras.utils.to_categorical(y_test)

In [None]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(num_of_features, 20),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.Dense(num_of_classes, activation=tf.keras.activations.softmax)
])

In [None]:
model.summary()

In [None]:
model.compile(optimizer=tf.keras.optimizers.Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
model.fit(X_train, y_train, epochs=100, validation_split=0.1)

In [None]:
model.evaluate(X_test, y_test)