In [1]:
from sklearn.datasets import fetch_20newsgroups

categories = ['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball']

# Design a model that we can train

In [2]:
from sklearn.datasets import fetch_20newsgroups
from bs4 import BeautifulSoup
import nltk, re
import nltk
nltk.download('punkt')

from gensim.models import word2vec

# Define a function named news_ to_ Sentences strips out the sentences in each news one by one,
# And return a sentence one by one, and return a sentence list.
def news_to_sentences(news):
    news_text = BeautifulSoup(news).get_text()
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_sentences = tokenizer.tokenize(news_text)
    sentences = []
    for sent in raw_sentences:
        sentences.append(re.sub('[^a-zA-Z]', ' ', sent.lower().strip()).split())
    return sentences


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
news_groups = fetch_20newsgroups(subset='all',categories=categories)

X,y  = news_groups.data, news_groups.target

print ('Total texts in train:', len(news_groups.data))
print ('Total texts in test:', len(news_groups.target))

Total texts in train: 9645
Total texts in test: 9645


In [4]:
sentences = []

for x in X:
    sentences += news_to_sentences(x)

num_features = 300
min_word_count = 20
num_workers = 2
context = 5
downsampling = 1e-3

model = word2vec.Word2Vec(sentences, workers=num_workers,
                          size=num_features, min_count=min_word_count,
                          window=context, sample=downsampling)

In [5]:
model.init_sims(replace=True)
model.most_similar('email')

  


[('contact', 0.8059289455413818),
 ('replies', 0.7544243335723877),
 ('address', 0.7397332787513733),
 ('send', 0.7225940823554993),
 ('respond', 0.7182400226593018),
 ('mail', 0.7181669473648071),
 ('sas', 0.7145587801933289),
 ('subscribe', 0.6780484318733215),
 ('chintan', 0.6753007173538208),
 ('responses', 0.6540365815162659)]

In [6]:
model.similarity('must','tell')

  """Entry point for launching an IPython kernel.


-0.12986755

In [7]:
model.wv.save("model")
model.wv.load("model")

<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x7f580978e2d0>

# Train

In [8]:
# More imports
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Maximum number of features should the tokenizer extract
max_words = 500
tokenizer = Tokenizer(num_words = max_words)
# Fit the tokenizer on our text
tokenizer.fit_on_texts(X)
# Get all words that the tokenizer knows
word_index = tokenizer.word_index
print(word_index)



In [9]:
import pandas as pd
X1 = tokenizer.texts_to_sequences(X)
X1 = pad_sequences(X1)
print(X1,"shape:",X1.shape)

# Prepare the labels
y1 = pd.get_dummies(y)
print(y1,"shape:",y1.shape)

[[  0   0   0 ...   3  22  45]
 [  0   0   0 ... 315  33  33]
 [  0   0   0 ... 124   3  38]
 ...
 [  0   0   0 ...  74  44 370]
 [  0   0   0 ...  65  10   1]
 [  0   0   0 ...  19 213 128]] shape: (9645, 17837)
      0  1  2  3  4  5  6  7  8  9
0     1  0  0  0  0  0  0  0  0  0
1     0  0  0  0  0  1  0  0  0  0
2     0  0  0  0  0  0  0  0  0  1
3     0  0  0  0  0  0  0  0  1  0
4     0  0  0  0  0  0  1  0  0  0
...  .. .. .. .. .. .. .. .. .. ..
9640  1  0  0  0  0  0  0  0  0  0
9641  0  0  0  0  0  0  1  0  0  0
9642  0  0  0  0  0  0  0  0  1  0
9643  0  0  0  0  0  0  0  0  0  1
9644  0  0  0  0  0  0  0  0  0  1

[9645 rows x 10 columns] shape: (9645, 10)


In [10]:
X_train, X_test, y_train, y_test = train_test_split(X1, y1, test_size=0.1, shuffle=False)

print(len(X_train))
print(len(X_test))

8680
965


In [11]:
import numpy as np
embedding_vector_length = 300

embedding_matrix = np.zeros((len(model.wv.vocab) + 1, embedding_vector_length))
for i, vec in enumerate(model.wv.vectors):
  embedding_matrix[i] = vec

print(embedding_matrix.shape)

(8550, 300)


In [12]:
filters = 250
kernel_size = 3
hidden_dims = 125
import tensorflow
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding,Conv1D,GlobalMaxPooling1D,Activation

wmodel = Sequential()
wmodel.add(Embedding(len(model.wv.vocab) + 1,embedding_vector_length,input_length=X1.shape[1],weights=[embedding_matrix],trainable=False))
#model.add(Dropout(0.2))
wmodel.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
wmodel.add(GlobalMaxPooling1D())

wmodel.add(Dense(hidden_dims))
#model.add(Dropout(0.2))
wmodel.add(Activation('relu'))

wmodel.add(Dense(10))
wmodel.add(Activation('sigmoid'))

wmodel.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

wmodel.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 17837, 300)        2565000   
                                                                 
 conv1d (Conv1D)             (None, 17835, 250)        225250    
                                                                 
 global_max_pooling1d (Globa  (None, 250)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 125)               31375     
                                                                 
 activation (Activation)     (None, 125)               0         
                                                                 
 dense_1 (Dense)             (None, 10)                1260      
                                                        

In [13]:
# 11 - Train the neural network model
from keras.callbacks import EarlyStopping

batch_size = 128
wmodel.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=20,
          validation_split=0.2,
          callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)]
          )


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20


<keras.callbacks.History at 0x7f579a5fe390>

In [15]:
val_loss, val_acc = wmodel.evaluate(X_test, y_test,
                                   batch_size=batch_size,
                                   verbose=0)
print("Test loss:",val_loss)
print("Test accuracy: %.2f%%" % (val_acc*100))

Test loss: 0.1354576051235199
Test accuracy: 76.58%


In [46]:
from sklearn.metrics import classification_report
y_pred = wmodel.predict(X_test,batch_size=batch_size)

In [47]:
for i in range(len(y_pred)):
        max_value=max(y_pred[i])
        for j in range(len(y_pred[i])):
            if max_value==y_pred[i][j]:
                y_pred[i][j]=1
            else:
                y_pred[i][j]=0

In [52]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.91      0.85        79
           1       0.54      0.78      0.64        78
           2       0.77      0.85      0.81       105
           3       0.61      0.59      0.60       107
           4       0.80      0.69      0.74       108
           5       0.88      0.74      0.80        94
           6       0.80      0.82      0.81        95
           7       0.80      0.69      0.74        94
           8       0.87      0.82      0.85       107
           9       0.89      0.80      0.84        98

   micro avg       0.77      0.77      0.77       965
   macro avg       0.77      0.77      0.77       965
weighted avg       0.78      0.77      0.77       965
 samples avg       0.77      0.77      0.77       965



In [57]:
from sklearn.metrics import confusion_matrix
confusion_matrix = confusion_matrix(y_test.values.argmax(axis=1),y_pred.argmax(axis=1))
print(confusion_matrix)

[[72  0  0  1  4  0  0  1  0  1]
 [ 2 61  2  5  1  2  2  1  2  0]
 [ 0  6 89  3  2  2  0  1  1  1]
 [ 3 11 13 63  5  0  4  4  3  1]
 [ 1  5  4 17 75  2  2  0  1  1]
 [ 0 12  3  4  1 70  2  1  1  0]
 [ 0  4  2  4  2  0 78  2  1  2]
 [ 2  7  2  3  2  2  4 65  4  3]
 [ 3  3  1  2  1  0  4  4 88  1]
 [ 8  4  0  2  1  2  1  2  0 78]]
