In [None]:
import os
import sys
import numpy as np
import pandas as pd
import tensorflow
from tensorflow import keras
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding
from keras.layers import Dense, Input
from keras.models import Model
from nltk.stem import SnowballStemmer
from nltk.stem.isri import ISRIStemmer
from textblob import TextBlob

In [None]:
import nltk
# Fetch the NLTK data this notebook relies on. 'stopwords' backs the
# stopwords.words('arabic') lookup used by stem() further down.
nltk.download('stopwords')
# NOTE(review): 'punkt' is presumably needed by TextBlob's word tokenizer — confirm.
nltk.download('punkt')

In [None]:
from nltk.corpus import stopwords
import re

In [None]:
# Resolve the working directory with the standard library instead of the
# IPython-only `%pwd` magic, so this cell is valid Python and also works when
# the notebook is exported and run as a plain script.
CURRENT_DIR_PATH = os.getcwd()
# The corpus is expected in ./categories, one sub-folder per class.
TEXT_DATA_DIR = os.path.join(CURRENT_DIR_PATH, 'categories')
print(CURRENT_DIR_PATH)

C:\Users\thaze


In [None]:
# Accumulators filled while reading the corpus:
#   texts        - raw document strings
#   labels_index - category folder name -> integer class id
#   labels       - integer class id of each entry in `texts` (same order)
texts, labels_index, labels = [], {}, []

In [None]:
# Walk TEXT_DATA_DIR: every sub-directory is one class, every file inside it
# is one document of that class. Entries are visited in sorted order so the
# class ids are assigned deterministically.
for category in sorted(os.listdir(TEXT_DATA_DIR)):
    category_path = os.path.join(TEXT_DATA_DIR, category)
    print(category_path)
    if not os.path.isdir(category_path):
        continue

    # Next free id: the number of categories registered so far.
    label_id = len(labels_index)
    labels_index[category] = label_id

    for file_name in sorted(os.listdir(category_path)):
        with open(os.path.join(category_path, file_name), encoding='utf-8') as handle:
            content = handle.read()

        # Drop everything before the first blank line, when there is one
        # that is not at the very start of the file.
        header_end = content.find('\n\n')
        if header_end > 0:
            content = content[header_end:]

        texts.append(content)
        labels.append(label_id)

print('Found %s texts.' % len(texts))
print('nLabels = ', len(labels))
print('Classes are:\n ')
for class_name in labels_index:
    print(class_name)

C:\Users\thaze\categories\Medical
C:\Users\thaze\categories\Politics
C:\Users\thaze\categories\Sports
Found 19500 texts.
nLabels =  19500
Classes are:
 
Medical
Politics
Sports


In [None]:
ArListem = ISRIStemmer()
# Load the Arabic stop-word list once. The previous version called
# stopwords.words('arabic') for every single word and searched the resulting
# list linearly; a frozenset gives constant-time membership tests.
ARABIC_STOPWORDS = frozenset(stopwords.words('arabic'))


def stem(text):
    """Remove Arabic stop words from ``text`` and stem the remaining words.

    Args:
        text: Raw document string.

    Returns:
        A single string with the ISRI stems of all non-stop-words, in their
        original order, joined by single spaces.
    """
    return " ".join(
        ArListem.stem(word)
        for word in TextBlob(text).words
        if word not in ARABIC_STOPWORDS
    )

In [None]:
# Replace every raw document by its stemmed, stop-word-free form.
# NOTE: this overwrites `texts`, so re-running the cell stems the text again.
texts = list(map(stem, texts))

In [None]:
# Fit the vocabulary on the stemmed corpus. `num_words` caps the ids used when
# converting texts to sequences; `word_index` itself still lists every word
# that was seen during fitting.
tokenizer = Tokenizer(num_words=20000)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
# One list of integer word ids per document.
token_x = tokenizer.texts_to_sequences(texts)

In [None]:
from keras.preprocessing.sequence import pad_sequences
# Fixed sequence length fed to the network (the Embedding layer below is
# declared with the same length).
maxlen =1000
# Bring every id sequence to exactly `maxlen` entries so the data forms a
# rectangular 2-D array.
token_x = pad_sequences(token_x,maxlen=maxlen)

In [None]:
# One-hot encode the integer class ids: one row per document, one column per class.
labels_matrix = to_categorical(np.asarray(labels))

In [None]:
VALIDATION_SPLIT = 0.2
# Fixed seed so the train/validation partition (and therefore the reported
# metrics) is the same on every Restart & Run All.
SPLIT_SEED = 42

# Shuffle features and labels with one shared permutation so rows stay aligned.
n_samples = token_x.shape[0]
indices = np.random.default_rng(SPLIT_SEED).permutation(n_samples)
data_shuffled = token_x[indices]
labels_shuffled = labels_matrix[indices]

nb_validation_samples = int(VALIDATION_SPLIT * n_samples)
# Slice at an explicit split point. The negative-index form
# `[:-nb_validation_samples]` silently yields an EMPTY training set when the
# validation count rounds down to 0 (because `[:-0]` is `[:0]`).
split_at = n_samples - nb_validation_samples
x_train = data_shuffled[:split_at]
y_train = labels_shuffled[:split_at]
x_val = data_shuffled[split_at:]
y_val = labels_shuffled[split_at:]

In [None]:
# Number of rows for the embedding matrix: one per known word plus one extra.
# NOTE(review): the extra row is presumably for index 0, which the tokenizer
# does not assign to any word — confirm.
vocab_size = 1 + len(tokenizer.word_index)
print(len(word_index))

73361


In [None]:
import gensim
from textblob import TextBlob

In [None]:
def temp_split(text):
    """Return the TextBlob word list (``.words``) of ``text``."""
    return TextBlob(text).words


# Tokenized corpus: one word list per (already stemmed) document.
corpus = list(map(temp_split, texts))

In [None]:
# Train Word2Vec embeddings on the tokenized corpus; words seen fewer than
# `min_count` times are excluded from the Word2Vec vocabulary.
w2v_model = gensim.models.Word2Vec(
    sentences=corpus,
    vector_size=100,  # the earlier form of this call passed this as `size=100`
    window=10,
    min_count=2,
    workers=4,
)

In [None]:
def get_matrix(model, word_index=None, n_rows=None):
    """Build the initial embedding weight matrix from a trained Word2Vec model.

    Row ``i`` receives the Word2Vec vector of the word whose tokenizer index
    is ``i``. Rows without a matching Word2Vec entry stay all-zero.

    Fixes a bug in the previous version, which stored
    ``model.wv.key_to_index[word]`` (an integer vocabulary position, broadcast
    across the whole row) instead of the word's embedding vector.

    Args:
        model: Trained model exposing ``model.wv`` with ``key_to_index``,
            ``vector_size`` and item lookup by word (gensim ``Word2Vec``).
        word_index: Mapping ``word -> row index``. Defaults to the notebook's
            ``tokenizer.word_index``.
        n_rows: Number of rows of the matrix. Defaults to the notebook's
            ``vocab_size``.

    Returns:
        ``numpy.ndarray`` of shape ``(n_rows, model.wv.vector_size)``.
    """
    if word_index is None:
        word_index = tokenizer.word_index
    if n_rows is None:
        n_rows = vocab_size

    keyed_vectors = model.wv
    weight_matrix = np.zeros((n_rows, keyed_vectors.vector_size))
    for word, i in word_index.items():
        if word in keyed_vectors.key_to_index:
            # Copy the actual embedding vector, not the vocabulary index.
            weight_matrix[i] = keyed_vectors[word]
    return weight_matrix

In [None]:
# Weight matrix derived from the Word2Vec model; passed to the Embedding
# layer below through its `weights=` argument.
embedding_vectors =get_matrix(w2v_model)

In [None]:
from tensorflow.keras.layers import LSTM,Dropout

In [None]:
# Stacked-LSTM classifier on top of the Word2Vec-initialised embedding.
model = tensorflow.keras.Sequential()
# Embedding dimensions are derived from the data actually fed in (the weight
# matrix width and the padding length) instead of repeating magic numbers.
model.add(Embedding(vocab_size,
                    output_dim=embedding_vectors.shape[1],
                    weights=[embedding_vectors],
                    input_length=maxlen))
model.add(LSTM(units=256, return_sequences=True))
model.add(Dense(128, activation='relu'))
model.add(LSTM(units=128, return_sequences=True))
model.add(Dropout(0.4))
# Last recurrent layer returns only its final state: one vector per document.
model.add(LSTM(units=128))
# Each document belongs to exactly one class and the labels are one-hot, so
# the output must be a probability distribution over the classes: softmax.
# The previous 'sigmoid' scored each class independently, which does not match
# the categorical_crossentropy loss used at compile time.
model.add(Dense(len(labels_index), activation='softmax'))

In [None]:
# categorical_crossentropy pairs with the one-hot label matrix produced by
# to_categorical; 'acc' adds accuracy to the values reported by fit/evaluate.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])


In [None]:
# Print the layer-by-layer output shapes and parameter counts as a sanity check.
model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_10 (Embedding)    (None, 1000, 100)         7336200   
                                                                 
 lstm_28 (LSTM)              (None, 1000, 256)         365568    
                                                                 
 dense_14 (Dense)            (None, 1000, 128)         32896     
                                                                 
 lstm_29 (LSTM)              (None, 1000, 128)         131584    
                                                                 
 dropout_6 (Dropout)         (None, 1000, 128)         0         
                                                                 
 lstm_30 (LSTM)              (None, 128)               131584    
                                                                 
 dense_15 (Dense)            (None, 3)               

In [None]:
# Train on the training split; the held-out split is scored after every epoch.
model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=10,
    batch_size=128,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x285f0124988>

In [None]:
# x_val / y_val are the hold-out split that was passed to fit() as
# validation_data, so this is a validation score, not an independent test
# score. evaluate() returns the loss followed by the compiled 'acc' metric.
print('Accuracy on validation set:')
model.evaluate(x_val, y_val)

Acuracy on testing set:


[0.31389114260673523, 0.8835897445678711]

In [None]:
# Classify one document from the padded data set and report its category name.
sample = 1
# predict() works on batches, so present the single sequence as a batch of one.
label_vec = model.predict(token_x[sample].reshape(1, -1))
label_id = np.argmax(label_vec)
# Reverse lookup in labels_index (name -> id); stays '' when no id matches.
label_name = next(
    (name for name, ID in labels_index.items() if label_id == ID),
    '',
)
print ('The category of article no %s is %s' %(sample ,label_name))

The category of article no 1 is Medical
