# Text classification with Reuters-21578 datasets

### See: https://kdd.ics.uci.edu/databases/reuters21578/README.txt for more information

In [None]:
%pylab inline

In [39]:
import re
import xml.sax.saxutils as saxutils

from bs4 import BeautifulSoup
from gensim.models.word2vec import Word2Vec

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, LSTM

from multiprocessing import cpu_count

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer, sent_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

from pandas import DataFrame

from sklearn.cross_validation import train_test_split

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ty020\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ty020\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ty020\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


## General constants (modify them according to you environment)

In [17]:
# Set Numpy random seed
from numpy import *
np.random.seed(1000)

# Newsline folder and format
data_folder = 'D:\\swcontest\\elmo\\Reuters-21578-Classification-master\\reuters21578\\'

sgml_number_of_files = 17
sgml_file_name_template = 'reut2-NNN.sgm'

# Category files
category_files = {
    'to_': ('Topics', 'all-topics-strings.lc.txt'),
    'pl_': ('Places', 'all-places-strings.lc.txt'),
    'pe_': ('People', 'all-people-strings.lc.txt'),
    'or_': ('Organizations', 'all-orgs-strings.lc.txt'),
    'ex_': ('Exchanges', 'all-exchanges-strings.lc.txt')
}

# Word2Vec number of features
num_features = 500
# Limit each newsline to a fixed number of words
document_max_num_words = 100
# Selected categories
selected_categories = ['pl_usa']

## Prepare documents and categories

In [18]:
# Create category dataframe

# Read all categories
category_data = []

for category_prefix in category_files.keys():
    with open(data_folder + category_files[category_prefix][1], 'r') as file:
        for category in file.readlines():
            category_data.append([category_prefix + category.strip().lower(), 
                                  category_files[category_prefix][0], 
                                  0])

# Create category dataframe
news_categories = DataFrame(data=category_data, columns=['Name', 'Type', 'Newslines'])

In [29]:
def update_frequencies(categories):
    for category in categories:
        idx = news_categories[news_categories.Name == category].index[0]
        f = news_categories.get_value(idx, 'Newslines')
        news_categories.set_value(idx, 'Newslines', f+1)
    
def to_category_vector(categories, target_categories):
    vector = zeros(len(target_categories)).astype(float32)
    
    for i in range(len(target_categories)):
        if target_categories[i] in categories:
            vector[i] = 1.0
    
    return vector

In [30]:
# Parse SGML files
document_X = {}
document_Y = {}

def strip_tags(text):
    return re.sub('<[^<]+?>', '', text).strip()

def unescape(text):
    return saxutils.unescape(text)

# Iterate all files
for i in range(sgml_number_of_files):
    if i < 10:
        seq = '00' + str(i)
    else:
        seq = '0' + str(i)
        
    file_name = sgml_file_name_template.replace('NNN', seq)
    print('Reading file: %s' % file_name)
    
    with open(data_folder + file_name, 'r') as file:
        content = BeautifulSoup(file.read().lower())
        
        for newsline in content('reuters'):
            document_categories = []
            
            # News-line Id
            document_id = newsline['newid']
            
            # News-line text
            document_body = strip_tags(str(newsline('text')[0].body)).replace('reuter\n&#3;', '')
            document_body = unescape(document_body)
            
            # News-line categories
            topics = newsline.topics.contents
            places = newsline.places.contents
            people = newsline.people.contents
            orgs = newsline.orgs.contents
            exchanges = newsline.exchanges.contents
            
            for topic in topics:
                document_categories.append('to_' + strip_tags(str(topic)))
                
            for place in places:
                document_categories.append('pl_' + strip_tags(str(place)))
                
            for person in people:
                document_categories.append('pe_' + strip_tags(str(person)))
                
            for org in orgs:
                document_categories.append('or_' + strip_tags(str(org)))
                
            for exchange in exchanges:
                document_categories.append('ex_' + strip_tags(str(exchange)))
                
            # Create new document    
            update_frequencies(document_categories)
            
            document_X[document_id] = document_body
            document_Y[document_id] = to_category_vector(document_categories, selected_categories)

Reading file: reut2-000.sgm




 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))
  after removing the cwd from sys.path.
  """


Reading file: reut2-001.sgm
Reading file: reut2-002.sgm
Reading file: reut2-003.sgm
Reading file: reut2-004.sgm
Reading file: reut2-005.sgm
Reading file: reut2-006.sgm
Reading file: reut2-007.sgm
Reading file: reut2-008.sgm
Reading file: reut2-009.sgm
Reading file: reut2-010.sgm
Reading file: reut2-011.sgm
Reading file: reut2-012.sgm
Reading file: reut2-013.sgm
Reading file: reut2-014.sgm
Reading file: reut2-015.sgm
Reading file: reut2-016.sgm


In [31]:
document_X

{'1': 'None',
 '2': 'None',
 '3': 'None',
 '4': 'None',
 '5': 'None',
 '6': 'None',
 '7': 'None',
 '8': 'None',
 '9': 'None',
 '10': 'None',
 '11': 'None',
 '12': 'None',
 '13': 'None',
 '14': 'None',
 '15': 'None',
 '16': 'None',
 '17': 'None',
 '18': 'None',
 '19': 'None',
 '20': 'None',
 '21': 'None',
 '22': 'None',
 '23': 'None',
 '24': 'None',
 '25': 'None',
 '26': 'None',
 '27': 'None',
 '28': 'None',
 '29': 'None',
 '30': 'None',
 '31': 'None',
 '32': 'None',
 '33': 'None',
 '34': 'None',
 '35': 'None',
 '36': 'None',
 '37': 'None',
 '38': 'None',
 '39': 'None',
 '40': 'None',
 '41': 'None',
 '42': 'None',
 '43': 'None',
 '44': 'None',
 '45': 'None',
 '46': 'None',
 '47': 'None',
 '48': 'None',
 '49': 'None',
 '50': 'None',
 '51': 'None',
 '52': 'None',
 '53': 'None',
 '54': 'None',
 '55': 'None',
 '56': 'None',
 '57': 'None',
 '58': 'None',
 '59': 'None',
 '60': 'None',
 '61': 'None',
 '62': 'None',
 '63': 'None',
 '64': 'None',
 '65': 'None',
 '66': 'None',
 '67': 'None',
 '68

## Top 20 categories (by number of newslines)

In [32]:

news_categories.sort_values(by='Newslines', ascending=False, inplace=True)
news_categories.head(20)

Unnamed: 0,Name,Type,Newslines
296,pl_usa,Places,19594
35,to_earn,Topics,6810
0,to_acq,Topics,3812
293,pl_uk,Places,2538
166,pl_canada,Places,1750
219,pl_japan,Places,1734
73,to_money-fx,Topics,1246
45,to_grain,Topics,1066
126,to_trade,Topics,890
28,to_crude,Topics,870


## Tokenize newsline documents

In [35]:
# Load stop-words
stop_words = set(stopwords.words('english'))

# Initialize tokenizer
# It's also possible to try with a stemmer or to mix a stemmer and a lemmatizer
tokenizer = RegexpTokenizer('[\'a-zA-Z]+')

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Tokenized document collection
newsline_documents = []

In [40]:
def tokenize(document):
    words = []

    for sentence in sent_tokenize(document):
        tokens = [lemmatizer.lemmatize(t.lower()) for t in tokenizer.tokenize(sentence) if t.lower() not in stop_words]
        words += tokens

    return words

# Tokenize
for key in document_X.keys():
    newsline_documents.append(tokenize(document_X[key]))

number_of_documents = len(document_X)

## Word2Vec Model
### See: https://radimrehurek.com/gensim/models/word2vec.html and https://code.google.com/p/word2vec/ for more information

In [41]:
# Load an existing Word2Vec model
w2v_model = Word2Vec.load(data_folder + 'reuters.word2vec')

FileNotFoundError: [Errno 2] No such file or directory: 'D:\\swcontest\\elmo\\Reuters-21578-Classification-master\\reuters21578\\reuters.word2vec'

In [42]:
# Create new Gensim Word2Vec model
w2v_model = Word2Vec(newsline_documents, size=num_features, min_count=1, window=10, workers=cpu_count())
w2v_model.init_sims(replace=True)
w2v_model.save(data_folder + 'reuters.word2vec')

## Vectorize each document

In [43]:
num_categories = len(selected_categories)
X = zeros(shape=(number_of_documents, document_max_num_words, num_features)).astype(float32)
Y = zeros(shape=(number_of_documents, num_categories)).astype(float32)

empty_word = zeros(num_features).astype(float32)

for idx, document in enumerate(newsline_documents):
    for jdx, word in enumerate(document):
        if jdx == document_max_num_words:
            break
            
        else:
            if word in w2v_model:
                X[idx, jdx, :] = w2v_model[word]
            else:
                X[idx, jdx, :] = empty_word

for idx, key in enumerate(document_Y.keys()):
    Y[idx, :] = document_Y[key]

  del sys.path[0]
  


## Split training and test sets

In [44]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)

## Create Keras model

In [45]:
model = Sequential()

model.add(LSTM(int(document_max_num_words*1.5), input_shape=(document_max_num_words, num_features)))
model.add(Dropout(0.3))
model.add(Dense(num_categories))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

## Train and evaluate model

In [46]:
# Train model
model.fit(X_train, Y_train, batch_size=128, nb_epoch=5, validation_data=(X_test, Y_test))

# Evaluate model
score, acc = model.evaluate(X_test, Y_test, batch_size=128)
    
print('Score: %1.4f' % score)
print('Accuracy: %1.4f' % acc)

  


Train on 11900 samples, validate on 5100 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score: 0.6823
Accuracy: 0.5739


In [48]:
X_test.shape


(5100, 100, 500)

In [49]:
model.save('D:/my_model.h6')