# Reuters-21578 collection

Understanding the collection is an important first step in any data science task. Let's have a quick look at the Reuters collection.

In [1]:
from nltk.corpus import reuters

In [2]:
# list of document ids
documents = reuters.fileids()
print("Documents: {}".format(len(documents)))

# Train documents
train_docs_id = list(filter(lambda doc: doc.startswith("train"),documents))
print("Total train documents: {}".format(len(train_docs_id)))

# Test documents
test_docs_id = list(filter(lambda doc: doc.startswith("test"),documents))
print("Total test documents: {}".format(len(test_docs_id)))

Documents: 10788
Total train documents: 7769
Total test documents: 3019


In [3]:
doc = 'training/9865'

print(reuters.raw(doc))
print()

print(reuters.categories(doc))

FRENCH FREE MARKET CEREAL EXPORT BIDS DETAILED
  French operators have requested licences
  to export 675,500 tonnes of maize, 245,000 tonnes of barley,
  22,000 tonnes of soft bread wheat and 20,000 tonnes of feed
  wheat at today's European Community tender, traders said.
      Rebates requested ranged from 127.75 to 132.50 European
  Currency Units a tonne for maize, 136.00 to 141.00 Ecus a tonne
  for barley and 134.25 to 141.81 Ecus for bread wheat, while
  rebates requested for feed wheat were 137.65 Ecus, they said.
  



['barley', 'corn', 'grain', 'wheat']


In [4]:
from operator import itemgetter
from pprint import pprint

# List of categories
categories = reuters.categories()
print("Number of categories: {}".format(len(categories)))
print()

print(categories)
print()

# Documents per category
category_distributionn = [(category,len(reuters.fileids(category))) for category in categories]
category_distributionn = sorted(category_distributionn,key=itemgetter(1),reverse=True)

print("Most common categories")
pprint(category_distributionn[:5])
print()

print("Last common categories")
print(category_distributionn[-5:])

Number of categories: 90

['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', 'dmk', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-oil', 'heat', 'hog', 'housing', 'income', 'instal-debt', 'interest', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-oil', 'livestock', 'lumber', 'meal-feed', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-oil', 'palmkernel', 'pet-chem', 'platinum', 'potato', 'propane', 'rand', 'rape-oil', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'rye', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'sun-meal', 'sun-oil', 'sunseed', 'tea', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc']

Most common categories
[('earn', 3964),
 ('acq', 2369),
 ('mone

In [5]:
from nltk.corpus import stopwords
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier

stop_words = stopwords.words('english')

train_docs_id = list(filter(lambda doc: doc.startswith("train"),documents))
test_docs_id = list(filter(lambda doc: doc.startswith("test"),documents))

train_docs = [reuters.raw(doc_id) for doc_id in train_docs_id]
test_docs = [reuters.raw(doc_id) for doc_id in test_docs_id]

# Tokenization
vectorizer = TfidfVectorizer(stop_words=stop_words)
vectorised_train_documents = vectorizer.fit_transform(train_docs)
vectorised_test_documents = vectorizer.transform(test_docs)

# Transform multilabels labels
mlb = MultiLabelBinarizer()
train_labels = mlb.fit_transform([reuters.categories(doc_id) for doc_id in train_docs_id])
test_labels = mlb.transform([reuters.categories(doc_id) for doc_id in test_docs_id])

# Classifier
classifier = OneVsRestClassifier(LinearSVC(random_state=42))
classifier.fit(vectorised_train_documents,train_labels)

predictions = classifier.predict(vectorised_test_documents)
#
print("Number of labels assigned: {}".format(sum([sum(prediction) for prediction in predictions])))

Number of labels assigned: 3126


In [6]:
from sklearn.metrics import f1_score,precision_score,recall_score

precision = precision_score(test_labels,predictions,average='micro')
recall = recall_score(test_labels,predictions,average='micro')
f1 = f1_score(test_labels,predictions,average='micro')
print('Micro-average quality numbers')
print("Precision: {:.4f},Recall: {:.4f},F1-measure: {:.4f}".format(precision,recall,f1))

precision = precision_score(test_labels, predictions, average='macro')
recall = recall_score(test_labels, predictions, average='macro')
f1 = f1_score(test_labels, predictions, average='macro')
print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, 
                                                                     recall, 
                                                                     f1))

Micro-average quality numbers
Precision: 0.9517,Recall: 0.7946,F1-measure: 0.8661
Macro-average quality numbers
Precision: 0.6305, Recall: 0.3715, F1-measure: 0.4451


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
