In [77]:
import nltk
import random
from nltk.corpus import reuters
from tqdm.auto import tqdm, trange

In [78]:
categories = list(reuters.categories())

print(categories)

['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', 'dmk', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-oil', 'heat', 'hog', 'housing', 'income', 'instal-debt', 'interest', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-oil', 'livestock', 'lumber', 'meal-feed', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-oil', 'palmkernel', 'pet-chem', 'platinum', 'potato', 'propane', 'rand', 'rape-oil', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'rye', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'sun-meal', 'sun-oil', 'sunseed', 'tea', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc']


In [79]:
#Put all data to one list
documents = []

for category in tqdm(reuters.categories()):
    for fileid in reuters.fileids(category):
        documents.append([list(reuters.words(fileid)), category, fileid])

random.shuffle(documents)

print(documents[1])

  0%|          | 0/90 [00:00<?, ?it/s]

[['PAYCHEX', 'INC', '&', 'lt', ';', 'PAYX', '>', '3RD', 'QTR', 'FEB', '28', 'NET', 'Shr', '13', 'cts', 'vs', '10', 'cts', 'Net', '1', ',', '109', ',', '000', 'vs', '875', ',', '000', 'Revs', '16', '.', '6', 'mln', 'vs', '13', '.', '2', 'mln', 'Nine', 'mths', 'Shr', '44', 'cts', 'vs', '33', 'cts', 'Net', '3', ',', '770', ',', '000', 'vs', '2', ',', '851', ',', '000', 'Revs', '46', '.', '9', 'mln', 'vs', '36', '.', '9', 'mln', 'NOTE', ':', 'Share', 'adjusted', 'for', 'three', '-', 'for', '-', 'two', 'stock', 'split', 'in', 'June', '1986', '.'], 'earn', 'training/8292']


# Preparing texts

In [94]:
import string


def remove_punctuation(text):
    return [ch for ch in text if ch[0] not in string.punctuation]


def remove_numbers(text):
    return [i for i in text if not i[0].isdigit()]


def get_lower(text):
    return [word.lower() for word in text]

In [95]:
#Remove punctuation and numbers from lower case text
prep_text = []

for document in tqdm(documents):
    prep_text.append([get_lower(remove_numbers(remove_punctuation(document[0]))), document[1], document[2]])

  0%|          | 0/13328 [00:00<?, ?it/s]

In [96]:
print(prep_text[1])

[['paychex', 'inc', 'lt', 'payx', 'qtr', 'feb', 'net', 'shr', 'cts', 'vs', 'cts', 'net', 'vs', 'revs', 'mln', 'vs', 'mln', 'nine', 'mths', 'shr', 'cts', 'vs', 'cts', 'net', 'vs', 'revs', 'mln', 'vs', 'mln', 'note', 'share', 'adjusted', 'for', 'three', 'for', 'two', 'stock', 'split', 'in', 'june'], 'earn', 'training/8292']


# Stemming

In [98]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()


def stemming_text(text):
    return [ps.stem(word) for word in text]

In [99]:
#Text stemming
stemmed_texts = []

for text in tqdm(prep_text):
    stemmed_texts.append([stemming_text(text[0]), text[1], text[2]])

  0%|          | 0/13328 [00:00<?, ?it/s]

In [100]:
print(stemmed_texts[1])

[['paychex', 'inc', 'lt', 'payx', 'qtr', 'feb', 'net', 'shr', 'ct', 'vs', 'ct', 'net', 'vs', 'rev', 'mln', 'vs', 'mln', 'nine', 'mth', 'shr', 'ct', 'vs', 'ct', 'net', 'vs', 'rev', 'mln', 'vs', 'mln', 'note', 'share', 'adjust', 'for', 'three', 'for', 'two', 'stock', 'split', 'in', 'june'], 'earn', 'training/8292']


In [101]:
from nltk.corpus import stopwords


def remove_stopwords(text):
    return [word for word in text if word not in stopwords.words('english')]

In [102]:
#Remove stopwords
without_stopwords = []

for text in tqdm(stemmed_texts):
    without_stopwords.append([remove_stopwords(text[0]), text[1], text[2]])

  0%|          | 0/13328 [00:00<?, ?it/s]

In [103]:
print(without_stopwords[1])

[['paychex', 'inc', 'lt', 'payx', 'qtr', 'feb', 'net', 'shr', 'ct', 'vs', 'ct', 'net', 'vs', 'rev', 'mln', 'vs', 'mln', 'nine', 'mth', 'shr', 'ct', 'vs', 'ct', 'net', 'vs', 'rev', 'mln', 'vs', 'mln', 'note', 'share', 'adjust', 'three', 'two', 'stock', 'split', 'june'], 'earn', 'training/8292']


# Preparing data for learning

In [105]:
X_train = [" ".join(text[0]) for text in without_stopwords if 'training' in text[2]]
X_test = [" ".join(text[0]) for text in without_stopwords if 'test' in text[2]]
y_train = [topic[1] for topic in without_stopwords if 'training' in topic[2]]
y_test = [topic[1] for topic in without_stopwords if 'test' in topic[2]]

# Naive Bayes Classifier

In [107]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [108]:
nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])

In [109]:
%%time
nb.fit(X_train, y_train)

Wall time: 810 ms


Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])

In [110]:
%%time
from sklearn.metrics import classification_report
y_pred = nb.predict(X_test)

print(X_test[2], y_test[2], y_pred[2])

cellular inc lt cel sell unit take gain cellular inc said reach definit agreement sell asset wholli michigan cellular inc centuri telephon enterpris inc lt ctl add ct share year earn result said sale subject regulatori approv repres capit gain excess dlr origin price paid cellular cellular interest michigan acquir decemb acq acq
Wall time: 257 ms


In [112]:
from sklearn.metrics import accuracy_score

print('accuracy = %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy = 0.6047008547008547
                 precision    recall  f1-score   support

            acq       0.66      0.94      0.77       719
           alum       0.00      0.00      0.00        23
         barley       0.00      0.00      0.00        14
            bop       0.00      0.00      0.00        30
        carcass       0.00      0.00      0.00        18
     castor-oil       0.00      0.00      0.00         1
          cocoa       0.00      0.00      0.00        18
        coconut       0.00      0.00      0.00         2
    coconut-oil       0.00      0.00      0.00         3
         coffee       1.00      0.18      0.30        28
         copper       0.00      0.00      0.00        18
     copra-cake       0.00      0.00      0.00         1
           corn       0.00      0.00      0.00        56
         cotton       0.00      0.00      0.00        20
     cotton-oil       0.00      0.00      0.00         2
            cpi       0.00      0.00      0.00        28


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Linear Support Vector Machine

In [114]:
from sklearn.linear_model import SGDClassifier

sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])

In [115]:
%%time
sgd.fit(X_train, y_train)

Wall time: 1.37 s


Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf',
                 SGDClassifier(alpha=0.001, max_iter=5, random_state=42,
                               tol=None))])

In [116]:
%%time
y_pred = sgd.predict(X_test)

print('accuracy = %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred))

accuracy = 0.7013888888888888
                 precision    recall  f1-score   support

            acq       0.73      0.99      0.84       719
           alum       0.43      0.43      0.43        23
         barley       0.50      0.07      0.12        14
            bop       0.43      0.20      0.27        30
        carcass       0.60      0.17      0.26        18
     castor-oil       0.00      0.00      0.00         1
          cocoa       0.54      0.78      0.64        18
        coconut       0.17      0.50      0.25         2
    coconut-oil       0.00      0.00      0.00         3
         coffee       0.52      1.00      0.68        28
         copper       1.00      0.50      0.67        18
     copra-cake       0.00      0.00      0.00         1
           corn       0.37      0.12      0.19        56
         cotton       0.48      0.55      0.51        20
     cotton-oil       0.00      0.00      0.00         2
            cpi       0.52      0.46      0.49        28


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Lets check

In [147]:
from nltk.tokenize import word_tokenize

gold_text = '''
The World Gold Council, the market development organisation for the gold industry, 
announces that Equinox Gold Corp. has joined its Board of Members. 

Equinox Gold is a Canadian mining company with seven operating gold mines and 
construction underway at an eighth site. Equinox Gold operates entirely in the Americas, 
with two properties in the United States, one in Mexico and five in Brazil. 
Equinox Gold has a multimillion ounce gold reserve base and a pipeline of development and 
expansion projects for organic gold production growth. The Company’s proposed
acquisition of Premier Gold Mines will bring further asset and country diversification 
with the addition of a producing mine in Mexico and a development-stage project in Canada.

Randy Smallwood, Chair of the World Gold Council, commented: 
“I am thrilled to welcome Equinox Gold to the World Gold Council. 
They play an important role in the production and exploration of gold in the Americas 
and have exciting future growth plans. I have known the company and management team for 
many years and am delighted that they will bring their experience and expertise to support 
the mission of the World Gold Council.”

Christian Milau, Chief Executive Officer, Equinox Gold, said: 
“We are very pleased to join the World Gold Council. Our values are aligned with 
the World’s Gold Council’s commitments to raising standards and building trust 
and transparency in the industry. We look forward to engaging with the Council’s 
members to share best practices and work towards achieving our shared goals.” 
'''

gold_text = ' '.join(remove_stopwords(stemming_text(get_lower(remove_numbers(remove_punctuation(word_tokenize(gold_text)))))))

In [148]:
#Predict for news about gold
ect_pred = sgd.predict([gold_text])
print(ect_pred)

['gold']


In [153]:
coffee_text = '''
That morning cup of java may be providing a lot more benefits than just giving you the energy
to start your day. Numerous studies have shown that daily consumption of coffee can help you 
to live a longer, healthier life.

You may have heard of some of coffee's many health benefits, but there may also be a few 
that you weren't aware of.

Last year, the Harvard Gazette reviewed a number of studies and discovered "an emerging 
picture of coffee as a potentially powerful elixir" against a range of ailments, 
from cancer to cavities.
'''

coffee_text = ' '.join(remove_stopwords(stemming_text(get_lower(remove_numbers(remove_punctuation(word_tokenize(coffee_text)))))))

In [154]:
#Predict for news about coffee
ect_pred_2 = sgd.predict([coffee_text])
print(ect_pred_2)

['coffee']


# Conclusion

From Linear Support Vector Machine algorithm we have more accuracity then Naive Bayes Classifier algorithm, but accuracity still not too much (70 %). So if we take more data with better quality we can predict it better.