In [2]:
from nltk.corpus import reuters
from nltk.tokenize import sent_tokenize, word_tokenize

# Download the NLTK data packages this notebook needs (each call is a no-op
# when the package is already up to date locally):
#   - "reuters":   the news corpus explored below
#   - "punkt":     sentence-tokenizer models used by older NLTK releases
#   - "punkt_tab": the replacement for "punkt" that newer NLTK releases load
#                  inside sent_tokenize / word_tokenize
# Requesting both tokenizer packages keeps "Restart Kernel -> Run All" working
# regardless of which NLTK version is installed.
import nltk
nltk.download("reuters")
nltk.download("punkt")
nltk.download("punkt_tab")

[nltk_data] Downloading package reuters to /home/camkirk/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /home/camkirk/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## The NLTK Reuters corpus

In [3]:
# The Reuters corpus includes over 10,000 news articles, many of which are about financial markets.
# These articles are tagged by topic, or category. Print the full list of category
# labels so we can pick one to explore (e.g. 'crude', used in the next cell).
print(reuters.categories())

['acq', 'alum', 'barley', 'bop', 'carcass', 'castor-oil', 'cocoa', 'coconut', 'coconut-oil', 'coffee', 'copper', 'copra-cake', 'corn', 'cotton', 'cotton-oil', 'cpi', 'cpu', 'crude', 'dfl', 'dlr', 'dmk', 'earn', 'fuel', 'gas', 'gnp', 'gold', 'grain', 'groundnut', 'groundnut-oil', 'heat', 'hog', 'housing', 'income', 'instal-debt', 'interest', 'ipi', 'iron-steel', 'jet', 'jobs', 'l-cattle', 'lead', 'lei', 'lin-oil', 'livestock', 'lumber', 'meal-feed', 'money-fx', 'money-supply', 'naphtha', 'nat-gas', 'nickel', 'nkr', 'nzdlr', 'oat', 'oilseed', 'orange', 'palladium', 'palm-oil', 'palmkernel', 'pet-chem', 'platinum', 'potato', 'propane', 'rand', 'rape-oil', 'rapeseed', 'reserves', 'retail', 'rice', 'rubber', 'rye', 'ship', 'silver', 'sorghum', 'soy-meal', 'soy-oil', 'soybean', 'strategic-metal', 'sugar', 'sun-meal', 'sun-oil', 'sunseed', 'tea', 'tin', 'trade', 'veg-oil', 'wheat', 'wpi', 'yen', 'zinc']


In [4]:
# We'll find the first article about crude oil: list the file ids of every
# article tagged 'crude' and take the first one. It is the cell's last
# expression, so the notebook displays it as the cell output.
reuters.fileids(categories="crude")[0]

'test/14829'

In [9]:
# Load the raw text of the first crude-oil article and show it.
# The file id is looked up from the corpus instead of being hard-coded, so this
# cell cannot drift out of sync with the category lookup in the previous cell.
crude_fileid = reuters.fileids(categories="crude")[0]
article = reuters.raw(crude_fileid)
print(article)

JAPAN TO REVISE LONG-TERM ENERGY DEMAND DOWNWARDS
  The Ministry of International Trade and
  Industry (MITI) will revise its long-term energy supply/demand
  outlook by August to meet a forecast downtrend in Japanese
  energy demand, ministry officials said.
      MITI is expected to lower the projection for primary energy
  supplies in the year 2000 to 550 mln kilolitres (kl) from 600
  mln, they said.
      The decision follows the emergence of structural changes in
  Japanese industry following the rise in the value of the yen
  and a decline in domestic electric power demand.
      MITI is planning to work out a revised energy supply/demand
  outlook through deliberations of committee meetings of the
  Agency of Natural Resources and Energy, the officials said.
      They said MITI will also review the breakdown of energy
  supply sources, including oil, nuclear, coal and natural gas.
      Nuclear energy provided the bulk of Japan's electric power
  in the fiscal year ended March

## Tokenizing with string splits

In [10]:
# Simple sentence tokenizing with string split.
# Naive baseline: treat every '.' as a sentence boundary. As the output shows,
# the periods themselves are dropped and each piece keeps the newline and
# indentation that followed the previous sentence. Any '.' that is not a
# sentence end (e.g. inside an abbreviation or a decimal) would also split.
article.split('.')

['JAPAN TO REVISE LONG-TERM ENERGY DEMAND DOWNWARDS\n  The Ministry of International Trade and\n  Industry (MITI) will revise its long-term energy supply/demand\n  outlook by August to meet a forecast downtrend in Japanese\n  energy demand, ministry officials said',
 '\n      MITI is expected to lower the projection for primary energy\n  supplies in the year 2000 to 550 mln kilolitres (kl) from 600\n  mln, they said',
 '\n      The decision follows the emergence of structural changes in\n  Japanese industry following the rise in the value of the yen\n  and a decline in domestic electric power demand',
 '\n      MITI is planning to work out a revised energy supply/demand\n  outlook through deliberations of committee meetings of the\n  Agency of Natural Resources and Energy, the officials said',
 '\n      They said MITI will also review the breakdown of energy\n  supply sources, including oil, nuclear, coal and natural gas',
 "\n      Nuclear energy provided the bulk of Japan's electric 

In [11]:
# Word tokenizing with string split.
# Naive baseline: split on a single space only. Newlines and punctuation stay
# glued to the words (e.g. 'DOWNWARDS\n', '(MITI)', 'demand,'), and each run of
# consecutive spaces produces empty-string tokens ('').
article.split(" ")

['JAPAN',
 'TO',
 'REVISE',
 'LONG-TERM',
 'ENERGY',
 'DEMAND',
 'DOWNWARDS\n',
 '',
 'The',
 'Ministry',
 'of',
 'International',
 'Trade',
 'and\n',
 '',
 'Industry',
 '(MITI)',
 'will',
 'revise',
 'its',
 'long-term',
 'energy',
 'supply/demand\n',
 '',
 'outlook',
 'by',
 'August',
 'to',
 'meet',
 'a',
 'forecast',
 'downtrend',
 'in',
 'Japanese\n',
 '',
 'energy',
 'demand,',
 'ministry',
 'officials',
 'said.\n',
 '',
 '',
 '',
 '',
 '',
 'MITI',
 'is',
 'expected',
 'to',
 'lower',
 'the',
 'projection',
 'for',
 'primary',
 'energy\n',
 '',
 'supplies',
 'in',
 'the',
 'year',
 '2000',
 'to',
 '550',
 'mln',
 'kilolitres',
 '(kl)',
 'from',
 '600\n',
 '',
 'mln,',
 'they',
 'said.\n',
 '',
 '',
 '',
 '',
 '',
 'The',
 'decision',
 'follows',
 'the',
 'emergence',
 'of',
 'structural',
 'changes',
 'in\n',
 '',
 'Japanese',
 'industry',
 'following',
 'the',
 'rise',
 'in',
 'the',
 'value',
 'of',
 'the',
 'yen\n',
 '',
 'and',
 'a',
 'decline',
 'in',
 'domestic',
 'electri

## NLTK tokenization

In [12]:
# Now using NLTK's sent_tokenize function.
# Compared with split('.') above, each sentence keeps its closing period and
# the whitespace between sentences is stripped; line breaks inside a sentence
# are preserved as-is.
sent_tokenize(article)

['JAPAN TO REVISE LONG-TERM ENERGY DEMAND DOWNWARDS\n  The Ministry of International Trade and\n  Industry (MITI) will revise its long-term energy supply/demand\n  outlook by August to meet a forecast downtrend in Japanese\n  energy demand, ministry officials said.',
 'MITI is expected to lower the projection for primary energy\n  supplies in the year 2000 to 550 mln kilolitres (kl) from 600\n  mln, they said.',
 'The decision follows the emergence of structural changes in\n  Japanese industry following the rise in the value of the yen\n  and a decline in domestic electric power demand.',
 'MITI is planning to work out a revised energy supply/demand\n  outlook through deliberations of committee meetings of the\n  Agency of Natural Resources and Energy, the officials said.',
 'They said MITI will also review the breakdown of energy\n  supply sources, including oil, nuclear, coal and natural gas.',
 "Nuclear energy provided the bulk of Japan's electric power\n  in the fiscal year ended M

In [13]:
# ...and NLTK's word_tokenize function.
# Compared with split(" ") above, punctuation becomes its own token
# ('(', ')', ',', '.') and no newline or empty-string tokens appear, while
# hyphenated and slash-joined words ('long-term', 'supply/demand') stay intact.
word_tokenize(article)

['JAPAN',
 'TO',
 'REVISE',
 'LONG-TERM',
 'ENERGY',
 'DEMAND',
 'DOWNWARDS',
 'The',
 'Ministry',
 'of',
 'International',
 'Trade',
 'and',
 'Industry',
 '(',
 'MITI',
 ')',
 'will',
 'revise',
 'its',
 'long-term',
 'energy',
 'supply/demand',
 'outlook',
 'by',
 'August',
 'to',
 'meet',
 'a',
 'forecast',
 'downtrend',
 'in',
 'Japanese',
 'energy',
 'demand',
 ',',
 'ministry',
 'officials',
 'said',
 '.',
 'MITI',
 'is',
 'expected',
 'to',
 'lower',
 'the',
 'projection',
 'for',
 'primary',
 'energy',
 'supplies',
 'in',
 'the',
 'year',
 '2000',
 'to',
 '550',
 'mln',
 'kilolitres',
 '(',
 'kl',
 ')',
 'from',
 '600',
 'mln',
 ',',
 'they',
 'said',
 '.',
 'The',
 'decision',
 'follows',
 'the',
 'emergence',
 'of',
 'structural',
 'changes',
 'in',
 'Japanese',
 'industry',
 'following',
 'the',
 'rise',
 'in',
 'the',
 'value',
 'of',
 'the',
 'yen',
 'and',
 'a',
 'decline',
 'in',
 'domestic',
 'electric',
 'power',
 'demand',
 '.',
 'MITI',
 'is',
 'planning',
 'to',
 'w