In [1]:
from bs4 import BeautifulSoup
import urllib
import nltk
from nltk import FreqDist
from nltk.corpus import stopwords
import re

# Crawl articles

In [6]:
url = 'http://www-03.ibm.com/press/us/en/pressreleases/recent.wss'
html = urllib.urlopen(url).read()
soup = BeautifulSoup(html,'lxml')
print soup.prettify()

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html lang="en-US" xml:lang="en-US" xmlns="http://www.w3.org/1999/xhtml">
 <head>
  <meta content="text/html;charset=utf-8" http-equiv="Content-Type"/>
  <meta content='(pics-1.1 "http://www.icra.org/ratingsv02.html" l gen true r (cz 1 lz 1 nz 1 oz 1 vz 1) "http://www.rsac.org/ratingsv01.html" l gen true r (n 0 s 0 v 0 l 0) "http://www.classify.org/safesurf/" l gen true r (ss~~000 1))' http-equiv="PICS-Label"/>
  <meta content="US" name="IBM.Country"/>
  <meta content="sid=1002" name="IBM.PageAttributes"/>
  <meta content="2005-12-03" name="DC.Date" scheme="iso8601"/>
  <meta content="v17 Template Generator, Template 17.02" name="Source"/>
  <meta content="http://www-03.ibm.com/press/img/ibmpos_blu_feed.jpg" property="og:image"/>
  <meta content="IBM Press Room - IBM Press room - Press releases" name="Abstract"/>
  <meta content="IBM Press Room - IBM Press room - Press releases

In [12]:
articles = []
for href in soup.findAll(href=re.compile('[0-9]+\.wss')):
    articles.append(href.get('href'))

In [13]:
articles[0]

'/press/us/en/pressrelease/49018.wss'

In [36]:
for article in articles:
    article_url = 'http://www-03.ibm.com%s' % article
    html = urllib.urlopen(article_url).read()
    soup = BeautifulSoup(html,'lxml')
    text = soup.find('div',class_='ibm-container-body').text
    text = text.replace('\t','')
    text = text.replace('\n','')
    name  = re.findall('[0-9]+',article)[0]
    f = open('Articles/%s.txt' % name,'w')
    f.write(text.encode('utf-8'))
    f.close()

# Load articles

In [2]:
import glob
import json

In [3]:
text = ""
for txt in glob.glob('Articles/*.txt'):
    text+=json.dumps(open(txt,'r').read())

In [4]:
tokens = nltk.word_tokenize(text)
text = nltk.Text(tokens)
content = [w for w in text if w.isalpha()]

# Simple bag of words

In [5]:
bag_of_words_simple = FreqDist(content)
rank = sorted(bag_of_words_simple.items(),key= lambda s:s[1],reverse=True)
print rank[:20]

[('and', 624), ('the', 457), ('to', 391), ('of', 302), ('IBM', 234), ('a', 211), ('in', 209), ('for', 141), ('with', 114), ('is', 107), ('on', 97), ('will', 82), ('that', 81), ('more', 80), ('as', 77), ('data', 74), ('new', 71), ('The', 69), ('Watson', 67), ('their', 62)]


# bag-of-words with stemming and stop word removal 

In [6]:
stopwords = nltk.corpus.stopwords.words('english')
porter = nltk.PorterStemmer()
content_nostop_stem = [porter.stem(w) for w in text if w.lower() not in stopwords if w.isalpha()]
bag_of_words_nostop_stem = FreqDist(content_nostop_stem)
rank = sorted(bag_of_words_nostop_stem.items(),key= lambda s:s[1],reverse=True)
print rank[:20]

[(u'IBM', 234), (u'data', 74), (u'new', 71), (u'Watson', 67), (u'servic', 66), (u'cloud', 63), (u'busi', 55), (u'provid', 54), (u'develop', 49), (u'help', 48), (u'health', 48), (u'Cloud', 46), (u'platform', 44), (u'custom', 42), (u'digit', 41), (u'technolog', 41), (u'market', 39), (u'inform', 39), (u'solut', 39), (u'offer', 38)]


In [7]:
type(text)

nltk.text.Text

# POS (Part of Speech) and NNP approach

In [8]:
POS = nltk.pos_tag(tokens)
tags = [(word,tag) for (word,tag) in POS if tag.startswith('N')]
tag_fd = FreqDist(tags)
rank = sorted(tag_fd.items(),key= lambda s:s[1],reverse=True)
print rank[:20]

[(('IBM', 'NNP'), 234), (('Watson', 'NNP'), 67), (('data', 'NNS'), 67), (('services', 'NNS'), 55), (('cloud', 'NN'), 53), (('health', 'NN'), 48), (('Cloud', 'NNP'), 46), (('business', 'NN'), 39), (('information', 'NN'), 37), (('platform', 'NN'), 36), (('\\u201d', 'NNP'), 31), (('technology', 'NN'), 30), (('clients', 'NNS'), 29), (('analytics', 'NNS'), 27), (('insights', 'NNS'), 27), (('visit', 'NN'), 26), (('customers', 'NNS'), 24), (('today', 'NN'), 23), (('company', 'NN'), 23), (('NYSE', 'NNP'), 21)]
