In [None]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

from string import punctuation
from collections import Counter
import re

from goose import Goose
import matplotlib

%matplotlib inline



# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole session.
# NOTE(review): no cell visible here performs chained assignment - confirm this is still needed.
pd.options.mode.chained_assignment = None

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
# English stopwords (articles, prepositions, ...) kept in a set for membership tests
stop = {word for word in stopwords.words('english')}

In [4]:
def tokenizer(text):
    """Turn raw text into a flat list of lower-cased keyword candidates.

    The text is split into sentences and then into word tokens with NLTK.
    A token is kept only if it is not an English stopword (compared
    case-insensitively against the module-level ``stop`` set), is not
    punctuation, is not one of the leftover fragments listed in ``ignored``
    and contains at least one ASCII letter.

    Args:
        text: the document to tokenize. ``None`` and the empty string are
            accepted and yield no tokens.

    Returns:
        A list of lower-cased tokens in document order. The list is empty
        when ``text`` is empty or when tokenization fails, so callers such
        as ``TfidfVectorizer`` always receive a list.
    """
    # Nothing to tokenize - answer before touching NLTK at all.
    if not text:
        return []

    # Fragments produced by word_tokenize (clitics, quote marks, dashes,
    # ellipses) that should never count as keywords.
    ignored = (u"'s", u"n't", u"...", u"''", u'``',
               u'\u2014', u'\u2026', u'\u2013')

    try:
        tokens = [token
                  for sent in sent_tokenize(text)
                  for token in word_tokenize(sent)]
    except Exception as e:
        # The original handler named an undefined class (`Error`), so it
        # raised NameError instead of reporting the problem. Keep the
        # intended best-effort behaviour: report and return no tokens.
        print(e)
        return []

    # Single pass applying the same checks, in the same order, as before.
    return [token.lower()
            for token in tokens
            if token.lower() not in stop
            and token not in punctuation
            and token not in ignored
            and re.search('[a-zA-Z]', token)]

In [10]:
def get_keywords(tokens, n=5):
    """Return the ``n`` most frequent tokens together with their counts.

    Args:
        tokens: iterable of hashable tokens, e.g. the list produced by
            ``tokenizer``.
        n: how many keywords to return. Defaults to 5, the value that was
            previously hard-coded.

    Returns:
        A list of ``(token, count)`` tuples ordered from most to least
        frequent. It is shorter than ``n`` when there are fewer distinct
        tokens, and empty for empty input.
    """
    return Counter(tokens).most_common(n)

### get the text

In [6]:
# Article to analyse: a CNN report on the Occupy London eviction appeal.
url = 'http://edition.cnn.com/2012/02/22/world/europe/uk-occupy-london/index.html?hpt=ieu_c2'
# Hand the URL to Goose; presumably this fetches the page over the network (verify).
# The result's `cleaned_text` attribute is what the following cells work with.
g = Goose()
article = g.extract(url=url)

In [7]:
# Show the text extracted for the article as a sanity check before tokenizing.
article.cleaned_text

u'Occupy London protesters who have been camped outside the landmark St. Paul\'s Cathedral for the past four months lost their court bid to avoid eviction Wednesday in a decision made by London\'s Court of Appeal.\n\nLast month, the High Court ruled in favor of the City of London Corporation, the body that runs London\'s financial district, which applied for an eviction order after the protesters failed to abide by a previous order to pack up the camp.\n\nThe Occupy London group then filed an appeal against the ruling at the Court of Appeal.\n\nWednesday, the three judges of the court refused them the permission to appeal.\n\nMichael Paget, the lawyer representing the Occupy London group, said the group doesn\'t intend to file another legal challenge.\n\n"The occupy message has been heard and will continue to be heard. It has made a difference and will continue to make a difference."\n\nThe activists, who set up camp outside the cathedral October 15, had been protesting against corpora

### tokenize text

In [8]:
# Flat list of lower-cased tokens for the whole article (stopwords and punctuation removed).
document = tokenizer(article.cleaned_text)

### get top 5 keywords (based on count)

In [11]:
# Most frequent tokens in the article, as (token, count) pairs.
top_5 = get_keywords(document)
# Parenthesised so the cell parses under both Python 2 and Python 3.
print(top_5)

[(u'london', 7), (u'occupy', 5), (u'court', 5), (u'appeal', 4), (u'paul', 3)]


### text processing

In [14]:
# TfidfVectorizer settings:
#   min_df       - a term must appear in at least this many documents to be kept
#   max_features - upper bound on the number of distinct terms kept across all documents
#   tokenizer    - the tokenizer() defined above is used to split each input document
#   ngram_range  - (1, 2): features are single tokens and pairs of adjacent tokens

vectorizer = TfidfVectorizer(
    min_df=1,
    max_features=10000,
    tokenizer=tokenizer,
    ngram_range=(1, 2)
)


In [15]:
# Fit the vectorizer and build the document-term matrix.
# NOTE(review): `document` is the flat token list returned by tokenizer(), so each
# individual token is passed to the vectorizer as its own "document". With one token
# per document, the bigram half of ngram_range=(1, 2) presumably never produces a
# feature. Confirm this is intended; per-sentence or per-article input may have been meant.
vz = vectorizer.fit_transform(list(document))

In [18]:
# One row per vocabulary term, indexed by the term itself.
# Note: the values come from `vectorizer.idf_`, i.e. they are the fitted IDF weights;
# the column keeps the name 'tfidf' because that is what the plotting cell reads.
# `from_dict` is a classmethod, so it is called on the class directly instead of on a
# throwaway empty DataFrame, and `tfidf` is only ever bound to the final frame.
tfidf = pd.DataFrame.from_dict(
    dict(zip(vectorizer.get_feature_names(), vectorizer.idf_)),
    orient='index'
)
tfidf.columns = ['tfidf']

In [19]:
# Distribution of the per-term weights stored in the 'tfidf' column.
tfidf['tfidf'].hist(bins=50, figsize=(15, 7))

ImportError: No module named matplotlib.pyplot