In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from string import punctuation
from collections import Counter
import re

from goose import Goose
import matplotlib

%matplotlib inline

from sklearn.feature_extraction.text import TfidfVectorizer

pd.options.mode.chained_assignment = None

In [2]:
porter = PorterStemmer()
wnl = WordNetLemmatizer() 

In [3]:
# taken from http://ahmedbesbes.com/how-to-mine-newsfeed-data-and-extract-interactive-insights-in-python.html
stop = set(stopwords.words('english'))
def tokenizer(text):

    tokens_ = [word_tokenize(sent) for sent in sent_tokenize(text)]

    tokens = []
    for token_by_sent in tokens_:
        tokens += token_by_sent

    tokens = list(filter(lambda t: t.lower() not in stop, tokens))
    tokens = list(filter(lambda t: t not in punctuation, tokens))
    tokens = list(filter(lambda t: t not in [u"'s", u"n't", u"...", u"''", u'``', 
                                        u'\u2014', u'\u2026', u'\u2013'], tokens))
    filtered_tokens = []
    for token in tokens:
        token = wnl.lemmatize(token)
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)

    filtered_tokens = list(map(lambda token: token.lower(), filtered_tokens))

    return filtered_tokens

In [4]:
def get_keywords(tokens):
    return Counter(tokens).most_common(5)

### get the text

In [5]:
url = 'https://medium.com/airbnb-engineering/automated-machine-learning-a-paradigm-shift-that-accelerates-data-scientist-productivity-airbnb-f1f8a10d61f8?utm_content=bufferca661&amp;utm_medium=social&amp;utm_source=twitter.com&amp;utm_campaign=buffer'
g = Goose()
article = g.extract(url=url)

### tokenize text

In [6]:
document = tokenizer(article.cleaned_text)

### get top 5 keywords (based on count)

In [7]:
top_5 = get_keywords(document)
print top_5

[(u'aml', 17), (u'model', 17), (u'data', 11), (u'learning', 7), (u'problem', 6)]


### text processing

In [11]:
vect = TfidfVectorizer(min_df=1, max_features=5, tokenizer=tokenizer, ngram_range=(1, 2))
vz = vect.fit_transform(list(document))

In [12]:
tfidf = dict(zip(vect.get_feature_names(), vect.idf_))
tfidf = pd.DataFrame(columns=['tfidf']).from_dict(dict(tfidf), orient='index')
tfidf.columns = ['tfidf']

In [13]:
tfidf.sort_values(by=['tfidf'], ascending=False).head(5)

Unnamed: 0,tfidf
problem,5.252569
learning,5.119037
data,4.713572
model,4.308107
aml,4.308107
