# Lesson 11: Natural Language Processing & Text Classification
## Starter code for guided practice & demos

In [1]:
# Imports
import json
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pathlib import Path
import seaborn as sns
%matplotlib inline

# Config: where the datasets live (relative to this notebook) and a fixed
# seed for NumPy's global random state.
DATA_DIR = Path('..') / 'datasets'
np.random.seed(seed=1)

In [2]:
# Import data
# The file is tab-separated; each value of its `boilerplate` column is a JSON document.
df = pd.read_csv(DATA_DIR / 'stumbleupon.tsv', sep='\t')

# Decode every JSON document once and reuse the result for both derived
# columns, rather than re-parsing the same string for each field.
boilerplate_docs = df['boilerplate'].map(json.loads)

# A key that is absent from a document yields an empty string.
df['title'] = boilerplate_docs.map(lambda doc: doc.get('title', ''))
df['body'] = boilerplate_docs.map(lambda doc: doc.get('body', ''))
df.head()

Unnamed: 0,url,urlid,boilerplate,alchemy_category,alchemy_category_score,avglinksize,commonlinkratio_1,commonlinkratio_2,commonlinkratio_3,commonlinkratio_4,...,linkwordscore,news_front_page,non_markup_alphanum_characters,numberOfLinks,numwords_in_url,parametrizedLinkRatio,spelling_errors_ratio,label,title,body
0,http://www.bloomberg.com/news/2010-12-23/ibm-p...,4042,"{""title"":""IBM Sees Holographic Calls Air Breat...",business,0.789131,2.055556,0.676471,0.205882,0.047059,0.023529,...,24,0,5424,170,8,0.152941,0.07913,0,IBM Sees Holographic Calls Air Breathing Batte...,A sign stands outside the International Busine...
1,http://www.popsci.com/technology/article/2012-...,8471,"{""title"":""The Fully Electronic Futuristic Star...",recreation,0.574147,3.677966,0.508021,0.28877,0.213904,0.144385,...,40,0,4973,187,9,0.181818,0.125448,1,The Fully Electronic Futuristic Starting Gun T...,And that can be carried on a plane without the...
2,http://www.menshealth.com/health/flu-fighting-...,1164,"{""title"":""Fruits that Fight the Flu fruits tha...",health,0.996526,2.382883,0.562016,0.321705,0.120155,0.042636,...,55,0,2240,258,11,0.166667,0.057613,1,Fruits that Fight the Flu fruits that fight th...,Apples The most popular source of antioxidants...
3,http://www.dumblittleman.com/2007/12/10-foolpr...,6684,"{""title"":""10 Foolproof Tips for Better Sleep ""...",health,0.801248,1.543103,0.4,0.1,0.016667,0.0,...,24,0,2737,120,5,0.041667,0.100858,1,10 Foolproof Tips for Better Sleep,There was a period in my life when I had a lot...
4,http://bleacherreport.com/articles/1205138-the...,9006,"{""title"":""The 50 Coolest Jerseys You Didn t Kn...",sports,0.719157,2.676471,0.5,0.222222,0.123457,0.04321,...,14,0,12032,162,10,0.098765,0.082569,0,The 50 Coolest Jerseys You Didn t Know Existed...,Jersey sales is a curious business Whether you...


## Demo: "Natural language processing with spacy"
Let's use spacy to process some news articles.

In [3]:
import spacy
from spacy.en import English

# Load the NLP toolkit by specifying the language.
# `nlp_toolkit` is called on text by later cells to parse it.
# NOTE(review): the `spacy.en` import path used above comes from older spaCy
# releases -- confirm the installed version still provides it.
nlp_toolkit = English()
nlp_toolkit

<spacy.en.English at 0x114ee4150>

In [10]:
# Parse a sample headline and keep the result in `parsed`.
title = u'IBM sees holographic calls, amazing air breathing batteries'
parsed = nlp_toolkit(title)

# `parsed` has a bunch of methods: try typing `parsed.` and tab-completing to see them.
print(parsed)
# The individual tokens -- compare with list(title) ;)
print(list(parsed))

IBM sees holographic calls, amazing air breathing batteries
[IBM, sees, holographic, calls, ,, amazing, air, breathing, batteries]


In [13]:
# Report a few attributes of every token in `parsed`.
# The loop index was never used, so iterate over the tokens directly.
for word in parsed:
    print("Word: {}".format(word))
    print("\t Phrase type: {}".format(word.dep_))
    # An empty `ent_type_` is shown as "No".
    print("\t Is the word a known entity type? {}".format(word.ent_type_ or "No"))
    print("\t Lemma: {}".format(word.lemma_))
    print("\t Parent of this word: {}".format(word.head.lemma_))
    # Call form, matching the other prints in this loop; the output is unchanged.
    print(word.sentiment)

Word: IBM
	 Phrase type: nsubj
	 Is the word a known entity type? ORG
	 Lemma: ibm
	 Parent of this word: see
0.0
Word: sees
	 Phrase type: ROOT
	 Is the word a known entity type? No
	 Lemma: see
	 Parent of this word: see
0.0
Word: holographic
	 Phrase type: amod
	 Is the word a known entity type? No
	 Lemma: holographic
	 Parent of this word: call
0.0
Word: calls
	 Phrase type: dobj
	 Is the word a known entity type? No
	 Lemma: call
	 Parent of this word: see
0.0
Word: ,
	 Phrase type: punct
	 Is the word a known entity type? No
	 Lemma: ,
	 Parent of this word: battery
0.0
Word: amazing
	 Phrase type: amod
	 Is the word a known entity type? No
	 Lemma: amazing
	 Parent of this word: battery
0.0
Word: air
	 Phrase type: compound
	 Is the word a known entity type? No
	 Lemma: air
	 Parent of this word: breathing
0.0
Word: breathing
	 Phrase type: compound
	 Is the word a known entity type? No
	 Lemma: breathing
	 Parent of this word: battery
0.0
Word: batteries
	 Phrase type: conj
	 

In [6]:
def references_organisation(title):
    """Return True if any token of `title` is tagged as an organisation.

    The title is coerced to unicode, parsed with the notebook-level
    `nlp_toolkit`, and each token's `ent_type_` is compared with 'ORG'.
    """
    parsed = nlp_toolkit(unicode(title))
    # A generator (rather than a list) lets any() stop at the first match.
    return any(word.ent_type_ == 'ORG' for word in parsed)
# Replace missing titles with '' first, then flag each title via references_organisation.
titles_no_missing = df['title'].fillna('')
df['references_organisation'] = titles_no_missing.map(references_organisation)

In [7]:
# Text preview of the new boolean column, followed by a blank separator line.
print(df['references_organisation'].head())
print('')

# Titles of the first few rows where the flag is True.
df.loc[df['references_organisation'], ['title']].head()

0     True
1    False
2    False
3    False
4    False
Name: references_organisation, dtype: bool



Unnamed: 0,title
0,IBM Sees Holographic Calls Air Breathing Batte...
6,fashion lane American Wild Child
8,Valet The Handbook 31 Days 31 days
10,Business Financial News Breaking US Internatio...
11,A Tip of the Cap to The Greatest Iron Man of T...


## Activity: "Using spacy"
Using the code above, write a function to identify titles that mention a person (PERSON). What about titles that mention either an organisation or a person?

In [None]:
# ...


## Demo: "Text processing in scikit-learn"

In [None]:
# Last lesson, we used this to create a feature for whether the title contains 'recipe'
# (case-insensitive substring check, stored as 1/0).
df['recipe'] = df['title'].map(lambda t: int('recipe' in unicode(t).lower()))
# Show a small slice of the new column.
print(df['recipe'][7376:7380])

In [None]:
# Let's now extract single-word and word-pair (bigram) features using CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# First, take the title column (a pandas Series), filling missing values with a blank string
titles = df['title'].fillna('')

# Instantiate a new CountVectorizer
vectorizer = CountVectorizer(max_features=1000,     # max vocabulary size (keeps the N most frequent terms)
                             ngram_range=(1, 2),    # (1, 1) = single words only; (1, 2) = single words and bigrams
                             stop_words='english',  # remove English language stop words, e.g. 'to', 'the', 'it'
                             binary=True)           # use 1/0 instead of word count

# Use `fit` to learn the vocabulary of the titles
vectorizer.fit(titles)

# Use `transform` to generate the sample X word matrix - one column per feature (word or n-gram)
X = vectorizer.transform(titles)
X

In [None]:
# Sparse matrix! Only the non-zero entries are recorded,
# as printing the first six rows shows.
print(X[:6])

## Demo: "Build a random forest model using vectorized title features"

In [None]:
# Re-run `transform` on the titles, then convert the result to a dense array with `toarray()`
# NOTE(review): presumably densified for the model cell that follows -- confirm it is required
X = vectorizer.transform(titles).toarray()
# Target values: the `label` column of the data frame
y = df['label']

X

In [None]:
# `cross_val_score` lives in `sklearn.model_selection` from scikit-learn 0.18 onwards;
# the old `sklearn.cross_validation` module was removed in 0.20, so fall back to it
# only when the newer module is unavailable.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier

# Fix the forest's seed so that re-running this cell reproduces the same scores
# instead of depending on how much of the global NumPy random state has been consumed.
model = RandomForestClassifier(n_estimators=20, random_state=1)

# Cross-validated ROC AUC of the forest on the vectorized titles (X) and labels (y).
scores = cross_val_score(model, X, y, scoring='roc_auc')
print('CV AUC {}, Average AUC {}'.format(scores, scores.mean()))

## Activity: "TF-IDF knowledge check"
Use `TfidfVectorizer` instead of `CountVectorizer` to create a feature representation of the StumbleUpon titles. Is this an improvement?

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# ...


## Independent Practice: "Text classification in scikit-learn"
1. Combine the title text features with one or more of the quantitative feature sets from the previous random forest model. Train this model to see if it improves AUC.
2. Use the body text instead of the title. Does this give an improvement?
3. Use `TfidfVectorizer` instead of `CountVectorizer`. Does this give an improvement?

**Check:** Were you able to prepare a model that uses both quantitative features and text features? Does this model improve the AUC?

In [None]:
# Build a random forest model to predict evergreen-ness of a website
# using the title features plus some quantitative features

# Tip: you may want to check out what this does: `from scipy.sparse import hstack`

# ...


In [None]:
# Build a random forest model to predict evergreen-ness of a website
# using the body text as features

# ...


In [None]:
# Try using tf-idf, does this give an improvement?

# ...

In [None]:
# What model gives the best mean cross-validated AUC for predicting evergreen-ness?

# ...
