In [216]:
# Import all of the things you need to import!
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.cluster import KMeans
import re
count_vectorizer = CountVectorizer()
pd.options.display.max_columns = 30

# Homework 14 (or so): TF-IDF text analysis and clustering

Hooray, we kind of figured out how text analysis works! Some of it is still magic, but at least the **TF** and **IDF** parts make a little sense. Kind of. Somewhat.

No, just kidding, we're *professionals* now.

## Investigating the Congressional Record

The [Congressional Record](https://en.wikipedia.org/wiki/Congressional_Record) is more or less what happened in Congress every single day. Speeches and all that. A good large source of text data, maybe?

Let's pretend it's totally secret but we just got it leaked to us in a data dump, and we need to check it out. It was leaked from [this page here](http://www.cs.cornell.edu/home/llee/data/convote.html).

In [217]:
# If you'd like to download it through the command line...
# !curl -O http://www.cs.cornell.edu/home/llee/data/convote/convote_v1.1.tar.gz

In [218]:
# And then extract it through the command line...
# !tar -zxf convote_v1.1.tar.gz

You can explore the files if you'd like, but we're going to get the ones from `convote_v1.1/data_stage_one/development_set/`. It's a bunch of text files.

In [219]:
# glob finds files matching a certain filename pattern
import glob

# Give me all the text files
paths = glob.glob('../../../data/convote_v1.1/data_stage_one/development_set/*')
paths[:5]

['../../../data/convote_v1.1/data_stage_one/development_set/052_400239_1479011_ROY.txt',
 '../../../data/convote_v1.1/data_stage_one/development_set/493_400388_2346095_ROY.txt',
 '../../../data/convote_v1.1/data_stage_one/development_set/199_400133_2013013_DOY.txt',
 '../../../data/convote_v1.1/data_stage_one/development_set/493_400436_2346101_DON.txt',
 '../../../data/convote_v1.1/data_stage_one/development_set/421_400338_2010025_ROY.txt']

In [220]:
len(paths)

702

So great, we have 702 of them. Now let's import them.

In [221]:
speeches = []
for path in paths:
    with open(path) as speech_file:
        speech = {
            'pathname': path,
            'filename': path.split('/')[-1],
            'content': speech_file.read()
        }
    speeches.append(speech)
speeches_df = pd.DataFrame(speeches)
speeches_df.head()

Unnamed: 0,content,filename,pathname
0,"mr. chairman , i am happy to yield 2 minutes t...",052_400239_1479011_ROY.txt,../../../data/convote_v1.1/data_stage_one/deve...
1,"mr. chairman , i yield myself such time as i m...",493_400388_2346095_ROY.txt,../../../data/convote_v1.1/data_stage_one/deve...
2,"mr. speaker , i thank the gentleman from wisco...",199_400133_2013013_DOY.txt,../../../data/convote_v1.1/data_stage_one/deve...
3,"mr. chairman , i thank my friend , the gentlem...",493_400436_2346101_DON.txt,../../../data/convote_v1.1/data_stage_one/deve...
4,"madam speaker , i recently hosted roundtables ...",421_400338_2010025_ROY.txt,../../../data/convote_v1.1/data_stage_one/deve...


In class we had the `texts` variable. For the homework can just do `speeches_df['content']` to get the same sort of list of stuff.

**Take a look at the contents of the first 5 speeches**

In [222]:
speeches_df['content'].head(5)

0    mr. chairman , i am happy to yield 2 minutes t...
1    mr. chairman , i yield myself such time as i m...
2    mr. speaker , i thank the gentleman from wisco...
3    mr. chairman , i thank my friend , the gentlem...
4    madam speaker , i recently hosted roundtables ...
Name: content, dtype: object

# Doing our analysis

Use the `sklearn` package and a plain boring `CountVectorizer` to get a list of all of the tokens used in the speeches. If it won't list them all, that's ok! Make a dataframe with those terms as columns.

**Be sure to include English-language stopwords**

In [223]:
count_vectorizer = CountVectorizer(stop_words='english')
X = count_vectorizer.fit_transform(speeches_df['content'])

In [224]:
X

<702x9106 sparse matrix of type '<class 'numpy.int64'>'
	with 56106 stored elements in Compressed Sparse Row format>

In [225]:
X.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [226]:
pd.DataFrame(X.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,...,9091,9092,9093,9094,9095,9096,9097,9098,9099,9100,9101,9102,9103,9104,9105
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0


Okay, it's **far** too big to even look at. Let's try to get a list of features from a new `CountVectorizer` that only takes the top 100 words.

In [227]:
count_vectorizer = CountVectorizer(stop_words='english', max_features=100)
X = count_vectorizer.fit_transform(speeches_df['content'])
X

<702x100 sparse matrix of type '<class 'numpy.int64'>'
	with 11088 stored elements in Compressed Sparse Row format>

In [228]:
X.toarray()

array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 2, ..., 0, 0, 1],
       [1, 0, 0, ..., 1, 1, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 2],
       [0, 0, 0, ..., 0, 0, 1]], dtype=int64)

In [229]:
pd.DataFrame(X.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,...,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99
0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,0,0,2,0,2,0,0,0,0,1,0,0,0,4,14,...,5,0,1,3,1,0,0,1,0,0,0,0,0,0,1
2,1,0,0,0,0,0,0,0,0,0,0,0,0,2,0,...,0,1,0,1,4,0,0,0,0,0,0,0,1,1,0
3,0,0,0,0,3,0,1,0,0,0,0,0,0,1,4,...,3,1,1,0,0,0,0,0,1,2,0,0,0,0,0
4,0,0,2,0,0,0,2,0,1,0,0,0,0,2,0,...,0,0,0,1,1,14,2,2,0,0,3,2,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
6,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,0,0,1,0,1,0,0,0,0,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,3,0,0,0,2,0,0,0,0,1,0,0,0,2,0,...,0,0,0,2,1,0,0,0,0,0,0,0,1,0,1
9,1,0,0,0,0,0,0,0,0,0,0,0,0,0,6,...,0,1,0,1,3,0,0,1,1,0,0,0,1,1,0


Now let's push all of that into a dataframe with nicely named columns.

In [230]:
speech_df = pd.DataFrame(X.toarray(), columns=count_vectorizer.get_feature_names())
speech_df.head(5)

Unnamed: 0,000,11,act,allow,amendment,america,american,amp,association,balance,based,believe,bipartisan,chairman,children,...,teachers,thank,think,time,today,trade,united,urge,vote,want,way,work,year,years,yield
0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,0,0,2,0,2,0,0,0,0,1,0,0,0,4,14,...,5,0,1,3,1,0,0,1,0,0,0,0,0,0,1
2,1,0,0,0,0,0,0,0,0,0,0,0,0,2,0,...,0,1,0,1,4,0,0,0,0,0,0,0,1,1,0
3,0,0,0,0,3,0,1,0,0,0,0,0,0,1,4,...,3,1,1,0,0,0,0,0,1,2,0,0,0,0,0
4,0,0,2,0,0,0,2,0,1,0,0,0,0,2,0,...,0,0,0,1,1,14,2,2,0,0,3,2,0,0,0


Everyone seems to start their speeches with "mr chairman" - how many speeches are there total, and many don't mention "chairman" and how many mention neither "mr" nor "chairman"?

In [231]:
print("All speeches:", speech_df['000'].count())

All speeches: 702


In [232]:
without_chairman = speech_df[speech_df['chairman'] == 0]['chairman'].count()
print("Without 'chairman':", without_chairman)

Without 'chairman': 250


In [233]:
without_mr_chairman = speech_df[(speech_df['chairman'] == 0) & (speech_df['mr'] == 0)]['chairman'].count()
print("Without 'Mr.' and 'chairman':", without_mr_chairman)

Without 'Mr.' and 'chairman': 76


What is the index of the speech thank is the most thankful, a.k.a. includes the word 'thank' the most times?

In [234]:
speech_df['thank'].idxmax()

139

If I'm searching for `China` and `trade`, what are the top 3 speeches to read according to the `CountVectoriser`?

In [238]:
china_trade = speech_df.sort_values(by=['china', 'trade'], ascending=False)[['china', 'trade']]
china_trade.head(3)

Unnamed: 0,china,trade
638,29,63
326,27,9
622,16,11


Now what if I'm using a `TfidfVectorizer`?

In [239]:
tfidf_vectorizer = TfidfVectorizer(stop_words='english', norm='l1')
X = tfidf_vectorizer.fit_transform(speech_df)
tfid_df = pd.DataFrame(X.toarray(), columns=tfidf_vectorizer.get_feature_names())
china_trade2 = tfid_df.sort_values(by=['china', 'trade'], ascending=False)[['china', 'trade']]
china_trade2.head(3)

Unnamed: 0,china,trade
15,1.0,0.0
90,0.0,1.0
0,0.0,0.0


**What's the content of the speeches?** Here's a way to get them:

In [243]:
# index 0 is the first speech, which was the first one imported.
print(paths[638])
print(paths[326])
print(paths[622])

../../../data/convote_v1.1/data_stage_one/development_set/421_400328_2053036_RMY.txt
../../../data/convote_v1.1/data_stage_one/development_set/421_400385_2010038_RMY.txt
../../../data/convote_v1.1/data_stage_one/development_set/421_400227_2010019_DON.txt


In [244]:
# Pass that into 'cat' using { } which lets you put variables in shell commands
# that way you can pass the path to cat
!cat {paths[638]}

OSError: [Errno 12] Cannot allocate memory

**Now search for something else!** Another two terms that might show up. `elections` and `chaos`? Whatever you thnik might be interesting.

In [245]:
def boring_tokenizer(str_input):
    words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
    return words

count_vectorizer = CountVectorizer(stop_words='english', tokenizer=boring_tokenizer)
X = count_vectorizer.fit_transform(speeches_df['content'])

more_speeches_df = pd.DataFrame(X.toarray(), columns=count_vectorizer.get_feature_names())
tech_ny = more_speeches_df.sort_values(by=['technology', 'york'], ascending=False)[['technology', 'york']]
tech_ny.head(5)

Unnamed: 0,technology,york
472,4,6
237,2,0
46,1,0
173,1,0
262,1,0


In [246]:
!cat {paths[173]}

OSError: [Errno 12] Cannot allocate memory

# Enough of this garbage, let's cluster

Using a **simple counting vectorizer**, cluster the documents into **eight categories**, telling me what the top terms are per category.

Using a **term frequency vectorizer**, cluster the documents into **eight categories**, telling me what the top terms are per category.

Using a **term frequency inverse document frequency vectorizer**, cluster the documents into **eight categories**, telling me what the top terms are per category.

In [247]:
# Simple counting vectorizer
def boring_tokenizer(str_input):
    words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
    return words

count_vectorizer = CountVectorizer(stop_words='english', tokenizer=boring_tokenizer)
count_vectorizer._validate_vocabulary()
X = count_vectorizer.fit_transform(speeches_df['content'])

number_of_clusters = 8
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)

print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(number_of_clusters):
    top_ten_words = [terms[ind] for ind in order_centroids[i, :5]]
    print("Cluster {}: {}".format(i, ' '.join(top_ten_words)))

Top terms per cluster:
Cluster 0: casted auctions acquiesced boyoo 2015webster
Cluster 1: binty 202and atoll bloodings befriend
Cluster 2: auctions casted bubblehead butbenj actuality
Cluster 3: bigots cheated avowed byes bo-
Cluster 4: bigots accusation assassin 2015webster cheated
Cluster 5: actionand byes ariella annunciated --
Cluster 6: 475 binlow burglar agreat accusingly
Cluster 7: buzzed -all buzzingor archie attackalthough


In [248]:
# term frequency vectorizer

vectorizer = TfidfVectorizer(use_idf=True, tokenizer=boring_tokenizer, stop_words='english')
X = vectorizer.fit_transform(speeches_df['content'])

number_of_clusters = 8
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)

print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(number_of_clusters):
    top_ten_words = [terms[ind] for ind in order_centroids[i, :5]]
    print("Cluster {}: {}".format(i, ' '.join(top_ten_words)))

Top terms per cluster:
Cluster 0: balance yield time chairman mr
Cluster 1: start head children program religious
Cluster 2: mr yield minutes gentleman chairman
Cluster 3: yield gentleman mr chairman texas
Cluster 4: china trade s currency speaker
Cluster 5: demand recorded vote mr speaker
Cluster 6: mr amendment chairman time gentleman
Cluster 7: reserve balance time mr chairman


In [249]:
#term frequency inverse document frequency vectorizer

idf_vectorizer = TfidfVectorizer(stop_words='english', tokenizer=boring_tokenizer, use_idf=True)
X = idf_vectorizer.fit_transform(speeches_df['content'])

number_of_clusters = 8
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)

print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(number_of_clusters):
    top_ten_words = [terms[ind] for ind in order_centroids[i, :5]]
    print("Cluster {}: {}".format(i, ' '.join(top_ten_words)))

Top terms per cluster:
Cluster 0: demand recorded vote mr speaker
Cluster 1: yield gentleman mr minutes 2
Cluster 2: mr amendment chairman gentleman speaker
Cluster 3: start head children program religious
Cluster 4: frivolous election house lawsuits elections
Cluster 5: china trade s currency cafta
Cluster 6: claim consent unanimous opposition ask
Cluster 7: balance yield chairman time mr


**Which one do you think works the best?**

In my optionion, the "term frequency vectorizer" works best in this particular case. 

# Harry Potter time

I have a scraped collection of Harry Potter fanfiction at https://github.com/ledeprogram/courses/raw/master/algorithms/data/hp.zip.

I want you to read them in, vectorize them and cluster them. Use this process to find out **the two types of Harry Potter fanfiction**. What is your hypothesis?

In [250]:
paths2 = glob.glob('../../../data/hp/*')
paths2[:5]

['../../../data/hp/10596282.txt',
 '../../../data/hp/10605896.txt',
 '../../../data/hp/10607034.txt',
 '../../../data/hp/10611870.txt',
 '../../../data/hp/10607731.txt']

In [251]:
len(paths2)

1874

In [252]:
fanfiction = []
for path in paths2:
    with open(path) as fan_file:
        fiction = {
            'pathname': path,
            'filename': path.split('/')[-1],
            'content': fan_file.read()
        }
    fanfiction.append(fiction)
fanfiction_df = pd.DataFrame(fanfiction)
fanfiction_df.head()

Unnamed: 0,content,filename,pathname
0,"25 July 1975Dear Sirius,You showed up at my ho...",10596282.txt,../../../data/hp/10596282.txt
1,It was the kind of mistake that left a sickeni...,10605896.txt,../../../data/hp/10605896.txt
2,"Hey there! This is my next story titled ""My Mi...",10607034.txt,../../../data/hp/10607034.txt
3,"Teddy sat by Molly's side, arm wrapped around ...",10611870.txt,../../../data/hp/10611870.txt
4,DZ2's 'A Great Deal of Courage' ChallengePlot:...,10607731.txt,../../../data/hp/10607731.txt


In [253]:
vectorizer = TfidfVectorizer(use_idf=True, tokenizer=boring_tokenizer, stop_words='english')
X = vectorizer.fit_transform(fanfiction_df['content'])

In [254]:
number_of_clusters = 2
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)

print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(number_of_clusters):
    top_ten_words = [terms[ind] for ind in order_centroids[i, :5]]
    print("Cluster {}: {}".format(i, ' '.join(top_ten_words)))

Top terms per cluster:
Cluster 0: harry hermione t s draco
Cluster 1: t s lily james sirius


It looks that one part of the fanfiction is telling the story of Harry Potter and his mates (and rivals) at the Hogwarts School while the other part is dedicated to the story of Harry Potter's parents James und Lily Potter.