# Analysing Shakespeare

In this exercise, we will analyse Shakepeare using our new knowledge on text miming.

In [1]:
# download shakespeare

import requests

URL = "https://www.gutenberg.org/cache/epub/100/pg100.txt"
page = requests.get(URL)

# print(len(page.text))

text = page.text

In [17]:
# the text can be split into several documents using \r\n\r\n\r\n in regex
# these newlines seperate each document

shakespeare = text.split("\r\n\r\n\r\n ")

In [24]:
# the second split is the first story
shakespeare[1]

"                    1\r\n  From fairest creatures we desire increase,\r\n  That thereby beauty's rose might never die,\r\n  But as the riper should by time decease,\r\n  His tender heir might bear his memory:\r\n  But thou contracted to thine own bright eyes,\r\n  Feed'st thy light's flame with self-substantial fuel,\r\n  Making a famine where abundance lies,\r\n  Thy self thy foe, to thy sweet self too cruel:\r\n  Thou that art now the world's fresh ornament,\r\n  And only herald to the gaudy spring,\r\n  Within thine own bud buriest thy content,\r\n  And tender churl mak'st waste in niggarding:\r\n    Pity the world, or else this glutton be,\r\n    To eat the world's due, by the grave and thee."

In [47]:
# convert it to a dataframe
import pandas as pd

df = pd.DataFrame(shakespeare[1:155])
df

Unnamed: 0,0
0,1\r\n From fairest creatu...
1,2\r\n When forty winters ...
2,3\r\n Look in thy glass a...
3,4\r\n Unthrifty lovelines...
4,5\r\n Those hours that wi...
...,...
149,150\r\n O from what power...
150,151\r\n Love is too young...
151,152\r\n In loving thee th...
152,153\r\n Cupid laid by his...


Finally, let's create a Shakespeare corpus and investigate the first 5 documents in it.

In [40]:
import numpy as np 

shakespeare_five = df.iloc[1:6]
shakespeare_five.describe()

Unnamed: 0,0
count,5
unique,5
top,1\r\n From fairest creatu...
freq,1


Next, we apply our usual text transformations to clean the text, using the Corpuser function:

In [48]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

stopwords = list(stopwords.words('english'))

def Corpuser(corpus):
    corpus = word_tokenize(corpus)
    corpus = [word.replace(" ", "") for word in corpus]
    corpus = [word.lower() for word in corpus if word.isalpha()]

    corpus = [word for word in corpus if word not in stopwords]
    
    return corpus

Create the DocumentTermMatrix:

In [77]:
# then we create a frequency table

docs = shakespeare[1:155]
def frequencytable(corpus):
    words = Corpuser(corpus)
    freq_table = {}
    for word in words:
        if word in freq_table:
            freq_table[word] += 1
        else:
            freq_table[word] = 1
    return freq_table

ft = frequencytable(str(docs))

In [78]:
def dtm(documents):
    dfs = []
    for i in range(len(documents)):
        table = frequencytable(str(documents[i]))
        i = pd.DataFrame.from_dict(table, orient='index', columns={i})
        dfs.append(i)
    dtm = pd.concat(dfs, axis=1)
    dtm = dtm.fillna(0)
    
    return dtm

In [79]:
shakespeare_dtm = dtm(docs)

In [80]:
shakespeare_dtm

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,144,145,146,147,148,149,150,151,152,153
fairest,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
creatures,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
desire,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
increase,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
thereby,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
helena,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
gentlewoman,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
protected,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
diana,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


Let's have a closer look at the first 10 docs

In [158]:
doc10 = shakespeare[1:11]
ft = frequencytable(str(doc10))
dtm_10 = dtm(doc10)

In [159]:
dtm_10

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
fairest,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
creatures,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
desire,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
increase,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
thereby,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
fairer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
lodged,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
presence,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
kind,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


There are many many empty entries (0), which means the words do not appear in the document. We can for example use only the 10 most occuring terms

In [164]:
def top_n_terms(freqtab, n):
    sorted_ft = sorted(freqtab.items(), key=lambda x: x[1], reverse=True)
    freqtab = sorted_ft[:n]  
    terms = [tup[0] for tup in freqtab]
    return terms

In [135]:
# def dtm_10(documents):
#     dfs = []
#     for i in range(len(documents)):
#         table = frequencytable(str(documents[i]))
#         i = pd.DataFrame.from_dict(table, orient='index', columns={i})
#         dfs.append(i)
#     dfs
#     dtm = pd.concat(dfs, axis=1)
#     dtm = dtm.fillna(0)
    
#     return dtm

In [165]:
top10 = top_n_terms(ft, 10)
top10

['thou',
 'thy',
 'thee',
 'beauty',
 'self',
 'thine',
 'world',
 'art',
 'sweet',
 'another']

In [166]:
dtm_top10 = dtm_10[dtm_10.index.isin(top10)]

In [167]:
dtm_top10

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
beauty,1.0,4.0,0.0,2.0,3.0,1.0,1.0,0.0,1.0,1.0
thou,2.0,3.0,6.0,5.0,0.0,5.0,2.0,4.0,3.0,6.0
thine,2.0,2.0,2.0,0.0,0.0,2.0,0.0,2.0,0.0,1.0
thy,5.0,7.0,4.0,6.0,0.0,3.0,2.0,0.0,2.0,6.0
self,2.0,0.0,0.0,4.0,0.0,2.0,1.0,0.0,1.0,4.0
sweet,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0
art,1.0,1.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,3.0
world,3.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0
thee,1.0,0.0,2.0,3.0,0.0,4.0,0.0,2.0,2.0,2.0
another,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0


Better, much better ...


Let's find the 50 most frequent terms in Shakespeare next.

In [168]:
top50 = top_n_terms(ft, 50)
top50

['thou',
 'thy',
 'thee',
 'beauty',
 'self',
 'thine',
 'world',
 'art',
 'sweet',
 'another',
 'dost',
 'doth',
 'still',
 'ten',
 'one',
 'eyes',
 'make',
 'love',
 'die',
 'time',
 'shall',
 'treasure',
 'shame',
 'fair',
 'look',
 'glass',
 'whose',
 'age',
 'live',
 'single',
 'every',
 'summer',
 'winter',
 'times',
 'widow',
 'may',
 'desire',
 'might',
 'tender',
 'heir',
 'bear',
 'lies',
 'fresh',
 'within',
 'waste',
 'else',
 'deep',
 'youth',
 'lusty',
 'much']

## TF/IDF

We have discussed the tf/idf scoring for documents in the lecture. 

In python, tf/idf is easy to create using scikit learn. 

import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
vec = tfidf.fit_transform(doc10)
 
matrix = pd.DataFrame(vec.toarray().transpose(), index=tfidf.get_feature_names())
 
matrix

## k-means clustering

Using, tf-idf we can apply our favourite clustering technique k-means to understand common clusters of words.

In [174]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=10).fit(vec)

In [177]:
kmeans.labels_

array([6, 5, 9, 0, 2, 8, 3, 4, 7, 1], dtype=int32)

In [None]:
# what to do with the kmeans??
# what exactly is being clustered?
