In [1]:
from gutenberg.acquire import load_etext
from gutenberg.cleanup import strip_headers
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import numpy as np
import scipy.io as sio

In [2]:
# Moby Dick -- etext 2701, with headers stripped and surrounding whitespace trimmed.
mobydick = strip_headers(load_etext(2701))
mobydick = mobydick.strip()
# The corpus list starts with everything after the last 'ETYMOLOGY' marker.
text = [mobydick.rsplit('ETYMOLOGY', 1)[-1]]

In [3]:
# Pride and Prejudice -- etext 1342, with headers stripped and surrounding whitespace trimmed.
pride = strip_headers(load_etext(1342))
pride = pride.strip()
# Keep only what follows the last occurrence of 'Austen'.
text.append(pride.rsplit('Austen', 1)[-1])

In [4]:
# Sense and Sensibility -- etext 161, with headers stripped and surrounding whitespace trimmed.
sense = strip_headers(load_etext(161))
sense = sense.strip()
# Keep only what follows the last '(1811)' line.
text.append(sense.rsplit('\n(1811)\n\n', 1)[-1])

In [5]:
# Huck Finn -- etext 76, with headers stripped and surrounding whitespace trimmed.
huckfinn = strip_headers(load_etext(76))
huckfinn = huckfinn.strip()
# Keep only what follows the last 'EXPLANATORY' heading.
text.append(huckfinn.rsplit('\nEXPLANATORY\n\n', 1)[-1])

In [6]:
# Tom Sawyer -- etext 74, with headers stripped and surrounding whitespace trimmed.
toms = strip_headers(load_etext(74))
toms = toms.strip()
# Keep only what follows the last 'PREFACE' heading.
text.append(toms.rsplit('\nPREFACE\n\n', 1)[-1])

In [7]:
# Twenty Thousand Leagues -- etext 164, with headers stripped and surrounding whitespace trimmed.
leagues20k = strip_headers(load_etext(164))
leagues20k = leagues20k.strip()
# Keep only what follows the last ' VERNE' line.
text.append(leagues20k.rsplit(' VERNE\n\n', 1)[-1])

In [8]:
# Ulysses -- etext 4300, with headers stripped and surrounding whitespace trimmed.
ulysses = strip_headers(load_etext(4300))
ulysses = ulysses.strip()
# Keep only what follows the last '— I —' part heading.
text.append(ulysses.rsplit('\n— I —\n', 1)[-1])

In [9]:
# Wizard of Oz -- etext 55, with headers stripped and surrounding whitespace trimmed.
wizard = strip_headers(load_etext(55))
wizard = wizard.strip()
# Keep only what follows the last occurrence of the title line.
text.append(wizard.rsplit('\nTHE WONDERFUL WIZARD OF OZ\n\n', 1)[-1])

In [10]:
# Beowulf (disabled): the loading code below is commented out, so it is not added to `text`
#beowulf = strip_headers(load_etext(16328)).strip()
#text.append(beowulf.split('\nBEOWULF.\n\n')[-1])

In [26]:
# One display title per loaded book, listed in the same order the books were appended to `text`.
titles = [
    'Moby Dick',
    'Pride Prejudice',
    'Sense Sensibility',
    'Huck Finn',
    'Tom Sawyer',
    '20K Leagues',
    'Ulysses',
    'Wiz of Oz',
]

In [27]:
# Keep entries 0, 1, 2, 5, 6, 7 of both lists; positions 3 and 4 are dropped.
# The same positions are used for `text` and `titles` so the two lists stay aligned.
# NOTE: both names are rebound in place, so re-running this cell filters them a second time.
text = [text[k] for k in (0, 1, 2, 5, 6, 7)]
titles = [titles[k] for k in (0, 1, 2, 5, 6, 7)]
titles

['Moby Dick',
 'Pride Prejudice',
 'Sense Sensibility',
 '20K Leagues',
 'Ulysses',
 'Wiz of Oz']

In [14]:
import re

# Split every book into chapter-sized segments. Default marker: 'CHAPTER '.
# Indices below refer to `text` after it has been reduced to six books
# (the original code wrote them as 6-2 and 7-2) -- assumes that filter cell has run.
chapters = [book.split('CHAPTER ') for book in text]

# Manual fixes for the books that use a different chapter marker.
chapters[1] = text[1].split('Chapter ')
chapters[4] = text[4].split(' ]\n\n')
# Headings of the form "<blank lines>NN. ". A raw string is used so that `\.` and `\s`
# reach the regex engine verbatim instead of being invalid string escapes.
# Only this split drops the leading pre-chapter segment ([1:]); the other books keep it
# as element 0.
chapters[5] = re.split(r"\n\n\n[0-9]{1,2}\.\s", text[5])[1:]

# Flat list of all segments, book after book, in the same order as `chapters`.
allchaps = [chap for book_chaps in chapters for chap in book_chaps]

In [15]:
#re.compile("\n\nX{0,1}IX|IV|V?I{0,3}\.\s").split(text[-1])[-100:]

In [16]:
# Cumulative segment counts with a leading 0, i.e. the boundaries of each book inside
# the flattened chapter list (MATLAB equivalent: chapix = [0 cumsum(chapix)]).
chapix = np.insert(np.cumsum(list(map(len, chapters))), 0, 0)
chapix

array([  0, 150, 212, 263, 310, 329, 351])

In [17]:
def textcounter(chapters,n_features=1000):
    """Compute TF-IDF features for a collection of text segments.

    Parameters
    ----------
    chapters : iterable of str
        Documents to vectorize; each item becomes one row of the result.
    n_features : int, default 1000
        Passed to ``TfidfVectorizer`` as ``max_features`` (vocabulary size cap).

    Returns
    -------
    freq
        Whatever ``fit_transform`` returns (a sparse document-term matrix).
    words : list of str
        Vocabulary terms in the column order of ``freq``.
    """
    # min_df=2 drops terms seen in fewer than two documents; max_df=0.999 drops
    # terms present in more than 99.9% of them; English stop words are removed.
    tfidf_vectorizer = TfidfVectorizer(max_df=0.999, min_df=2,
                                       max_features=n_features,
                                       stop_words='english')
    freq = tfidf_vectorizer.fit_transform(chapters)
    # `get_feature_names` was removed in scikit-learn 1.2; prefer the replacement
    # and fall back to the old name on releases that predate it. The result is
    # converted to a list so callers get the same type either way.
    if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
        words = list(tfidf_vectorizer.get_feature_names_out())
    else:
        words = tfidf_vectorizer.get_feature_names()
    return freq, words

In [18]:
# TF-IDF over every chapter of every book, restricted to a 50-term vocabulary.
freq, words = textcounter(allchaps, n_features=50)

In [19]:
# Term whose column of `freq` has the largest total weight across all rows.
words[freq.sum(axis=0).argmax()]

'whale'

In [20]:
# Export the pooled features to a MATLAB file.
# `dtype=object` replaces `np.object`, an alias that was removed in NumPy 1.24
# (it was always just the builtin `object`).
sio.savemat('books.mat', {
    'freq': freq.todense(),
    'words': np.array(words, dtype=object),
    'chapix': chapix,
    'titles': np.array(titles, dtype=object),
})

In [21]:
# Run the vectorizer on each book separately (15-term vocabulary per book),
# indexing `chapters` by position for every entry of `titles`.
book_results = [textcounter(chapters[k], 15) for k, _title in enumerate(titles)]
# Split the (freq, words) pairs into two parallel lists.
sepfreq = [book_freq for book_freq, _words in book_results]
sepwords = [book_words for _freq, book_words in book_results]

In [22]:
# Display the selected vocabulary terms for each book (one inner list per entry of `titles`).
sepwords

[['ahab',
  'boat',
  'captain',
  'great',
  'head',
  'like',
  'long',
  'man',
  'old',
  'said',
  'sea',
  'ship',
  'time',
  'whale',
  'ye'],
 ['bennet',
  'bingley',
  'darcy',
  'did',
  'elizabeth',
  'jane',
  'know',
  'miss',
  'mr',
  'mrs',
  'said',
  'sister',
  'soon',
  'think',
  'time'],
 ['dashwood',
  'did',
  'edward',
  'elinor',
  'jennings',
  'know',
  'marianne',
  'miss',
  'mother',
  'mrs',
  'said',
  'sister',
  'think',
  'time',
  'willoughby'],
 ['captain',
  'conseil',
  'day',
  'did',
  'feet',
  'land',
  'like',
  'long',
  'nautilus',
  'ned',
  'nemo',
  'said',
  'sea',
  'sir',
  'water'],
 ['bloom',
  'did',
  'eyes',
  'good',
  'hand',
  'know',
  'like',
  'man',
  'mr',
  'old',
  'said',
  'says',
  'stephen',
  'time',
  'yes'],
 ['asked',
  'came',
  'girl',
  'good',
  'great',
  'green',
  'lion',
  'little',
  'oz',
  'said',
  'scarecrow',
  'tin',
  'toto',
  'witch',
  'woodman']]

In [23]:
# Flatten the per-book results into one flat list each.
# NOTE: this cell rebinds its own inputs, so it is only valid to run it once
# after the cell that builds the per-book lists.
sepwords = [word for book_words in sepwords for word in book_words]
# Each per-book matrix is iterated item by item and every item is densified on its own.
sepfreq = [row.todense() for book_freq in sepfreq for row in book_freq]

In [24]:
# Preview: items 2 through 9 of the flattened list (same result as slicing [1:10] twice).
sepfreq[2:10]

[matrix([[0.        , 0.        , 0.        , 0.22636679, 0.        ,
          0.18968858, 0.        , 0.        , 0.88680139, 0.28645535,
          0.        , 0.        , 0.11477586, 0.10143051, 0.14436513]]),
 matrix([[0.        , 0.03037384, 0.        , 0.13388089, 0.43728575,
          0.31786646, 0.23385453, 0.34928032, 0.32780248, 0.5082578 ,
          0.15728794, 0.09442728, 0.18101941, 0.11997878, 0.25614709]]),
 matrix([[0.        , 0.        , 0.        , 0.49340825, 0.26859747,
          0.41346131, 0.23505105, 0.22716127, 0.        , 0.        ,
          0.1242157 , 0.        , 0.62543798, 0.        , 0.        ]]),
 matrix([[0.        , 0.        , 0.        , 0.26128346, 0.14223531,
          0.43789539, 0.24894172, 0.60146422, 0.        , 0.        ,
          0.5262256 , 0.13821401, 0.        , 0.        , 0.        ]]),
 matrix([[0.        , 0.        , 0.        , 0.16996574, 0.18504906,
          0.71213094, 0.1619374 , 0.        , 0.16646197, 0.        ,
        

In [28]:
# Export the per-book features to a MATLAB file.
# NOTE: this writes to the same 'books.mat' path as the earlier pooled-feature export,
# replacing that file.
# `dtype=object` replaces `np.object`, an alias that was removed in NumPy 1.24
# (it was always just the builtin `object`).
sio.savemat('books.mat', {
    'freq': sepfreq,
    'words': np.array(sepwords, dtype=object),
    'chapix': chapix,
    'titles': np.array(titles, dtype=object),
})