In [1]:
import numpy as np
import nltk
import glob
import os
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from scipy.cluster.vq import whiten
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
 
# Load data
data_folder = r"./books/"
files = sorted(glob.glob(os.path.join(data_folder, "chapter*.txt")))
chapters = []
for fn in files:
    with open(fn) as f:
        chapters.append(f.read().replace('\n', ' '))
all_text = ' '.join(chapters)

In [2]:
import nltk

In [3]:
print all_text



In [4]:
for num, ch_text in enumerate(chapters):
    ch_text = ch_text.decode('utf-8','ignore')
    chapters[num] = ch_text
all_text = all_text.decode('utf-8','ignore')

In [5]:
# create feature vectors
num_chapters = len(chapters)
fvs_lexical = np.zeros((len(chapters), 3), np.float64)
fvs_punct = np.zeros((len(chapters), 3), np.float64)
for e, ch_text in enumerate(chapters):
    # note: the nltk.word_tokenize includes punctuation
    

    tokens = nltk.word_tokenize(ch_text.lower())
    words = word_tokenizer.tokenize(ch_text.lower())
    sentences = sentence_tokenizer.tokenize(ch_text)
    vocab = set(words)
    words_per_sentence = np.array([len(word_tokenizer.tokenize(s))
                                   for s in sentences])
 
    # average number of words per sentence
    fvs_lexical[e, 0] = words_per_sentence.mean()
    # sentence length variation
    fvs_lexical[e, 1] = words_per_sentence.std()
    # Lexical diversity
    fvs_lexical[e, 2] = len(vocab) / float(len(words))
 
    # Commas per sentence
    fvs_punct[e, 0] = tokens.count(',') / float(len(sentences))
    # Semicolons per sentence
    fvs_punct[e, 1] = tokens.count(';') / float(len(sentences))
    # Colons per sentence
    fvs_punct[e, 2] = tokens.count(':') / float(len(sentences))
 
# apply whitening to decorrelate the features
fvs_lexical = whiten(fvs_lexical)
fvs_punct = whiten(fvs_punct)

In [6]:
for chapter in chapters:
    print(chapter[:5])

  Rom
  Eph
  Col
  Gal
  1 C
  2 C
  Phi


In [7]:
print(fvs_lexical)
print(fvs_punct)

[[ 5.06180455  3.00352589  3.171466  ]
 [ 7.82564994  3.28893228  5.43567884]
 [ 5.73152427  3.44258462  5.59637298]
 [ 4.69599007  2.3638906   4.57929573]
 [ 4.66757661  4.50546673  3.12873433]
 [ 5.48567328  5.65954776  3.59114272]
 [ 5.80124814  3.79823213  5.22761445]]
[[ 3.98702027  2.16537309  1.98259344]
 [ 6.47261384  1.23644665  1.19547425]
 [ 5.60205712  1.41007532  1.49116337]
 [ 3.78981097  1.38841215  1.56613806]
 [ 3.51320842  2.57862164  2.68660918]
 [ 4.17209085  4.14209626  4.25514117]
 [ 4.91973044  1.13704603  1.37420938]]


In [8]:
# get most common words in the whole book
NUM_TOP_WORDS = 20
all_tokens = nltk.word_tokenize(all_text)
print(all_tokens[:10])
fdist = nltk.FreqDist(all_tokens)
vocab = fdist.keys()[:NUM_TOP_WORDS]
print(vocab)
# use sklearn to create the bag for words feature vector for each chapter
vectorizer = CountVectorizer(vocabulary=vocab, tokenizer=nltk.word_tokenize)
# print(vectorizer)
print(vectorizer.fit_transform(chapters).toarray().astype(np.float64))
fvs_bow = vectorizer.fit_transform(chapters).toarray().astype(np.float64)

# normalise by dividing each row by its Euclidean norm
print(np.c_[np.apply_along_axis(np.linalg.norm, 1, fvs_bow)]==0)
fvs_bow /= np.c_[np.apply_along_axis(np.linalg.norm, 1, fvs_bow)]

[u'Romans', u'1', u'1Paul', u',', u'a', u'servant', u'of', u'Christ', u'Jesus', u',']
[[ 1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  8.  0.  0.  0.  0.  5.  0.  0.
   0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  2.  0.  0.  0.  0.  0.  0.  0.
   0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  1.]
 [ 0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  4.  1.  0.  0.  0.  1.  0.  0.
   0.  0.]
 [ 0.  0.  0.  1.  0.  1.  1.  0.  0.  0.  1.  0.  0.  0.  0.  0.  1.  0.
   0.  1.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  2.  0.  0.
   0.  1.]]
[[False]
 [False]
 [ True]
 [False]
 [False]
 [False]
 [False]]


  app.launch_new_instance()


In [9]:
print(fvs_bow)

[[ 0.10540926  0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.84327404  0.          0.          0.
   0.          0.52704628  0.          0.          0.          0.        ]
 [ 0.          0.          0.          0.          0.          0.          0.
   0.4472136   0.          0.          0.89442719  0.          0.          0.
   0.          0.          0.          0.          0.          0.        ]
 [        nan         nan         nan         nan         nan         nan
          nan         nan         nan         nan         nan         nan
          nan         nan         nan         nan         nan         nan
          nan         nan]
 [ 0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          0.          0.
   0.          0.          0.          0.          0.          1.        ]
 [ 0.          0.          0.          0.          0.     

In [17]:
# get part of speech for each token in each chapter
from nltk.data import load
def token_to_pos(ch):
    tokens = nltk.word_tokenize(ch)
    return [p[1] for p in nltk.pos_tag(tokens)]
chapters_pos = [token_to_pos(ch) for ch in chapters]
 
# count frequencies for common POS types
pos_list = ['NN', 'NNP', 'DT', 'IN', 'JJ', 'NNS']
# tagdict = load('help/tagsets/upenn_tagset.pickle')
# pos_list = tagdict.keys()

fvs_syntax = np.array([[ch.count(pos) for pos in pos_list]
                       for ch in chapters_pos]).astype(np.float64)

print(fvs_syntax)
# normalise by dividing each row by number of tokens in the chapter
fvs_syntax /= np.c_[np.array([len(ch) for ch in chapters_pos])]

[[ 1289.   666.   922.  1318.   441.   332.]
 [  302.   147.   205.   332.    73.    84.]
 [  290.   142.   192.   337.   105.    96.]
 [  388.   227.   323.   441.   136.   123.]
 [ 1243.   518.  1030.  1263.   424.   411.]
 [  724.   357.   478.   927.   272.   216.]
 [  292.   153.   184.   379.    88.    83.]]


In [11]:
from nltk.data import load
tagdict = load('help/tagsets/upenn_tagset.pickle')
pos_list = tagdict.keys()
pos_list

['PRP$',
 'VBG',
 'VBD',
 '``',
 'VBN',
 ',',
 "''",
 'VBP',
 'WDT',
 'JJ',
 'WP',
 'VBZ',
 'DT',
 'RP',
 '$',
 'NN',
 ')',
 '(',
 'FW',
 'POS',
 '.',
 'TO',
 'LS',
 'RB',
 ':',
 'NNS',
 'NNP',
 'VB',
 'WRB',
 'CC',
 'PDT',
 'RBS',
 'RBR',
 'CD',
 'PRP',
 'EX',
 'IN',
 'WP$',
 'MD',
 'NNPS',
 '--',
 'JJS',
 'JJR',
 'SYM',
 'UH']

In [12]:
print(pos_list)

['PRP$', 'VBG', 'VBD', '``', 'VBN', ',', "''", 'VBP', 'WDT', 'JJ', 'WP', 'VBZ', 'DT', 'RP', '$', 'NN', ')', '(', 'FW', 'POS', '.', 'TO', 'LS', 'RB', ':', 'NNS', 'NNP', 'VB', 'WRB', 'CC', 'PDT', 'RBS', 'RBR', 'CD', 'PRP', 'EX', 'IN', 'WP$', 'MD', 'NNPS', '--', 'JJS', 'JJR', 'SYM', 'UH']


In [13]:
print(fvs_syntax)

[[ 0.11150519  0.05761246  0.07975779  0.11401384  0.03814879  0.02871972]
 [ 0.13056636  0.06355383  0.08862949  0.14353653  0.03156074  0.03631647]
 [ 0.11880377  0.05817288  0.07865629  0.13805817  0.04301516  0.03932814]
 [ 0.10210526  0.05973684  0.085       0.11605263  0.03578947  0.03236842]
 [ 0.10646681  0.04436831  0.0882227   0.10817987  0.03631692  0.03520343]
 [ 0.09367318  0.04618968  0.061845    0.1199379   0.03519213  0.02794669]
 [ 0.11128049  0.05830793  0.07012195  0.14443598  0.03353659  0.0316311 ]]


In [14]:
def PredictAuthors(fvs):
    km = KMeans(n_clusters=2, init='k-means++', n_init=10, verbose=0)
    km.fit(fvs)
 
    return km

In [15]:
print('Lexical:', PredictAuthors(fvs_lexical).labels_)
print('Punc:',PredictAuthors(fvs_punct).labels_)
print('Syntax:',PredictAuthors(fvs_syntax).labels_)


('Lexical:', array([1, 0, 0, 0, 1, 1, 0]))
('Punc:', array([0, 0, 0, 0, 1, 1, 0]))
('Syntax:', array([0, 1, 1, 0, 0, 0, 1]))


In [None]:
print('BOW:',PredictAuthors(fvs_bow).labels_)

In [None]:
PredictAuthors(fvs_syntax).cluster_centers_

In [None]:
PredictAuthors(fvs_syntax).predict([[0.12593638,  0.04548909,  0.08663554,  0.09901205, 0.05591141,  0.02073608]])

In [None]:
PredictAuthors(fvs_bow).predict( [[ 0.,          0.,          0.,          0.,          0.,          0.98058068,
   0.,          0.,          0.,          0.19611614]])

In [None]:
PredictAuthors(fvs_bow).cluster_centers_

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline  

centroids = PredictAuthors(fvs_bow).cluster_centers_

print(centroids)

plt.scatter(centroids[:, 0], centroids[:, 1],
            marker='x', s=169, linewidths=3,
            color='w', zorder=10)
