http://jeroenjanssens.com/2013/09/19/seven-command-line-tools-for-data-science.html

In [205]:
import numpy as np
from os import path
data_dir = "/home/daniel/git/Python2.7/DataScience/nlp_training_data"

chap_line_nums = np.loadtxt(path.join(data_dir, 'alice_chap_nums.txt'), dtype='int', delimiter='\n')

f = open(path.join(data_dir,'alice.txt'), 'r')
lines = f.readlines()
f.close()
     
print chap_line_nums

[ 212  422  632  891 1160 1461 1792 2137 2454 2778 3077 3343 3637]


In [206]:
for i in range(chap_line_nums.size - 1):
    chap_num = i + 1
    f = open(path.join(data_dir,'chap_%02i.txt' %chap_num), 'w')
    
    start = chap_line_nums[i]
    end = chap_line_nums[i+1] - 1

    for line in lines[start:end]:
        f.write(line) 
    
    f.close()
    

In [284]:
from nltk import tokenize
import nltk
import re

In [648]:
def get_chap_sents(filename):
    f = open(filename, 'r')
    lines = f.read()
    f.close()

    sents = tokenize.sent_tokenize(lines)
    sents = map(lambda s:s.replace('\r\n',' '), sents)
    sents = map(lambda s:s.replace('_',''), sents)
    
    return sents

def join_n_sentences(sentences, n=5):
    new_lines = []
    num_sent = len(sentences)
    index = np.arange(0, num_sent, n)
    
    for i in index:
        concat_lines = ' '.join(sentences[i:i+n+1])
        new_lines.append(concat_lines)
        
    return new_lines    

def build_docs(filename):
    sents = get_chap_sents(filename)
    doc = join_n_sentences(sents)
    
    return doc

def get_nltk_pos(string):
    test = string.decode('ascii', 'ignore')
    test = tokenize.word_tokenize(test)
    pos = nltk.pos_tag(test)
    
    return pos

pat = "^[NVJR]"
#pat = "^N"
reg = re.compile(pat)
def filter_pos(pos):
    out = []
    for item in pos:
        if reg.match(item[1]):
            out.append(item[0])
            
    return out        

def get_pos(doc):
    out = []
    for string in doc:
        pos = get_nltk_pos(string)
        #print pos
        temp = filter_pos(pos)
        string = ' '.join(temp)
        out.append(string)
    
    #out = [item for sub_list in out for item in sub_list]
    return out

def remove_short_words(pos):
    out = []
    for word in pos.split(' '):
        if len(word) < 3:
            continue
        else:
            out.append(word)
         
    return ' '.join(out)    
      
        

In [649]:
import glob
import pandas as pd

chps = glob.glob(path.join(data_dir,"chap_*.txt"))
chps.sort()

for ch in chps:
    print ch
    

/home/daniel/git/Python2.7/DataScience/nlp_training_data/chap_01.txt
/home/daniel/git/Python2.7/DataScience/nlp_training_data/chap_02.txt
/home/daniel/git/Python2.7/DataScience/nlp_training_data/chap_03.txt
/home/daniel/git/Python2.7/DataScience/nlp_training_data/chap_04.txt
/home/daniel/git/Python2.7/DataScience/nlp_training_data/chap_05.txt
/home/daniel/git/Python2.7/DataScience/nlp_training_data/chap_06.txt
/home/daniel/git/Python2.7/DataScience/nlp_training_data/chap_07.txt
/home/daniel/git/Python2.7/DataScience/nlp_training_data/chap_08.txt
/home/daniel/git/Python2.7/DataScience/nlp_training_data/chap_09.txt
/home/daniel/git/Python2.7/DataScience/nlp_training_data/chap_10.txt
/home/daniel/git/Python2.7/DataScience/nlp_training_data/chap_11.txt
/home/daniel/git/Python2.7/DataScience/nlp_training_data/chap_12.txt


In [650]:
docs = build_docs(chps[0])
print docs[3] 
print '\n' + str(len(docs)) + " docs" + '\n'
docs = map(remove_short_words, docs)
print docs[3]

Why, I wouldn't say anything about it, even if I fell off the top of the house!" (Which was very likely true.) Down, down, down. Would the fall never come to an end? "I wonder how many miles I've fallen by this time?" she said aloud.

18 docs

Why, wouldn't say anything about it, even fell off the top the house!" (Which was very likely true.) Down, down, down. Would the fall never come end? wonder how many miles I've fallen this time?" she said aloud.


In [651]:
pos = get_pos(docs)
print pos[0]

[ Sidenote Down Rabbit-Hole ] ALICE was beginning get very tired sitting sister bank having nothing do once twice had peeped book sister was reading had pictures conversations use book thought Alice pictures conversations was considering own mind well hot day made feel very sleepy stupid ) pleasure making daisy-chain worth trouble getting picking daisies suddenly White Rabbit pink eyes ran close was nothing very remarkable did Alice think very much way hear Rabbit say dear dear too late thought over afterwards occurred have wondered time seemed quite natural ) Rabbit actually took watch waistcoat-pocket looked then hurried Alice started feet flashed across mind had never seen rabbit waistcoat-pocket watch take out burning curiosity ran field was just time see pop large rabbit-hole hedge


In [652]:
df = pd.DataFrame()

df['doc'] = pos
#df['doc'] = docs
df['chap'] = np.ones(len(pos))
df

Unnamed: 0,doc,chap
0,[ Sidenote Down Rabbit-Hole ] ALICE was beginn...,1
1,( thought over afterwards occurred have wonder...,1
2,took down jar shelves passed was labelled ORAN...,1
3,n't say anything even fell top house Which was...,1
4,said aloud getting somewhere centre earth Let ...,1
5,Presently began again wonder fall right earth ...,1
6,( tried curtsey spoke fancy curtseying 're fal...,1
7,was nothing else do Alice soon began talking a...,1
8,are mice air 'm afraid catch bat 's very mouse...,1
9,see n't answer question did n't much matter wa...,1


In [653]:
for ind, _file in enumerate(chps[1:], start=2):
    docs = build_docs(_file)
    docs = map(remove_short_words, docs)
    pos = get_pos(docs)
    n = len(pos)
    temp = pd.DataFrame({ 'doc':pos, 'chap':np.ones(n)*ind })
    #print temp
    df = pd.concat([df, temp], axis=0)

df    

Unnamed: 0,chap,doc
0,1,[ Sidenote Down Rabbit-Hole ] ALICE was beginn...
1,1,( thought over afterwards occurred have wonder...
2,1,took down jar shelves passed was labelled ORAN...
3,1,n't say anything even fell top house Which was...
4,1,said aloud getting somewhere centre earth Let ...
5,1,Presently began again wonder fall right earth ...
6,1,( tried curtsey spoke fancy curtseying 're fal...
7,1,was nothing else do Alice soon began talking a...
8,1,are mice air 'm afraid catch bat 's very mouse...
9,1,see n't answer question did n't much matter wa...


In [654]:
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS

In [663]:
stop_words = ENGLISH_STOP_WORDS.union(['alice'])

In [665]:
vectorizer = TfidfVectorizer(max_features=400,
                             lowercase=True,
                             max_df=1.0,
                             min_df=3,
                             use_idf=True,
                             stop_words=stop_words,
                             norm='l2',
                             ngram_range=(1, 1),
                             analyzer='word',
                             decode_error='ignore',
                             strip_accents='unicode'
                             )


matrix = vectorizer.fit_transform(df['doc'])
features = vectorizer.get_feature_names()
       
print matrix.shape 
print features
# vectorize the "docs"
# kmeans and store cluster in df

(330, 400)
[u'added', u'afraid', u'air', u'aloud', u'angrily', u'answer', u'anxiously', u'appeared', u'arm', u'arms', u'ask', u'asked', u'asleep', u'away', u'baby', u'beautiful', u'beg', u'began', u'begin', u'beginning', u'begun', u'believe', u'best', u'better', u'birds', u'bit', u'book', u'bottle', u'box', u'bread', u'bright', u'business', u'butter', u'ca', u'called', u'came', u'cat', u'caterpillar', u'cats', u'certainly', u'change', u'changed', u'cheshire', u'child', u'children', u'chimney', u'close', u'come', u'coming', u'continued', u'conversation', u'cook', u'course', u'court', u'creatures', u'cried', u'croquet', u'curious', u'dance', u'day', u'deal', u'dear', u'deep', u'did', u'different', u'dinah', u'distance', u'dodo', u'does', u'doing', u'door', u'dormouse', u'draw', u'dreadfully', u'dream', u'drink', u'dropped', u'dry', u'duchess', u'eagerly', u'ear', u'eat', u'eggs', u'end', u'everybody', u'evidence', u'exactly', u'exclaimed', u'executed', u'explain', u'eye', u'eyes', u'face

In [666]:
from sklearn.cluster import KMeans
k = len(chps)
km = KMeans(k)
assignments = km.fit_predict(matrix)

print np.unique(assignments)
    


[ 0  1  2  3  4  5  6  7  8  9 10 11]


In [667]:
df['cluster_chaps'] = assignments + 1

In [671]:
print df[df.chap == 12]

    chap                                                doc  cluster_chaps
0     12  [ Sidenote Alice Evidence ] HERE cried Alice q...             11
1     12  Alice looked jury-box saw haste had put Lizard...             11
2     12  Nothing said Alice Nothing whatever persisted ...              8
3     12  were just beginning write down slates White Ra...              8
4     12  read out book Rule persons more mile high leav...              8
5     12  Well sha'n't go rate said Alice besides not re...              8
6     12  's more evidence come yet please Majesty said ...              8
7     12  directed said jurymen is n't directed said Whi...              8
8     12  asked jurymen 're not said White Rabbit 's que...              8
9     12  Please Majesty said Knave did n't write n't pr...              8
10    12  said Alice do n't even know 're Read said King...              8
11    12  asked Begin beginning King said gravely till c...              8
12    12  chance Involved