# Stephen King Novel NLP

## Imports

In [1]:
import pandas as pd
import numpy as np
from IPython import display
from pymongo import MongoClient
import os
import re
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from sklearn.decomposition import NMF, LatentDirichletAllocation
from nltk.util import ngrams
import operator
from gensim import corpora, models, similarities, matutils
from sklearn import datasets
import json
import spacy 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import KMeans
from sklearn.externals import joblib

# gensim
from gensim import corpora, models, similarities, matutils
# sklearn
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
# logging for gensim (set to INFO)
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


display.clear_output(wait=True)
from config import user_name,password,ip
from epub_conversion.utils import open_book

## Stoplist

In [2]:
# NLTK's English stop words plus a handful of punctuation tokens, kept as a
# list so later cells can extend it.
stoplist = stopwords.words('english') + ['.', ',', '(', ')', "'", '"']
#stoplist = set(stoplist)

## Functions

### Clean Text

In [3]:
def clean_text(row):
    """Return the lower-cased ``content`` of *row* without edge newlines.

    Args:
        row: Mapping-like record exposing a string under the ``'content'``
            key.

    Returns:
        The content lower-cased, with leading and trailing newline
        characters removed. Interior newlines and other whitespace are
        left untouched.
    """
    return row['content'].lower().strip('\n')

### Manual Word Count

In [4]:
def book_word_count(book,n,stoplist=stoplist):
    """Count the n-grams of a book after stop-word removal.

    Args:
        book: Record with ``'content'`` and ``'title'`` keys.
        n: Size of the n-grams to count (1 = single words, 2 = bigrams, ...).
        stoplist: Collection of tokens to drop before the n-grams are built.
            The default is whatever ``stoplist`` was bound to when this
            function was defined, not its later value.

    Returns:
        ``(title, counts)`` where ``counts`` is a list of
        ``(ngram_tuple, count)`` pairs ordered from most to least frequent.
    """
    # Build the lookup set once: testing membership in a list is a linear
    # scan for every token of the book.
    blocked = frozenset(stoplist)
    # Plain whitespace tokenisation; punctuation stays attached to words.
    words = [w for w in clean_text(book).split() if w not in blocked]
    counter = Counter(ngrams(words, n))
    # most_common() sorts by count, descending, keeping first-seen order
    # for ties.
    return book['title'], counter.most_common()

### Count Vectorizer Function

In [5]:
def book_cv(dtbooks,stoplist):
    """Vectorise documents into word counts and gensim-ready structures.

    Args:
        dtbooks: Sequence of document strings (must be non-empty; the type
            of the first element is printed as a sanity check).
        stoplist: Stop words passed to ``CountVectorizer``. A set is
            accepted and converted to a list.

    Returns:
        A ``(df, corpus, id2word)`` tuple:
        ``df`` is a dense DataFrame with one row per document and one column
        per vocabulary term; ``corpus`` is a ``matutils.Sparse2Corpus`` built
        from the term-by-document matrix; ``id2word`` maps a column index to
        its term.
    """
    # scikit-learn documents stop_words as a list; newer releases reject a
    # set outright, and the notebook turns `stoplist` into one.
    if isinstance(stoplist, (set, frozenset)):
        stoplist = list(stoplist)
    cv = CountVectorizer(stop_words=stoplist,token_pattern=r"\b[a-z][a-z]+\b")
    print(type(dtbooks[0]))
    # Learn the vocabulary and count in a single pass; the matrix is reused
    # below instead of re-transforming the whole corpus.
    x = cv.fit_transform(dtbooks)
    # get_feature_names() was superseded by get_feature_names_out(); use
    # whichever the installed scikit-learn provides.
    get_names = getattr(cv, "get_feature_names_out", None) or cv.get_feature_names
    df = pd.DataFrame(x.toarray(), columns=get_names())
    # gensim expects terms as rows and documents as columns.
    counts = x.transpose()
    print(counts.shape)
    corpus = matutils.Sparse2Corpus(counts)
    id2word = {index: term for term, index in cv.vocabulary_.items()}
    return df,corpus,id2word

### Display Topics

In [56]:
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays that support ``argsort()``.
        feature_names: Sequence mapping a column index to its term.
        no_top_words: How many top terms to print per topic.

    Returns:
        None. For each topic a ``Topic <idx>:`` header line is printed,
        followed by the terms, space separated, heaviest first.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so walk backwards from the end to get the
        # largest weights first.
        top_indices = weights.argsort()[:-no_top_words - 1:-1]
        top_terms = [feature_names[i] for i in top_indices]
        print ("Topic %d:" % (topic_idx))
        print (" ".join(top_terms))
#     topic_words = []
#     for r in model.components_:
#         a = sorted([(v,i) for i,v in enumerate(r)],reverse=True)[0:7]
#         topic_words.append([books[e[1]-1] for e in a])
#     return topic_words

In [7]:
# def display_topics(H, W, feature_names, documents, no_top_words, no_top_documents):
#     for topic_idx, topic in enumerate(H):
#         print("Topic %d:" % (topic_idx))
#         print(" ".join([feature_names[i]
#                         for i in topic.argsort()[:-no_top_words - 1:-1]]))
#         top_doc_indices = np.argsort( W[:,topic_idx] )[::-1][0:no_top_documents]
#         for doc_index in top_doc_indices:
#             print(documents[doc_index])

### Cleanup

In [67]:
def cleanup(token, lower = True):
    """Normalise a single token.

    Args:
        token: The string to normalise.
        lower: When true (the default) the token is lower-cased first.

    Returns:
        The token with surrounding whitespace stripped, lower-cased if
        requested.
    """
    normalised = token.lower() if lower else token
    return normalised.strip()

### Entity Detection

In [12]:
def ie_preprocess(document):
    """Prepare raw text for named-entity chunking.

    Whitespace-separated tokens found in the module-level ``stoplist`` are
    dropped (exact, case-sensitive match), then the remaining text is split
    into sentences, word-tokenised and part-of-speech tagged.

    Args:
        document: Raw text of one document.

    Returns:
        A list with one entry per sentence, each a list of ``(word, tag)``
        pairs as produced by ``nltk.pos_tag``.
    """
    kept_tokens = [tok for tok in document.split() if tok not in stoplist]
    filtered_text = ' '.join(kept_tokens)
    return [
        nltk.pos_tag(nltk.word_tokenize(sentence))
        for sentence in nltk.sent_tokenize(filtered_text)
    ]

In [13]:
def extract_names(document):
    """Collect the PERSON entities NLTK finds in a document.

    Args:
        document: Raw text of one document.

    Returns:
        A list of entity strings in order of appearance, duplicates
        included. A multi-token entity is joined with single spaces.
    """
    names = []
    for tagged_sentence in ie_preprocess(document):
        for chunk in nltk.ne_chunk(tagged_sentence):
            # Ordinary tokens come back as (word, tag) tuples; only named
            # entities are wrapped in a Tree carrying the entity label.
            if isinstance(chunk, nltk.tree.Tree) and chunk.label() == 'PERSON':
                names.append(' '.join(leaf[0] for leaf in chunk))
    return names

### Tokenize and Stem

In [68]:
def tokenize_and_stem(text):
    """Tokenise *text* and return the stem of every token containing a letter.

    Args:
        text: Raw text to process.

    Returns:
        A list of stems produced by the module-level ``stemmer``, in
        document order.
    """
    stems = []
    # Split into sentences first, then words, so that punctuation is caught
    # as its own token.
    for sentence in nltk.sent_tokenize(text):
        for token in nltk.word_tokenize(sentence):
            # Skip tokens without any letter (numbers, raw punctuation).
            if re.search('[a-zA-Z]', token):
                stems.append(stemmer.stem(token))
    return stems

In [69]:
def tokenize_only(text):
    """Tokenise *text* into lower-cased tokens that contain a letter.

    Args:
        text: Raw text to process.

    Returns:
        A list of lower-cased tokens in document order. Tokens with no
        ASCII letter (numbers, raw punctuation) are dropped.
    """
    # Split into sentences first, then words, so that punctuation is caught
    # as its own token.
    return [
        lowered
        for sentence in nltk.sent_tokenize(text)
        for lowered in (word.lower() for word in nltk.word_tokenize(sentence))
        if re.search('[a-zA-Z]', lowered)
    ]

## Data Processing

**Create a dictionary from all books**

In [14]:
# book_list = []
# book_dict = {}
# path = "/Users/xavier/dev/metis/fletcher/books/"
# for file in os.listdir(path):
#     if file.endswith(".txt"):
#         clean_name = file.replace(" - Stephen King.txt","")
#         book_dict[clean_name] = open(path+file, "r").read()
#         book_list.append(clean_name)

In [15]:
# # Insert the books into mongo db
# clean_list = []
# for k,v in book_dict.items():
#     try:
#         year = re.search("[Cc]opyright ©\s*.*_*(\d{4}).*Stephen King|[Cc]opyright ©\s.*Stephen King.*_*(\d{4})|[Cc]opyright ©\s*.*_*(\d{4}).*Richard Bachman|[Cc]opyright ©\s.*Richard Bachman.*_*(\d{4})",v).group(0) # get copyright year from book text
#         year = re.search("(\d{4})",year).group(0)
#     except:
#         year = ""
#     try:
#         isbn = re.search(".*ISBN+:*(\d*.*)",v)[1].split(" ")
#         isbn = max(isbn, key=len)
#     except:
#         isbn = ""
#     try:
#         start = v.find('******start_of_file******')+25
#         end = v.find('******end_of_file******')
#         text = v[start:end]
#     except:
#         text = ""
        
#     doc = {"title":k,"year":year,'isbn':isbn,"content":text}
#     clean_list.append(doc)
#     #print(doc['title'],doc['isbn'])
#     #print(doc['title'],doc['year'])
#     #db.books.insert_one(doc)

In [16]:
# len(clean_list)

In [17]:
# #pd.DataFrame(a, index=['i',])
# df = pd.DataFrame(clean_list)

In [18]:
# df.to_pickle('books.pkl')

In [19]:
# Load the previously pickled books DataFrame from the working directory.
# NOTE: unpickling can execute arbitrary code; only load a 'books.pkl' that
# you created yourself or otherwise trust.
df = pd.read_pickle('books.pkl')

In [20]:
df.columns.tolist()

['content', 'isbn', 'title', 'year']

In [21]:
df.head()

Unnamed: 0,content,isbn,title,year
0,d by “Duel”\n\nJoe Hill and Stephen King\n\n\n...,9780062215956,Throttle,2009
1,TS\n\n\n\nCover Page\n\nTitle Page\n\n\n\nIntr...,978-0-385-52884-9,Night Shift,1976
2,this Scribner eBook.\n\n\n\n* * *\n\n\n\nSign...,0-7432-0467-0,Riding the Bullet,2000
3,Page\n\nCopyright Page\n\nDedication\n\n\n\n\...,978-1-101-13813-7,Roadwork,1981
4,dication\n\nIntroduction\n\nAuthor’s Note\n\n\...,978-0-385-52822-1,Salem's Lot,1975


In [22]:
# Take the raw text of a single book to try entity extraction on.
# NOTE(review): the positional index 28 is hard-coded, so which title it
# selects depends on the row order of the pickle — confirm.
document = df.iloc[28]['content']

In [23]:
# df['content'] = df.content.apply(lambda x: x.lower())
# df['content'] = df.content.apply(lambda x: x.strip("\n"))

In [24]:
# words = [''.join(words) for words in gs_text.split()]
# vectorizer = TfidfVectorizer(stop_words=stop, ngram_range=(1))
# doc_vectors = vectorizer.fit_transform(documents)

### Character extraction test

In [25]:
characters = extract_names(document)

In [26]:
characters

['Penguin Book',
 'Stephen King',
 'Penguin Putnam',
 'Penguin Books',
 'Penguin Putnam',
 'Penguin Putnam',
 'STEPHEN',
 'Carrie',
 'Salem',
 'Christine Pet Sematary Cycle Werewolf',
 'Peter Straub',
 'Dolores Claiborne Insomnia Rose Madder',
 'Wizard Glass Bag Bones',
 'Tom Gordon Dreamcatcher Black House',
 'Peter Straub',
 'Skeleton Crew',
 'Atlantis Everything',
 'Eventual SCREENPLAYS Creepshow Cat',
 'Eye Silver Bullet Maximum Overdrive Pet Sematary Golden',
 'Century',
 'Who',
 'Merrys Pippins',
 'Max Yasgur',
 'Great Woodstock Music Festival',
 'Gandalfs',
 'Tolkien',
 'Stephen Donaldson',
 'Terry Brooks',
 'Tolkien',
 'Tolkien',
 'Tricky Dick Nixon',
 'Mr. Tolkien',
 'Look',
 'Stevie',
 'Nineteen',
 'Bob Seger',
 'Patrol Boy',
 'Bad Lieutenant',
 'Patrol Boy',
 'Mine',
 'Stephen',
 'God',
 'Tolkien',
 'Pall Malls',
 'Patrol Boy',
 'Maine',
 'Sergio Leone',
 'Bad',
 'Ugly',
 'Tolkien',
 'Leone',
 'Clint',
 'Lee Van Cleef',
 'Wizard Glass',
 'Leone',
 'Phoenix',
 'Seems',
 'Patr

In [27]:
# Fold the extracted character names into the stop list; converting to a set
# also removes duplicates.
# NOTE(review): the names are added verbatim (mixed case, often multi-word,
# e.g. 'Peter Straub'), whereas the text is later stemmed and vectorised
# token by token — these entries probably never match a token. Confirm, and
# consider adding lower-cased (and stemmed) single words instead.
stoplist = list(stoplist)
stoplist.extend(characters)
stoplist = set(stoplist)

In [28]:
print(stoplist)

{'STEPHEN', 'Aileen Ritter', 'Wolfe Look Homeward', 'Man Jesus', 'under', 'itself', 'didn', 'above', 'Wit', 'Already', 'John Farson', 'Clean', 'Hello', 'Jamie DeCurry', 'is', 'from', 'Shaw', 'Penguin Books', 'Thomas Jamie', 'Damascus', 'Peddler', 'Bathsheba', 'Phoenix', 'Sylvia Pittston', 'than', 'High Speech', 'Susan', 'Bad Lieutenant', 'Further', 'Reap', 'against', 'Old', 'in', 'Cuthbert Allgood', 'Seems', 'your', 'did', 'just', 'Maine', 'Hard', 'don', 'was', 'Coherent', 'with', 'what', 'do', 'Salem', 'Mark', 'they', 'not', 'Answer', 'Jesus', 'down', 'Stephen Donaldson', 'God Tull', 'aren', 'Zoltan', 'Slow Mutants', 'them', 'Mother', 'Chaucer', 'Hendrickson', 'Gather', 'Jake', 'Feast Reaptide', 'Leone', 'Roland', 'very', 'me', 'Milky Way', 'Cook', 'Wind', 'None', 'Scribner', 'Rub', 'Tolkien', 'couldn', 'Coffee Thermos', 'Beer', 'Dinh Gilead', 'Demon', 'Make', 'Lee Van Cleef', 'Has Cort', 'Blue Haven Heaven', 'themselves', 'Worlds', 'Suppose', 'until', 'LeMark', 'Cuthbert Jamie', 'Gan

### Word Tokenize

In [29]:
# Replace each book's raw text with its list of NLTK word tokens.
df['content'] = df['content'].apply(word_tokenize)

In [30]:
df.head()

Unnamed: 0,content,isbn,title,year
0,"[d, by, “, Duel, ”, Joe, Hill, and, Stephen, K...",9780062215956,Throttle,2009
1,"[TS, Cover, Page, Title, Page, Introduction, b...",978-0-385-52884-9,Night Shift,1976
2,"[this, Scribner, eBook, ., *, *, *, Sign, up, ...",0-7432-0467-0,Riding the Bullet,2000
3,"[Page, Copyright, Page, Dedication, Part, One,...",978-1-101-13813-7,Roadwork,1981
4,"[dication, Introduction, Author, ’, s, Note, ’...",978-0-385-52822-1,Salem's Lot,1975


### Omit Stop Words

In [31]:
#df['content'] = df.content.apply(lambda x: [word for word in x if word not in stoplist])

In [32]:
df.head()

Unnamed: 0,content,isbn,title,year
0,"[d, by, “, Duel, ”, Joe, Hill, and, Stephen, K...",9780062215956,Throttle,2009
1,"[TS, Cover, Page, Title, Page, Introduction, b...",978-0-385-52884-9,Night Shift,1976
2,"[this, Scribner, eBook, ., *, *, *, Sign, up, ...",0-7432-0467-0,Riding the Bullet,2000
3,"[Page, Copyright, Page, Dedication, Part, One,...",978-1-101-13813-7,Roadwork,1981
4,"[dication, Introduction, Author, ’, s, Note, ’...",978-0-385-52822-1,Salem's Lot,1975


### Stem Words

In [33]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [34]:
# Stem every token of every book with the Snowball stemmer.
df['content'] = df['content'].apply(lambda tokens: list(map(stemmer.stem, tokens)))

In [35]:
df.head()

Unnamed: 0,content,isbn,title,year
0,"[d, by, “, duel, ”, joe, hill, and, stephen, k...",9780062215956,Throttle,2009
1,"[ts, cover, page, titl, page, introduct, by, j...",978-0-385-52884-9,Night Shift,1976
2,"[this, scribner, ebook, ., *, *, *, sign, up, ...",0-7432-0467-0,Riding the Bullet,2000
3,"[page, copyright, page, dedic, part, one, -, n...",978-1-101-13813-7,Roadwork,1981
4,"[dicat, introduct, author, ’, s, note, ’, sale...",978-0-385-52822-1,Salem's Lot,1975


### Convert tokens back to a long string

In [36]:
# Glue each book's tokens back into one space-separated string, the input
# format the vectorizers below expect.
df['content'] = df['content'].apply(" ".join)

In [37]:
df.head()

Unnamed: 0,content,isbn,title,year
0,d by “ duel ” joe hill and stephen king conten...,9780062215956,Throttle,2009
1,ts cover page titl page introduct by john d. m...,978-0-385-52884-9,Night Shift,1976
2,this scribner ebook . * * * sign up for our ne...,0-7432-0467-0,Riding the Bullet,2000
3,page copyright page dedic part one - novemb no...,978-1-101-13813-7,Roadwork,1981
4,dicat introduct author ’ s note ’ salem ’ s lo...,978-0-385-52822-1,Salem's Lot,1975


In [64]:
titles = df.title

In [65]:
books = df.content

In [66]:
books[4][:500]

'dicat introduct author ’ s note ’ salem ’ s lot prologu part one the marsten hous chapter one ben ( i ) chapter two susan ( i ) chapter three the lot ( i ) chapter four danni glick and other chapter five ben ( ii ) chapter six the lot ( ii ) chapter seven matt part two the emperor of ice cream chapter eight ben ( iii ) chapter nine susan ( ii ) chapter ten the lot ( iii ) chapter eleven ben ( iv ) chapter twelv mark chapter thirteen father callahan part three the desert villag chapter fourteen t'

## Model Fits

### Count Vectorizer

In [40]:
# Bag-of-words counts for every book. `stoplist` is a set by this point in
# the notebook, but scikit-learn documents stop_words as a list (newer
# releases reject a set), so convert explicitly.
cv = CountVectorizer(stop_words=list(stoplist))
x = cv.fit_transform(books)
cv_feature_names = cv.get_feature_names()
#x = cv.transform(books)

In [41]:
x.shape

(68, 58142)

In [42]:
# x_back = x.toarray()

In [43]:
# bookDF = pd.DataFrame(x_back, columns=cv.get_feature_names())

In [44]:
# bookDF.head()

### TFIDF Vectorizer

In [45]:
# TF-IDF weights for every book. `stoplist` is a set by this point in the
# notebook, but scikit-learn documents stop_words as a list (newer releases
# reject a set), so convert explicitly.
tf = TfidfVectorizer(stop_words=list(stoplist))
x2 = tf.fit_transform(books)
tf_feature_names = tf.get_feature_names()

### Cosine Similarity

In [46]:
# Turn the pairwise cosine similarity between the TF-IDF rows into a
# distance (1 - similarity).
dist = 1 - cosine_similarity(x2)

### K-Means

In [47]:
# Group the TF-IDF vectors into a fixed number of k-means clusters and keep
# the cluster label of each book as a plain Python list.
num_clusters = 5
km = KMeans(n_clusters=num_clusters).fit(x2)
clusters = km.labels_.tolist()

#### Pickle Kmeans Model

In [48]:
# joblib.dump(km, 'doc_cluster.pkl')
# km = joblib.load('doc_cluster.pkl')
# clusters = km.labels_.tolist()

### NMF

#### With Count Vectorizer

In [49]:
# Factor the raw count matrix `x` into 20 NMF components, randomly
# initialised; `fit` holds the transformed data returned by fit_transform.
# NOTE(review): no random_state is passed, so re-running may produce
# different topics — confirm whether reproducibility matters here.
nmf = NMF(n_components=20, init='random')
fit = nmf.fit_transform(x)

In [60]:
display_topics(nmf,cv_feature_names,10)

Topic 0:
said one look like would back could sam time thought
Topic 1:
said bill look one like back richi would ben eddi
Topic 2:
jack said richard wolf look like back one go hand
Topic 3:
roland said one look like back would hand susan could
Topic 4:
said one look go back barbi big like would jim
Topic 5:
jonesi henri one like look back said gray owen kurtz
Topic 6:
bobbi like said look one garden go back would could
Topic 7:
said alan one look like would back gaunt go hand
Topic 8:
said like would look one go back stu man could
Topic 9:
like look one rosi back could would norman hand thought
Topic 10:
roland eddi said jake one susannah look like would time
Topic 11:
clay said tom one look jordan like alic go man
Topic 12:
said one go like would look back andi dan could
Topic 13:
johnni said look like one back david go hand steve
Topic 14:
ralph look loi like said one back hand go could
Topic 15:
said one like back look go would could know time
Topic 16:
would peter flagg one said tho

#### With TFIDF

In [57]:
# Same 20-component, randomly initialised NMF, this time on the TF-IDF
# matrix `x2`: the model is fitted first, then used to transform that same
# data.
nmf2 = NMF(n_components=20, init='random').fit(x2)
fit2 = nmf2.transform(x2)

In [58]:
# no_top_words = 5
# no_top_documents = 2

In [59]:
display_topics(nmf2,tf_feature_names,5)

Topic 0:
said one like look back
Topic 1:
hodg jerom morri bradi say
Topic 2:
flagg peyna denni peter beson
Topic 3:
lisey scott amanda dooley one
Topic 4:
jack richard said counting like
Topic 5:
abra kid like one danni
Topic 6:
loui jud rachel gage said
Topic 7:
ralph loi look like mcgovern
Topic 8:
bobbi ever said kaz one
Topic 9:
garrati mcvri stebbin olson said
Topic 10:
jonesi kurtz duddit henri owen
Topic 11:
jessi like one would gerald
Topic 12:
roland eddi jake said susannah
Topic 13:
tess streeter would ramona said
Topic 14:
ginelli halleck billi heidi said
Topic 15:
wesley ur robbi said kindl
Topic 16:
vinc lemmi said stephani dave
Topic 17:
becki cal grass demuth tobin
Topic 18:
perci coffey delacroix wharton said
Topic 19:
trisha like one back look


### LDA

#### With Count Vectorizer

In [97]:
# Separate count vectorizer for LDA. scikit-learn documents stop_words as a
# list, so the set is converted explicitly.
cv2 = CountVectorizer(stop_words=list(stoplist))
x3 = cv2.fit_transform(books)
# Read the vocabulary from cv2 — the vectorizer that actually produced x3 —
# rather than from the earlier `cv`.
cv2_feature_names = cv2.get_feature_names()

In [98]:
# `n_topics` was deprecated in favour of `n_components` (and later removed);
# `n_components` also matches the NMF calls above.
lda = LatentDirichletAllocation(n_components=20, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(x3)



In [99]:
display_topics(lda,cv2_feature_names,5)

Topic 0:
said one like back could
Topic 1:
said one like back would
Topic 2:
said one like would time
Topic 3:
said one back would like
Topic 4:
said one back like would
Topic 5:
said like back would one
Topic 6:
said like one could would
Topic 7:
said one like could back
Topic 8:
said back one like would
Topic 9:
like one said would could
Topic 10:
said one back would get
Topic 11:
one said like back would
Topic 12:
said one like back could
Topic 13:
said back one could would
Topic 14:
said like one back could
Topic 15:
said one like back would
Topic 16:
one would like said back
Topic 17:
said one like would back
Topic 18:
one back said like would
Topic 19:
said one like would could


In [45]:
# lda = models.LdaModel(corpus=corp, num_topics=10, id2word=id2word, passes=10)

2017-11-02 17:40:37,750 : INFO : using symmetric alpha at 0.1
2017-11-02 17:40:37,751 : INFO : using symmetric eta at 2.13193251154e-06
2017-11-02 17:40:37,826 : INFO : using serial LDA version on this node
2017-11-02 17:40:58,603 : INFO : running online (multi-pass) LDA training, 10 topics, 10 passes over the supplied corpus of 7 documents, updating model once every 7 documents, evaluating perplexity every 7 documents, iterating 50x with a convergence threshold of 0.001000
2017-11-02 17:41:47,044 : INFO : -17.591 per-word bound, 197481.1 perplexity estimate based on a held-out corpus of 7 documents with 1304555 words
2017-11-02 17:41:47,046 : INFO : PROGRESS: pass 0, at document #7/7
2017-11-02 17:41:48,766 : INFO : topic #7 (0.100): 0.005*"said" + 0.005*"roland" + 0.004*"eddie" + 0.003*"like" + 0.003*"jake" + 0.003*"one" + 0.003*"would" + 0.002*"back" + 0.002*"susannah" + 0.002*"could"
2017-11-02 17:41:48,772 : INFO : topic #1 (0.100): 0.006*"said" + 0.004*"roland" + 0.004*"one" + 0.

2017-11-02 17:44:00,647 : INFO : topic diff=0.267182, rho=0.377964
2017-11-02 17:44:23,423 : INFO : -11.579 per-word bound, 3060.3 perplexity estimate based on a held-out corpus of 7 documents with 1304555 words
2017-11-02 17:44:23,424 : INFO : PROGRESS: pass 6, at document #7/7
2017-11-02 17:44:30,039 : INFO : topic #7 (0.100): 0.001*"said" + 0.001*"roland" + 0.001*"would" + 0.001*"eddie" + 0.001*"one" + 0.001*"like" + 0.001*"jake" + 0.001*"susannah" + 0.000*"back" + 0.000*"could"
2017-11-02 17:44:30,044 : INFO : topic #3 (0.100): 0.000*"said" + 0.000*"eddie" + 0.000*"one" + 0.000*"like" + 0.000*"roland" + 0.000*"would" + 0.000*"jake" + 0.000*"back" + 0.000*"susannah" + 0.000*"could"
2017-11-02 17:44:30,048 : INFO : topic #4 (0.100): 0.000*"roland" + 0.000*"said" + 0.000*"eddie" + 0.000*"one" + 0.000*"would" + 0.000*"could" + 0.000*"like" + 0.000*"know" + 0.000*"jake" + 0.000*"gunslinger"
2017-11-02 17:44:30,052 : INFO : topic #8 (0.100): 0.006*"said" + 0.006*"roland" + 0.004*"one" + 

In [46]:
# lda.print_topics()

2017-11-02 17:45:59,374 : INFO : topic #0 (0.100): 0.005*"gunslinger" + 0.003*"said" + 0.003*"boy" + 0.003*"man" + 0.002*"one" + 0.002*"like" + 0.002*"would" + 0.002*"back" + 0.002*"black" + 0.001*"jake"
2017-11-02 17:45:59,380 : INFO : topic #1 (0.100): 0.000*"said" + 0.000*"roland" + 0.000*"one" + 0.000*"would" + 0.000*"like" + 0.000*"eddie" + 0.000*"back" + 0.000*"could" + 0.000*"looked" + 0.000*"jake"
2017-11-02 17:45:59,384 : INFO : topic #2 (0.100): 0.005*"said" + 0.004*"eddie" + 0.004*"roland" + 0.003*"one" + 0.003*"susannah" + 0.003*"mia" + 0.002*"like" + 0.002*"would" + 0.002*"jake" + 0.002*"back"
2017-11-02 17:45:59,388 : INFO : topic #3 (0.100): 0.000*"said" + 0.000*"eddie" + 0.000*"one" + 0.000*"like" + 0.000*"roland" + 0.000*"would" + 0.000*"jake" + 0.000*"back" + 0.000*"susannah" + 0.000*"could"
2017-11-02 17:45:59,392 : INFO : topic #4 (0.100): 0.000*"roland" + 0.000*"said" + 0.000*"eddie" + 0.000*"one" + 0.000*"would" + 0.000*"could" + 0.000*"like" + 0.000*"know" + 0.00

[(0,
  '0.005*"gunslinger" + 0.003*"said" + 0.003*"boy" + 0.003*"man" + 0.002*"one" + 0.002*"like" + 0.002*"would" + 0.002*"back" + 0.002*"black" + 0.001*"jake"'),
 (1,
  '0.000*"said" + 0.000*"roland" + 0.000*"one" + 0.000*"would" + 0.000*"like" + 0.000*"eddie" + 0.000*"back" + 0.000*"could" + 0.000*"looked" + 0.000*"jake"'),
 (2,
  '0.005*"said" + 0.004*"eddie" + 0.004*"roland" + 0.003*"one" + 0.003*"susannah" + 0.003*"mia" + 0.002*"like" + 0.002*"would" + 0.002*"jake" + 0.002*"back"'),
 (3,
  '0.000*"said" + 0.000*"eddie" + 0.000*"one" + 0.000*"like" + 0.000*"roland" + 0.000*"would" + 0.000*"jake" + 0.000*"back" + 0.000*"susannah" + 0.000*"could"'),
 (4,
  '0.000*"roland" + 0.000*"said" + 0.000*"eddie" + 0.000*"one" + 0.000*"would" + 0.000*"could" + 0.000*"like" + 0.000*"know" + 0.000*"jake" + 0.000*"gunslinger"'),
 (5,
  '0.006*"roland" + 0.005*"said" + 0.005*"eddie" + 0.005*"one" + 0.004*"jake" + 0.003*"like" + 0.003*"would" + 0.003*"could" + 0.003*"back" + 0.002*"susannah"'),
 (6

In [47]:
# # Transform the docs from the word space to the topic space (like "transform" in sklearn)
# lda_corpus = lda[corp]
# lda_docs = [doc for doc in lda_corpus]

In [48]:
# lda_docs[:15]

[[(2, 0.99999309930389702)],
 [(8, 0.99999659776824634)],
 [(5, 0.99999504978884213)],
 [(0, 0.9999867660232642)],
 [(5, 0.80863682693916183), (9, 0.19135691336193036)],
 [(5, 0.9999967903172694)],
 [(8, 0.99999642158611446)]]

In [49]:
# lda.log_perplexity

<bound method LdaModel.log_perplexity of <gensim.models.ldamodel.LdaModel object at 0x1a29cd38d0>>

In [50]:
# corp2,id2word2 = book_cv(all_text,stoplist)

<class 'str'>
(2680972, 67)


In [52]:
# lda2 = models.LdaModel(corpus=corp2, num_topics=10, id2word=id2word2, passes=10)

2017-11-02 18:59:21,423 : INFO : using symmetric alpha at 0.1
2017-11-02 18:59:21,425 : INFO : using symmetric eta at 3.72999046614e-07
2017-11-02 18:59:21,803 : INFO : using serial LDA version on this node
2017-11-02 19:01:16,931 : INFO : running online (multi-pass) LDA training, 10 topics, 10 passes over the supplied corpus of 67 documents, updating model once every 67 documents, evaluating perplexity every 67 documents, iterating 50x with a convergence threshold of 0.001000
2017-11-02 19:06:27,225 : INFO : -18.245 per-word bound, 310611.4 perplexity estimate based on a held-out corpus of 67 documents with 10236337 words
2017-11-02 19:06:27,227 : INFO : PROGRESS: pass 0, at document #67/67
2017-11-02 19:06:41,693 : INFO : topic #0 (0.100): 0.005*"said" + 0.003*"would" + 0.003*"like" + 0.003*"one" + 0.002*"time" + 0.002*"could" + 0.002*"back" + 0.002*"looked" + 0.002*"go" + 0.002*"way"
2017-11-02 19:06:41,733 : INFO : topic #5 (0.100): 0.005*"said" + 0.004*"one" + 0.003*"like" + 0.003

2017-11-02 19:23:20,101 : INFO : topic diff=0.286577, rho=0.377964
2017-11-02 19:26:11,274 : INFO : -12.282 per-word bound, 4981.2 perplexity estimate based on a held-out corpus of 67 documents with 10236337 words
2017-11-02 19:26:11,276 : INFO : PROGRESS: pass 6, at document #67/67
2017-11-02 19:26:53,785 : INFO : topic #9 (0.100): 0.000*"said" + 0.000*"one" + 0.000*"like" + 0.000*"back" + 0.000*"could" + 0.000*"would" + 0.000*"know" + 0.000*"thought" + 0.000*"right" + 0.000*"looked"
2017-11-02 19:26:53,833 : INFO : topic #2 (0.100): 0.003*"said" + 0.002*"one" + 0.002*"dan" + 0.002*"like" + 0.002*"back" + 0.001*"would" + 0.001*"could" + 0.001*"abra" + 0.001*"know" + 0.001*"little"
2017-11-02 19:26:53,871 : INFO : topic #7 (0.100): 0.000*"said" + 0.000*"one" + 0.000*"back" + 0.000*"would" + 0.000*"thought" + 0.000*"could" + 0.000*"like" + 0.000*"know" + 0.000*"right" + 0.000*"see"
2017-11-02 19:26:53,916 : INFO : topic #3 (0.100): 0.001*"said" + 0.001*"wesley" + 0.001*"one" + 0.000*"li

In [53]:
# lda2.print_topics()

2017-11-02 19:37:40,060 : INFO : topic #0 (0.100): 0.005*"roland" + 0.004*"said" + 0.004*"one" + 0.003*"would" + 0.002*"like" + 0.002*"back" + 0.002*"eddie" + 0.002*"could" + 0.002*"susannah" + 0.002*"jake"
2017-11-02 19:37:40,086 : INFO : topic #1 (0.100): 0.005*"said" + 0.004*"one" + 0.003*"like" + 0.003*"back" + 0.003*"would" + 0.002*"could" + 0.002*"know" + 0.002*"little" + 0.002*"looked" + 0.002*"time"
2017-11-02 19:37:40,132 : INFO : topic #2 (0.100): 0.002*"said" + 0.002*"dan" + 0.002*"one" + 0.002*"like" + 0.001*"back" + 0.001*"abra" + 0.001*"would" + 0.001*"could" + 0.001*"know" + 0.001*"little"
2017-11-02 19:37:40,180 : INFO : topic #3 (0.100): 0.001*"wesley" + 0.000*"kindle" + 0.000*"robbie" + 0.000*"said" + 0.000*"ur" + 0.000*"one" + 0.000*"like" + 0.000*"ellen" + 0.000*"back" + 0.000*"know"
2017-11-02 19:37:40,220 : INFO : topic #4 (0.100): 0.003*"said" + 0.002*"one" + 0.002*"danny" + 0.002*"like" + 0.002*"back" + 0.002*"would" + 0.002*"jack" + 0.001*"could" + 0.001*"time"

[(0,
  '0.005*"roland" + 0.004*"said" + 0.004*"one" + 0.003*"would" + 0.002*"like" + 0.002*"back" + 0.002*"eddie" + 0.002*"could" + 0.002*"susannah" + 0.002*"jake"'),
 (1,
  '0.005*"said" + 0.004*"one" + 0.003*"like" + 0.003*"back" + 0.003*"would" + 0.002*"could" + 0.002*"know" + 0.002*"little" + 0.002*"looked" + 0.002*"time"'),
 (2,
  '0.002*"said" + 0.002*"dan" + 0.002*"one" + 0.002*"like" + 0.001*"back" + 0.001*"abra" + 0.001*"would" + 0.001*"could" + 0.001*"know" + 0.001*"little"'),
 (3,
  '0.001*"wesley" + 0.000*"kindle" + 0.000*"robbie" + 0.000*"said" + 0.000*"ur" + 0.000*"one" + 0.000*"like" + 0.000*"ellen" + 0.000*"back" + 0.000*"know"'),
 (4,
  '0.003*"said" + 0.002*"one" + 0.002*"danny" + 0.002*"like" + 0.002*"back" + 0.002*"would" + 0.002*"jack" + 0.001*"could" + 0.001*"time" + 0.001*"know"'),
 (5,
  '0.004*"said" + 0.004*"one" + 0.003*"like" + 0.003*"back" + 0.002*"would" + 0.002*"could" + 0.002*"time" + 0.002*"know" + 0.002*"little" + 0.002*"right"'),
 (6,
  '0.005*"said" 

In [54]:
# print(stoplist)

{'shan', 'too', 'wasn', 'under', 'about', 'than', 'can', 'further', 'above', 'o', 't', 'herself', 'again', 'at', 'was', 'didn', 'we', 'isn', 'does', 'me', 'i', 'couldn', 'the', 'been', 'she', 'her', "'", 'as', 'those', 'then', 'until', 'these', 'when', 'both', 'y', 'before', 'they', 'same', 're', ')', 'each', 'were', 'are', 'mightn', 'mustn', 'nor', 'very', 'of', 'how', 'don', 'any', 'ours', 'your', 'there', 'our', 'm', 'being', 'aren', 'do', 'so', 'am', 'some', 'such', '.', 'did', 'up', 'through', 'not', 'myself', 'you', 'should', 'between', 'out', 'where', 'to', 'weren', 'for', 's', 'haven', 'yourself', 'whom', 'down', '(', 'from', 'my', 'yours', 'ain', 'is', 'yourselves', 'their', 'which', 'what', 've', 'had', 'needn', 'himself', 'more', 'an', 'few', 'here', 'a', 'but', 'below', 'if', '"', 'why', 'doing', 'all', 'on', 'itself', 'just', 'hadn', 'them', 'own', 'hasn', 'only', 'will', 'wouldn', 'be', 'him', ',', 'shouldn', 'by', 'against', 'with', 'it', 'into', 'he', 'theirs', 'ourselv

## Scrap

Test splitting the text to keep only the story content

In [95]:
# start = gs_text.find('******start_of_file******')+25
# end = gs_text.find('******end_of_file******')
# gs_text = gs_text[start:end]

In [105]:
# gs_text[-20:-1]

'ble final battle.\n\n'

In [36]:
# words = [''.join(words) for words in gs_text.split()]

In [52]:
# Count n-grams (n = 1, i.e. single words) of the already-split `words`
# after dropping stop words, then rank them by frequency.
n = 1
words = [w for w in words if w not in stoplist]
bigrams = ngrams(words, n)
counter = Counter(bigrams)

sorted_counter = counter.most_common()
# for word, count in gBlob.word_counts.items():
#     print("%15s %i" % (word, count))

In [53]:
sorted_counter

[(('gunslinger',), 531),
 (('man',), 243),
 (('boy',), 236),
 (('one',), 218),
 (('like',), 201),
 (('would',), 186),
 (('“i',), 169),
 (('black',), 156),
 (('said.',), 151),
 (('looked',), 147),
 (('back',), 143),
 (('could',), 136),
 (('said',), 129),
 (('jake',), 120),
 (('made',), 103),
 (('don’t',), 99),
 (('even',), 92),
 (('him.',), 92),
 (('it.',), 90),
 (('him,',), 89),
 (('eyes',), 88),
 (('seemed',), 87),
 (('came',), 85),
 (('roland',), 85),
 (('face',), 83),
 (('went',), 82),
 (('felt',), 81),
 (('might',), 80),
 (('way',), 77),
 (('still',), 77),
 (('know',), 76),
 (('time',), 75),
 (('never',), 74),
 (('come',), 74),
 (('“you',), 73),
 (('see',), 71),
 (('first',), 69),
 (('two',), 69),
 (('thought',), 68),
 (('cort',), 68),
 (('began',), 66),
 (('hands',), 66),
 (('said,',), 66),
 (('long',), 65),
 (('it,',), 65),
 (('last',), 65),
 (('go',), 64),
 (('almost',), 63),
 (('head',), 60),
 (('didn’t',), 59),
 (('saw',), 59),
 (('perhaps',), 59),
 (('old',), 58),
 (('three',

In [70]:
# Fetch every document from the `books` collection (empty filter).
# NOTE(review): `db` is not defined anywhere in the visible notebook —
# presumably a pymongo database handle created in a cell that was removed;
# confirm before re-running.
all_books = db.books.find({})

In [7]:
# df = pd.DataFrame(columns=['title','isbn','text'])
# for book in all_books:
#     df2 = 

In [8]:
# book_counts = {}
# for book in all_books:
#     title,counts = book_word_count(book,1)
#     book_counts[title] = counts

In [9]:
# clean_books = []
# for book in all_books:
#     text = clean_text(book)
#     clean_books.append(text)

In [10]:
# book_list = sorted(book_list)

In [11]:
# Titles used to pick out the Dark Tower books; the commented-out filter
# below compares them against each book's 'title' field.
dt_books = ['Gunslinger, The','Drawing of the Three, The','Waste Lands, The',
            'Wizard and Glass','Wolves of the Calla','Song of Susannah','Dark Tower, The']

In [30]:
print(dt_books)

['Gunslinger, The', 'Drawing of the Three, The', 'Waste Lands, The', 'Wizard and Glass', 'Wolves of the Calla', 'Song of Susannah', 'Dark Tower, The']


In [12]:
# all_books = db.books.find({})

In [13]:
# all_text = []
# for book in all_books:
#     text = clean_text(book)
#     all_text.append(text)

In [5]:
# dt_text = []
# for book in all_books:
#     if book['title'] in dt_books:
#         text  = clean_text(book)
#         name = book['title']
#         print(book['title'])
#         dt_text.append(text)
#     #else:
#         #print("Haha Fuck you!")

In [64]:
len(dt_text)

7

In [69]:
# Vectorise the Dark Tower texts into a count DataFrame, a gensim corpus and
# an id-to-word mapping.
# NOTE(review): `dt_text` is only built in a commented-out cell above, so
# this relies on earlier kernel state — confirm before re-running.
# NOTE: this rebinds `df`, replacing the books DataFrame loaded earlier.
df,corp,id2word = book_cv(dt_text,stoplist)

<class 'str'>
(29598, 7)


In [None]:
df.shape