# Stephen King Novel NLP

## Imports

In [1]:
import pandas as pd
import numpy as np
from IPython import display
import os
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import json
from sklearn.externals import joblib
from statsmodels.nonparametric.smoothers_lowess import lowess
from textblob import TextBlob

display.clear_output(wait=True)
from config import user_name,password,ip
from epub_conversion.utils import open_book

%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt, mpld3
plt.style.use('fivethirtyeight')
plt.rcParams['figure.figsize'] = 20, 10

## Stoplist

In [2]:
stoplist = stopwords.words('english')
stoplist += ['.', ',', '(', ')', "'", '"']
#stoplist = set(stoplist)

## Import Sentiment

In [3]:
sentDF = pd.read_pickle('pickles/sentiment.pkl')

## Functions

### Manual Word Count

In [4]:
def clean_text(row):
    text = row['content'].lower()
    text = text.strip('\n')
    return text

### Cleanup

In [5]:
def cleanup(token, lower = True):
    if lower:
       token = token.lower()
    return token.strip()

### Entity Detection

In [6]:
def ie_preprocess(document):
    document = ' '.join([i for i in document.split() if i not in stoplist])
    sentences = nltk.sent_tokenize(document)
    sentences = [nltk.word_tokenize(sent) for sent in sentences]
    sentences = [nltk.pos_tag(sent) for sent in sentences]
    return sentences

In [7]:
def extract_names(document):
    names = []
    sentences = ie_preprocess(document)
    for tagged_sentence in sentences:
        for chunk in nltk.ne_chunk(tagged_sentence):
            if type(chunk) == nltk.tree.Tree:
                if chunk.label() == 'PERSON':
                    names.append(' '.join([c[0] for c in chunk]))
    return names

### Tokenize and Stem

In [8]:
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

In [9]:
def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

### Sentiment

In [10]:
def a_sentiment(filename):
    with open (filename, "r") as myfile:
        text=myfile.read().replace('\n', ' ')
        blob = TextBlob(text)       
        tot = 0.0
        sent = []        
        afinn = Afinn()
        for sentence in blob.sentences:            
            senti = afinn.score(str(sentence))        
            sent.append(senti)   
        return sent

### Sentiment Plot

In [11]:
def sentiplot(filename, title):
    fig = plt.figure()
    sent= a_sentiment(filename)    
    plt.plot(sent,label=title)
    plt.ylabel("Cumulative Emotional Valence")
    plt.xlabel("Sentence #")    
    plt.legend();


### Plot Stories

In [12]:
def plot_story(title,df=sentDF,f=.1,d=0.0):
    bookDF = df.loc[df['title'] == title]
    sentiment = list(bookDF.sentiment)
    lowX = list(range(1,len(sentiment)+1))
    #print(len(sent),len(lowX))
    low_plot = lowess(sentiment,lowX,frac=.1,return_sorted=False,delta=d)
    #f2 = interp1d(lowX, sentiment, kind='cubic')
    title = title
    #y = np.arange(-1.0,2.0,.1)
    fig = plt.figure()
    plt.plot(low_plot,label=title)
    plt.ylabel("Sentiment Score")
    plt.xlabel("Sentence #")    
    plt.legend();
#     plt.savefig('images/'+title+'.svg')
   
    #mpld3.show()
    return

## Data Processing

**Create a dictionary from all books**

In [13]:
book_list = []
book_dict = {}
path = "/Users/xavier/dev/metis/fletcher/books/"
for file in os.listdir(path):
    if file.endswith(".txt"):
        clean_name = file.replace(" - Stephen King.txt","")
        book_dict[clean_name] = open(path+file, "r").read()
        book_list.append(clean_name)

In [14]:
clean_list = []
for k,v in book_dict.items():
    try:
        year = re.search("[Cc]opyright ©\s*.*_*(\d{4}).*Stephen King|[Cc]opyright ©\s.*Stephen King.*_*(\d{4})|[Cc]opyright ©\s*.*_*(\d{4}).*Richard Bachman|[Cc]opyright ©\s.*Richard Bachman.*_*(\d{4})",v).group(0) # get copyright year from book text
        year = re.search("(\d{4})",year).group(0)
    except:
        year = ""
    try:
        isbn = re.search(".*ISBN+:*(\d*.*)",v)[1].split(" ")
        isbn = max(isbn, key=len)
    except:
        isbn = ""
    try:
        start = v.find('******start_of_file******')+25
        end = v.find('******end_of_file******')
        text = v[start:end]
    except:
        text = ""
        
    doc = {"title":k,"year":year,'isbn':isbn,"content":text}
    clean_list.append(doc)
    #print(doc['title'],doc['isbn'])
    #print(doc['title'],doc['year'])
    #db.books.insert_one(doc)

In [15]:
df = pd.DataFrame(clean_list)

In [16]:
df.to_pickle('pickles/books.pkl')

In [17]:
df.content.iloc[0]

'd by “Duel”\n\nJoe Hill and Stephen King\n\n\n\n\n\nContents\n\nCover\n\n\n\nTitle page\n\n\n\nThrottle\n\n\n\nAbout the Authors\n\n\n\nCredits\n\n\n\nCopyright\n\n\n\nMore from the Authors\n\n\n\nAbout the Publisher\n\n\n\n\n******START_OF_FILE******\nThrottle\n\n\nTHEY RODE WEST FROM THE SLAUGHTER, through the painted desert, and did not stop until they were a hundred miles away. Finally, in the early afternoon, they turned in at a diner with a white stucco exterior and pumps on concrete islands out front. The overlapping thunder of their engines shook the plate-glass windows as they rolled by. They drew up together among parked long-haul trucks, on the west side of the building, and there they put down their kickstands and turned off their bikes.\n\nRace Adamson had led them the whole way, his Harley running sometimes as much as a quarter-mile ahead of anyone else’s. It had been Race’s habit to ride out in front ever since he had returned to them, after two years in the sand. He ra

### Import Pickle

In [18]:
# df = pd.read_pickle('pickles/books.pkl')

In [19]:
df.columns.tolist()

['content', 'isbn', 'title', 'year']

In [20]:
df.head()

Unnamed: 0,content,isbn,title,year
0,d by “Duel”\n\nJoe Hill and Stephen King\n\n\n...,9780062215956,Throttle,2009
1,TS\n\n\n\nCover Page\n\nTitle Page\n\n\n\nIntr...,978-0-385-52884-9,Night Shift,1976
2,this Scribner eBook.\n\n\n\n* * *\n\n\n\nSign...,0-7432-0467-0,Riding the Bullet,2000
3,Page\n\nCopyright Page\n\nDedication\n\n\n\n\...,978-1-101-13813-7,Roadwork,1981
4,dication\n\nIntroduction\n\nAuthor’s Note\n\n\...,978-0-385-52822-1,Salem's Lot,1975


In [21]:
document = df.iloc[28]['content']

In [22]:
# df['content'] = df.content.apply(lambda x: x.lower())
# df['content'] = df.content.apply(lambda x: x.strip("\n"))

### Character extraction test

In [23]:
characters = extract_names(document)

In [24]:
characters[:10]

['Page',
 'Copyright Page',
 'Stephen King',
 'Prince Peter',
 'Flagg',
 'Thomas',
 'Peter',
 'Stephen',
 'Stephen King',
 'Penguin Group']

In [25]:
characters = [s.split(" ") for s in characters]

In [26]:
flat_list = [item for sublist in characters for item in sublist]

In [27]:
flat_list[:10]

['Page',
 'Copyright',
 'Page',
 'Stephen',
 'King',
 'Prince',
 'Peter',
 'Flagg',
 'Thomas',
 'Peter']

In [28]:
characters = set(flat_list)
characters = list(characters)

In [29]:
stoplist = list(stoplist)
stoplist.extend(characters)
stoplist = set(stoplist)

In [30]:
stoplist = [x.lower() for x in stoplist]

In [31]:
df.head()

Unnamed: 0,content,isbn,title,year
0,d by “Duel”\n\nJoe Hill and Stephen King\n\n\n...,9780062215956,Throttle,2009
1,TS\n\n\n\nCover Page\n\nTitle Page\n\n\n\nIntr...,978-0-385-52884-9,Night Shift,1976
2,this Scribner eBook.\n\n\n\n* * *\n\n\n\nSign...,0-7432-0467-0,Riding the Bullet,2000
3,Page\n\nCopyright Page\n\nDedication\n\n\n\n\...,978-1-101-13813-7,Roadwork,1981
4,dication\n\nIntroduction\n\nAuthor’s Note\n\n\...,978-0-385-52822-1,Salem's Lot,1975


### Word Tokenize

In [32]:
df['content'] = df.content.apply(lambda x: word_tokenize(x))

In [33]:
df.head()

Unnamed: 0,content,isbn,title,year
0,"[d, by, “, Duel, ”, Joe, Hill, and, Stephen, K...",9780062215956,Throttle,2009
1,"[TS, Cover, Page, Title, Page, Introduction, b...",978-0-385-52884-9,Night Shift,1976
2,"[this, Scribner, eBook, ., *, *, *, Sign, up, ...",0-7432-0467-0,Riding the Bullet,2000
3,"[Page, Copyright, Page, Dedication, Part, One,...",978-1-101-13813-7,Roadwork,1981
4,"[dication, Introduction, Author, ’, s, Note, ’...",978-0-385-52822-1,Salem's Lot,1975


### Omit Stop Words

In [34]:
df['content'] = df.content.apply(lambda x: [word for word in x if word not in stoplist])

In [35]:
df.head()

Unnamed: 0,content,isbn,title,year
0,"[“, Duel, ”, Joe, Hill, Stephen, King, Content...",9780062215956,Throttle,2009
1,"[TS, Cover, Page, Title, Page, Introduction, J...",978-0-385-52884-9,Night Shift,1976
2,"[Scribner, eBook, *, *, *, Sign, newsletter, r...",0-7432-0467-0,Riding the Bullet,2000
3,"[Page, Copyright, Page, Dedication, Part, One,...",978-1-101-13813-7,Roadwork,1981
4,"[dication, Introduction, Author, ’, Note, ’, S...",978-0-385-52822-1,Salem's Lot,1975


### Convert tokens back to a long string

In [36]:
df['content'] = df.content.apply(lambda x: " ".join(x))

In [37]:
df.head()

Unnamed: 0,content,isbn,title,year
0,“ Duel ” Joe Hill Stephen King Contents Cover ...,9780062215956,Throttle,2009
1,TS Cover Page Title Page Introduction John D. ...,978-0-385-52884-9,Night Shift,1976
2,Scribner eBook * * * Sign newsletter receive s...,0-7432-0467-0,Riding the Bullet,2000
3,Page Copyright Page Dedication Part One - NOVE...,978-1-101-13813-7,Roadwork,1981
4,dication Introduction Author ’ Note ’ SALEM ’ ...,978-0-385-52822-1,Salem's Lot,1975


In [38]:
books=df.content.copy(deep=True)

In [39]:
type(books[0])

str

### Sentence Tokenize

In [40]:
df['content'] = df.content.apply(lambda x: sent_tokenize(x))

In [41]:
df.head()

Unnamed: 0,content,isbn,title,year
0,[“ Duel ” Joe Hill Stephen King Contents Cover...,9780062215956,Throttle,2009
1,[TS Cover Page Title Page Introduction John D....,978-0-385-52884-9,Night Shift,1976
2,[Scribner eBook * * * Sign newsletter receive ...,0-7432-0467-0,Riding the Bullet,2000
3,[Page Copyright Page Dedication Part One - NOV...,978-1-101-13813-7,Roadwork,1981
4,[dication Introduction Author ’ Note ’ SALEM ’...,978-0-385-52822-1,Salem's Lot,1975


### Stem Words

In [42]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [43]:
titles = df.title

In [44]:
df['content'] = df.content.apply(lambda x: [stemmer.stem(word) for word in x])

In [45]:
df.head()

Unnamed: 0,content,isbn,title,year
0,[“ duel ” joe hill stephen king contents cover...,9780062215956,Throttle,2009
1,[ts cover page title page introduction john d....,978-0-385-52884-9,Night Shift,1976
2,[scribner ebook * * * sign newsletter receive ...,0-7432-0467-0,Riding the Bullet,2000
3,[page copyright page dedication part one - nov...,978-1-101-13813-7,Roadwork,1981
4,[dication introduction author ' note ' salem '...,978-0-385-52822-1,Salem's Lot,1975


In [46]:
# books[4][:500]

### Lower Stop List

### Convert sentences to Rows

This will be used for visualizing the sentiment analysis

In [47]:
s = df.apply(lambda x: pd.Series(x['content']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'content'
sentDF = df.drop('content', axis=1).join(s)

In [48]:
sentDF.head()

Unnamed: 0,isbn,title,year,content
0,9780062215956,Throttle,2009,“ duel ” joe hill stephen king contents cover ...
0,9780062215956,Throttle,2009,” vince looked hand arm—lemmy ' let go ; lemmy...
0,9780062215956,Throttle,2009,clarke ' gone so ' money there ' nothing left ...
0,9780062215956,Throttle,2009,” “ you ought find race feels you assuming two...
0,9780062215956,Throttle,2009,” he glanced meaningfully men vince noticed fi...


In [49]:
sentDF.to_pickle('pickles/sentences.pkl')

### Processing Sentences

In [50]:
len(books)

68

In [51]:
titles = sentDF.title
#titles

In [52]:
totalvocab_stemmed = []
totalvocab_tokenized = []
for i in books:
    allwords_stemmed = tokenize_and_stem(i) #for each item in 'synopses', tokenize/stem
    totalvocab_stemmed.extend(allwords_stemmed) #extend the 'totalvocab_stemmed' list
    
    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)

In [55]:
totalvocab_stemmed[:20]

['duel',
 'joe',
 'hill',
 'stephen',
 'king',
 'content',
 'cover',
 'titl',
 'throttl',
 'about',
 'author',
 'credit',
 'copyright',
 'more',
 'author',
 'about',
 'publish',
 '******start_of_file******',
 'throttl',
 'they']