In [None]:
!pip install spacy
!pip install nltk
#have to install the 'en_core_web_sm' model below like this because something about Jupyter Notebook
!pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz

In [None]:
import spacy
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
import re
from bs4 import BeautifulSoup
from contractions import CONTRACTION_MAP
import unicodedata
import en_core_web_sm

In [None]:
nlp = en_core_web_sm.load()
#nlp = spacy.load('en_core', parse = True, tag=True, entity=True) (DJ'S code)
#nlp_vec = spacy.load('en_vecs', parse = True, tag=True, entity=True) (DJ's code)
tokenizer = ToktokTokenizer()
##needed to use the comment out line below to get the stopwords
#nltk.download('stopwords') 
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')

In [None]:
print(nlp)

In [None]:
print(stopword_list)

In [None]:
#function to remove accented characters
def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text
#test the function with example
remove_accented_chars('Sómě Áccěntěd těxt')

In [None]:
#function to expand contractions

def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text
#example
expand_contractions("Y'all can't expand contractions I'd think")

In [None]:
#function to remove special characters

def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-z0-9\s]' if not remove_digits else r'[^a-zA-z\s]'
    text = re.sub(pattern, '', text)
    return text
#example
remove_special_characters("Well this was fun! What do you think? 123#@!", 
                          remove_digits=True)

In [None]:
#function to find stem words
def simple_stemmer(text):
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text
#example
simple_stemmer("My system keeps crashing his crashed yesterday, ours crashes daily")

In [None]:
#function for lemmatization
def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text
#example
lemmatize_text("My system keeps crashing! his crashed yesterday, ours crashes daily")

In [None]:
#function to remove stopwords
def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text
#example
remove_stopwords("The, and, if are stopwords, computer is not")

In [None]:
#Bringing it all together — Building a Text Normalizer
def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=True, 
                     text_lemmatization=True, special_char_removal=True, 
                     stopword_removal=True, remove_digits=True):
    
    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # strip HTML
        if html_stripping:
            doc = strip_html_tags(doc)
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # expand contractions    
        if contraction_expansion:
            doc = expand_contractions(doc)
        # lowercase the text    
        if text_lower_case:
            doc = doc.lower()
        # remove extra newlines
        doc = re.sub(r'[\r|\n|\r\n]+', ' ',doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters and\or digits    
        if special_char_removal:
            # insert spaces between special characters to isolate them    
            special_char_pattern = re.compile(r'([{.(-)!}])')
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)  
        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)
            
        normalized_corpus.append(doc)
        
    return normalized_corpus

In [None]:
pwd

In [None]:
os.chdir('C:/Users/adamj/Documents/Github/Springboard/Project/Books-for-Project')

In [None]:
pwd

In [None]:
os.listdir() #see files in directory

In [None]:
#open all books to be analyzed (10 bestsellers; 10 nonbestsellers)
Beautiful_Disaster=open('Beautiful Disaster by Jamie McGuire_BS.txt',"r",encoding="utf8")
Breaching_His_Defenses=open('Breaching-His-Defenses by Allyson Lindt_NBS.txt',"r",encoding="utf8")
Desired=open('Desired by Alisa Woods_NBS.txt',"r",encoding="utf8")
Fearsome=open('Fearsome by S.A. Wolfe_NBS.txt',"r",encoding="utf8")
Fifty_Shades_Darker=open('Fifty Shades Darker by EL James_BS.txt',"r",encoding="utf8")
Fifty_Shades_of_Grey=open('Fifty Shades of Grey by EL James_BS.txt',"r",encoding="utf8")
Finding_Master_Right=open('Finding Master Right by Sparrow Beckett_NBS.txt',"r",encoding="utf8")
Healing_Her_Heart=open('Healing-Her-Heart by Laura Scott_NBS.txt',"r",encoding="utf8")
Hopeless=open('Hopeless by Colleen Hoover_BS.txt',"r",encoding="utf8")
Lilas_Loves=open("Lila's Loves by Laylah Roberts_NBS.txt","r",encoding="utf8")
Lolita=open("Lolita by Vladamir Norbakov_BS.txt","r",encoding="utf8")
Mademoiselle_at_Arms=open("Mademoiselle-At-Arms by Bailey Elizabeth_NBS.txt","r",encoding="utf8")
Me_Before_You=open("Me Before You by Jojo Moyes_BS.txt","r",encoding="utf8")
Mothers_Black_Book=open("Mother's-Black-Book by H.H. Fowler_NBS.txt","r",encoding="utf8")
Outlander=open("Outlander by Diana Gabaldon_BS.txt","r",encoding="utf8")
Taking_Chances=open('Taking-Chances by Ann Omasta_NBS.txt',"r",encoding="utf8")
The_Fault_in_Our_Stars=open('The Fault in Our Stars by John Green_BS.txt',"r",encoding="utf8")
The_Notebook=open('The Notebook by Nicholas Sparks_BS.txt',"r",encoding="utf8")
The_Time_Travelers_Wife=open("The Time Traveler's Wife by Audrey Niffenager_BS.txt","r",encoding="utf8")
The_Titan_Drowns_Time_Travel_Romance=open('The-Titan-Drowns-Time-Travel-Romance by Nyhs Glover_NBS.txt',"r",encoding="utf8")

In [119]:
#test code on one book
print(Lolita.readline())
print(Lolita.readline())
print(Lolita.readline())
print(Lolita.readline())
print(Lolita.readline())
print(Lolita.readline())
print(Lolita.readline())
print(Lolita.readline())
print(Lolita.readline())
print(Lolita.readline())

my arms she was always Lolita.



Did she have a precursor? She did, indeed she did. In point of fact, there

might have been no Lolita at all had I not loved, one summer, a certain initial

girl—Child. In a princedom by the sea. Oh when? About as many years before Lolita was born as my age was that summer. You can always count on a murderer for

a fancy prose style.



Ladies and gentlemen of the jury, exhibit number one is what the seraphs, the mis—

informed, simple, noble—winged seraphs, envied. Look at this tangle of thorns.





In [120]:
#create a list of all the books
books_corpus=[Beautiful_Disaster,Breaching_His_Defenses,Desired,Fearsome,Fifty_Shades_Darker,Fifty_Shades_of_Grey,
              Finding_Master_Right,Healing_Her_Heart,Hopeless,Lilas_Loves,Lolita,Mademoiselle_at_Arms,Me_Before_You,
              Mothers_Black_Book,Outlander,Taking_Chances,The_Fault_in_Our_Stars,The_Notebook,The_Time_Travelers_Wife,
              The_Titan_Drowns_Time_Travel_Romance]
labels=[1,0,0,0,1,1,0,0,1,0,1,0,1,0,1,0,1,1,1,0]

books_corpus_arr=np.array(books_corpus)
corpus_df = pd.DataFrame({'Document': books_corpus_arr, 
                          'Category': labels})
corpus_df = corpus_df[['Document', 'Category']]
corpus_df

Unnamed: 0,Document,Category
0,<_io.TextIOWrapper name='Beautiful Disaster by...,1
1,<_io.TextIOWrapper name='Breaching-His-Defense...,0
2,<_io.TextIOWrapper name='Desired by Alisa Wood...,0
3,<_io.TextIOWrapper name='Fearsome by S.A. Wolf...,0
4,<_io.TextIOWrapper name='Fifty Shades Darker b...,1
5,<_io.TextIOWrapper name='Fifty Shades of Grey ...,1
6,<_io.TextIOWrapper name='Finding Master Right ...,0
7,<_io.TextIOWrapper name='Healing-Her-Heart by ...,0
8,<_io.TextIOWrapper name='Hopeless by Colleen H...,1
9,"<_io.TextIOWrapper name=""Lila's Loves by Layla...",0


In [118]:
books_corpus

[<_io.TextIOWrapper name='Beautiful Disaster by Jamie McGuire_BS.txt' mode='r' encoding='utf8'>,
 <_io.TextIOWrapper name='Breaching-His-Defenses by Allyson Lindt_NBS.txt' mode='r' encoding='utf8'>,
 <_io.TextIOWrapper name='Desired by Alisa Woods_NBS.txt' mode='r' encoding='utf8'>,
 <_io.TextIOWrapper name='Fearsome by S.A. Wolfe_NBS.txt' mode='r' encoding='utf8'>,
 <_io.TextIOWrapper name='Fifty Shades Darker by EL James_BS.txt' mode='r' encoding='utf8'>,
 <_io.TextIOWrapper name='Fifty Shades of Grey by EL James_BS.txt' mode='r' encoding='utf8'>,
 <_io.TextIOWrapper name='Finding Master Right by Sparrow Beckett_NBS.txt' mode='r' encoding='utf8'>,
 <_io.TextIOWrapper name='Healing-Her-Heart by Laura Scott_NBS.txt' mode='r' encoding='utf8'>,
 <_io.TextIOWrapper name='Hopeless by Colleen Hoover_BS.txt' mode='r' encoding='utf8'>,
 <_io.TextIOWrapper name="Lila's Loves by Laylah Roberts_NBS.txt" mode='r' encoding='utf8'>,
 <_io.TextIOWrapper name='Lolita by Vladamir Norbakov_BS.txt' mode

In [None]:
os.listdir()

In [None]:
Beautiful_Disaster.close()
Breaching_His_Defenses.close()
Desired.close()
Fearsome.close()
Fifty_Shades_Darker.close()
Fifty_Shades_of_Grey.close()
Finding_Master_Right.close()
Healing_Her_Heart.close()
Hopeless.close()
Lilas_Loves.close()
Lolita.close()
Mademoiselle_at_Arms.close()
Me_Before_You.close()
Mothers_Black_Book.close()
Outlander.close()
Taking_Chances.close()
The_Fault_in_Our_Stars.close()
The_Notebook.close()
The_Time_Travelers_Wife.close()
The_Titan_Drowns_Time_Travel_Romance.close()