# Feature Generation
### This is Mukund's attempt at creating different features. Each cell corresponds to a new feature.

#### List of Proper Nouns

In [25]:
import os
import re #Regular expression operations
import string
import pandas as pd #To create a dataframe of data
#NLTK is an interesting library that was used in a Kaggle kernel and helps with a bunch of NLP stuff
import nltk
nltk.download("averaged_perceptron_tagger")
nltk.download("punkt")
nltk.download("stopwords")
from nltk import word_tokenize
from nltk.corpus import stopwords #for removing stopwords
from collections import OrderedDict

#######################################Remove stopwords from string##############################
def remStopwordsFromStr(string1):
    """Return string1 with every English NLTK stopword removed.

    A stopword is only removed when it stands as a whole word (word boundary on
    both sides); any whitespace directly after it is removed as well. Matching
    is case-sensitive because no regex flags are used.
    """
    stopwordAlternatives = r'|'.join(stopwords.words('english'))
    return re.sub(r'\b(' + stopwordAlternatives + r')\b\s*', '', string1)
###########################End of Function#######################################################

###########################Function for removing punctuations from string########################
def remPuncFromStr(string1):
    """Return string1 with each character of string.punctuation replaced by one space."""
    #Map the code point of every punctuation character to a single space
    puncToSpace = {ord(puncChar): ' ' for puncChar in string.punctuation}
    return string1.translate(puncToSpace)
############################End of Function###############################################

############################Most common proper noun in piece of text#############################
def mostFreqProperNoun(tag_prefix, tagged_text):
    """Return the most frequent acceptable word among the tokens matching tag_prefix.

    Parameters:
        tag_prefix: only (word, tag) pairs whose tag starts with this string are counted.
        tagged_text: iterable of (word, tag) pairs.

    Returns:
        A (word, count) tuple, or an empty tuple when no candidate is left after
        filtering. Counts are kept separately per tag, and only the 10 most frequent
        words of each tag are considered. The filtering happens after that cut, so
        ignored words still occupy slots in the top 10. When several candidates share
        the highest count, the last one (tags in first-seen order, words in
        most-common order) is returned.
    """
    from collections import Counter #local import: not every cell imports Counter at the top

    #Words that are never reported as the result
    ignoredWords = {"Mr", "Mrs", "Miss", "Ms", "A", "St"}

    #Count the words separately for every matching tag, in one pass over the input
    countsByTag = dict()
    for (word, tag) in tagged_text:
        if tag.startswith(tag_prefix):
            countsByTag.setdefault(tag, Counter())[word] += 1

    #Keep the purely alphabetic, non-ignored entries of each tag's 10 most common words
    candidates = [pair
                  for counts in countsByTag.values()
                  for pair in counts.most_common(10)
                  if pair[0].isalpha() and pair[0] not in ignoredWords]

    #max() returns the first maximal item it sees, so scanning in reverse makes the
    #last candidate win a tie
    return max(reversed(candidates), key=lambda pair: pair[1], default=tuple())
###########################End of Function############################################################

#Get all the Authors to go through:
#Only the directory names are collected (not their paths); the loop below opens them
#relative to the current working directory.
dirList = list()
for root, subDirs, files in os.walk("../Processed", topdown=False):
    for name in subDirs:
        if (name != ".ipynb_checkpoints"):
            dirList.append(name)

#First create the dataFrame that will hold the results (one row per author).
#The proper-noun column starts out empty, whatever the number of authors, and is
#filled in by the loop below.
finalDF = pd.DataFrame(OrderedDict([("authors", dirList),
                                    ("CommonProperNouns", [None] * len(dirList))]))

#Iterate through directory and process the data:
for rowIndex, author in enumerate(dirList):
    propNounList = set() #A set of all the common proper nouns per author
    for fileName in os.listdir(author):
        print("This book will be processed: " + author + " " + str(fileName))

        #Read in the raw data as a string; the with-block closes the file again
        with open(os.path.join(author, fileName), "r", encoding="utf-8") as rawDataFile:
            rawData = rawDataFile.read()

        #Stopwords are super-common things like "this", "the", etc. and are usually
        #removed to improve performance. Punctuation is replaced by spaces afterwards.
        processedText = remStopwordsFromStr(rawData)
        processedText = remPuncFromStr(processedText)

        #Tokenize the text, then tag the words with their respective parts of speech:
        tokenized_all_text = word_tokenize(processedText)
        list_of_tagged_words = nltk.pos_tag(tokenized_all_text)

        #Most frequent proper noun of this book ("NNP" also matches the "NNPS" tag):
        result = mostFreqProperNoun("NNP", list_of_tagged_words)
        if result: #an empty tuple means the book yielded no usable proper noun
            propNounList.add(result[0])

    #Store the author's set once all of their books have been processed
    finalDF.at[rowIndex, "CommonProperNouns"] = propNounList

    #Friendly message to the user:
    print("Finished with " + str(author) + "'s books!")

#Push data to a CSV to read from later:
finalDF.to_csv(path_or_buf=os.path.join(os.getcwd(), "features.csv"), header=True, index=True)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Mukund\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mukund\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mukund\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
This book will be processed: Dickens Bleak_House.txt




This book will be processed: Dickens David_Copperfield.txt
This book will be processed: Dickens Dombey_and_Son.txt
This book will be processed: Dickens Great_Expectations.txt
This book will be processed: Dickens Little_Dorrit.txt
This book will be processed: Dickens Nicholas_Nickleby.txt
This book will be processed: Dickens Oliver_Twist.txt
This book will be processed: Dickens Our_Mutual_Friend.txt
This book will be processed: Dickens The_Letters_of_Charles_Dickens.txt
This book will be processed: Dickens The_Pickwick_Papers.txt
Finished with Dickens's books!
This book will be processed: Tolstoy Anna_Karenina.txt
This book will be processed: Tolstoy Kingdom_of_God_is_Within_You.txt
This book will be processed: Tolstoy Sevastopol.txt
This book will be processed: Tolstoy The_Cossacks.txt
This book will be processed: Tolstoy The_Kreutzer_Sonata_and_Other_Stories.txt
This book will be processed: Tolstoy The_Resurrection.txt
This book will be processed: Tolstoy War_And_Peace.txt
Finished wi

#### Stopword count

In [1]:
import os
import re #Regular expression operations
import string
import pandas as pd #To create a dataframe of data
#NLTK is an interesting library that was used in a Kaggle kernel and helps with a bunch of NLP stuff
import nltk
nltk.download("averaged_perceptron_tagger")
nltk.download("punkt")
nltk.download("stopwords")
from nltk import word_tokenize
from nltk.corpus import stopwords #for removing stopwords

#Get all the Authors to go through:
#Only the directory names are collected (not their paths); the loop below opens them
#relative to the current working directory.
dirList = list()
for root, subDirs, files in os.walk("../Processed", topdown=False):
    for name in subDirs:
        if (name != ".ipynb_checkpoints"):
            dirList.append(name)

#Read in the CSV:
featuresDF = pd.read_csv("features.csv", index_col=0, header=0)

#The stopword list does not change between books, so build the lookup set once
stop_words = set(stopwords.words("english"))

#Iterate through directory and process the data:
for rowIndex, author in enumerate(dirList):
    bookNames = os.listdir(author)
    totalCount = 0
    for fileName in bookNames:
        print("This book will be processed: " + author + " " + str(fileName))

        #Read in the raw data as a string; the with-block closes the file again
        with open(os.path.join(author, fileName), "r", encoding="utf-8") as rawDataFile:
            rawData = rawDataFile.read()

        #Count # of stopwords among the lower-cased, whitespace-separated words:
        totalCount += sum(1 for w in rawData.lower().split() if w in stop_words)

    #Average over the author's books; an author without any book has no average
    if bookNames:
        avgStopWordCount = totalCount / len(bookNames)
    else:
        avgStopWordCount = float("nan")
    featuresDF.at[rowIndex, "avgStopWordCountPerBook"] = avgStopWordCount

    #Friendly message to the user:
    print("Finished with " + str(author) + "'s books!")

#Push data to a CSV to read from later (the same table is written to both files):
featuresDF.to_csv(path_or_buf=os.path.join(os.getcwd(), "features.csv"), header=True, index=True)
featuresDF.to_csv(path_or_buf=os.path.join(os.getcwd(), "numFeatures.csv"), header=True, index=True)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Mukund\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mukund\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mukund\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
This book will be processed: Dickens Bleak_House.txt
This book will be processed: Dickens David_Copperfield.txt




This book will be processed: Dickens Dombey_and_Son.txt
This book will be processed: Dickens Great_Expectations.txt
This book will be processed: Dickens Little_Dorrit.txt
This book will be processed: Dickens Nicholas_Nickleby.txt
This book will be processed: Dickens Oliver_Twist.txt
This book will be processed: Dickens Our_Mutual_Friend.txt
This book will be processed: Dickens The_Letters_of_Charles_Dickens.txt
This book will be processed: Dickens The_Pickwick_Papers.txt
Finished with Dickens's books!
This book will be processed: Tolstoy Anna_Karenina.txt




This book will be processed: Tolstoy Kingdom_of_God_is_Within_You.txt
This book will be processed: Tolstoy Sevastopol.txt
This book will be processed: Tolstoy The_Cossacks.txt
This book will be processed: Tolstoy The_Kreutzer_Sonata_and_Other_Stories.txt
This book will be processed: Tolstoy The_Resurrection.txt
This book will be processed: Tolstoy War_And_Peace.txt
Finished with Tolstoy's books!
This book will be processed: Twain Adventures_of_Huckleberry_Finn.txt
This book will be processed: Twain A_Connecticut_Yankee_in_King_Arthur's_Court.txt
This book will be processed: Twain Life_on_the_Mississippi.txt
This book will be processed: Twain Roughing_It.txt
This book will be processed: Twain The_Adventures_Of_Tom_Sawyer.txt
This book will be processed: Twain The_Innocents_Abroad.txt
This book will be processed: Twain The_Prince_And_The_Pauper.txt
This book will be processed: Twain The_Tragedy_of_Pudd'nhead_Wilson.txt
Finished with Twain's books!


#### NGram Occurrence

In [3]:
import os
import re #Regular expression operations
import string
import pandas as pd #To create a dataframe of data
import operator
#NLTK is an interesting library that was used in a Kaggle kernel and helps with a bunch of NLP stuff
import nltk
nltk.download("averaged_perceptron_tagger")
nltk.download("punkt")
nltk.download("stopwords")
from nltk import word_tokenize
from nltk import ngrams
from nltk.corpus import stopwords #for removing stopwords
from collections import Counter

#######################################Remove stopwords from string##############################
def remStopwordsFromStr(string1):
    """Return string1 with every English NLTK stopword removed.

    A stopword is only removed when it stands as a whole word (word boundary on
    both sides); any whitespace directly after it is removed as well. Matching
    is case-sensitive because no regex flags are used.
    """
    stopwordAlternatives = r'|'.join(stopwords.words('english'))
    return re.sub(r'\b(' + stopwordAlternatives + r')\b\s*', '', string1)
###########################End of Function#######################################################

###########################Function for removing punctuations from string########################
def remPuncFromStr(string1):
    """Return string1 with each character of string.punctuation replaced by one space."""
    #Map the code point of every punctuation character to a single space
    puncToSpace = {ord(puncChar): ' ' for puncChar in string.punctuation}
    return string1.translate(puncToSpace)
############################End of Function###############################################

##################################Build ngrams from the data#####################################
def ngramListGenerator(string1,count_of_words_in_ngram):
    """Return the list of word n-grams of string1.

    A space is inserted after every '.', the text is split on whitespace, and the
    resulting words are passed to nltk's ngrams() with count_of_words_in_ngram as
    the n-gram size. The text is not lower-cased.
    """
    spacedText = string1.replace('.','. ')
    return list(ngrams(spacedText.split(), count_of_words_in_ngram))
##################################End of Function################################################

#Get all the Authors to go through:
#Only the directory names are collected (not their paths); the loop below opens them
#relative to the current working directory.
dirList = list()
for root, subDirs, files in os.walk("../Processed", topdown=False):
    for name in subDirs:
        if (name != ".ipynb_checkpoints"):
            dirList.append(name)

#Read in the CSV:
featuresDF = pd.read_csv("features.csv", index_col=0, header=0)
featuresDF["CommonNGrams"] = ""

#Iterate through directory and process the data:
for rowIndex, author in enumerate(dirList):
    listOfNGramsPerAuthor = set() #Union of the common bigrams of all the author's books
    for fileName in os.listdir(author):
        print("This book will be processed: " + author + " " + str(fileName))

        #Read in the raw data as a string; the with-block closes the file again
        with open(os.path.join(author, fileName), "r", encoding="utf-8") as rawDataFile:
            rawData = rawDataFile.read()

        #Remove punctuation and stopwords:
        processedText = remStopwordsFromStr(rawData)
        processedText = remPuncFromStr(processedText)

        #Count every bigram of the text:
        ngramCounts = Counter(ngramListGenerator(processedText, 2))

        #Take the 10 most frequent bigrams and keep those made of two alphabetic words
        #(the filter runs after the cut, so a book can contribute fewer than 10):
        for bigram, _ in ngramCounts.most_common(10):
            if bigram[0].isalpha() and bigram[1].isalpha():
                listOfNGramsPerAuthor.add(bigram)

    #Now add the bigrams to the features:
    featuresDF.at[rowIndex, "CommonNGrams"] = listOfNGramsPerAuthor

    #Friendly message to the user:
    print("Finished with " + str(author) + "'s books!")

#Push data to a CSV to read from later:
featuresDF.to_csv(path_or_buf=os.path.join(os.getcwd(), "features.csv"), header=True, index=True)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Mukund\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mukund\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mukund\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
This book will be processed: Dickens Bleak_House.txt




This book will be processed: Dickens David_Copperfield.txt
This book will be processed: Dickens Dombey_and_Son.txt
This book will be processed: Dickens Great_Expectations.txt
This book will be processed: Dickens Little_Dorrit.txt
This book will be processed: Dickens Nicholas_Nickleby.txt
This book will be processed: Dickens Oliver_Twist.txt
This book will be processed: Dickens Our_Mutual_Friend.txt
This book will be processed: Dickens The_Letters_of_Charles_Dickens.txt
This book will be processed: Dickens The_Pickwick_Papers.txt
Finished with Dickens's books!
This book will be processed: Tolstoy Anna_Karenina.txt
This book will be processed: Tolstoy Kingdom_of_God_is_Within_You.txt
This book will be processed: Tolstoy Sevastopol.txt
This book will be processed: Tolstoy The_Cossacks.txt
This book will be processed: Tolstoy The_Kreutzer_Sonata_and_Other_Stories.txt
This book will be processed: Tolstoy The_Resurrection.txt
This book will be processed: Tolstoy War_And_Peace.txt
Finished wi

#### List of Bag of Words

In [4]:
import os
import re #Regular expression operations
import string
import pandas as pd #To create a dataframe of data
import operator
#NLTK is an interesting library that was used in a Kaggle kernel and helps with a bunch of NLP stuff
import nltk
nltk.download("averaged_perceptron_tagger")
nltk.download("punkt")
nltk.download("stopwords")
from nltk import word_tokenize
from nltk import ngrams
from nltk.corpus import stopwords #for removing stopwords
from collections import Counter

#Get all the Authors to go through:
dirList = list()
for root, subDirs, files in os.walk("../Processed", topdown=False):
    for name in subDirs:
        if (name != ".ipynb_checkpoints"):
            dirList.append(name)

#Read in the CSV:
featuresDF = pd.read_csv("features.csv", index_col=0, header=0)
featuresDF["BagOfWords"] = ""

#Iterate through each set of NGrams and create the bag of words:
for rowLabel in featuresDF.index:
    #Pull the NGrams for author x. They come back from the CSV as one piece of text;
    #the expected form is that of a set of word tuples, e.g. "{('w1', 'w2'), ...}".
    storedNGrams = featuresDF.at[rowLabel, "CommonNGrams"]

    #Every quoted token in that text is one word. Extracting the tokens (instead of
    #splitting on commas) drops the quotes, brackets and spaces, so a word is stored
    #only once no matter where it appeared inside a tuple.
    if isinstance(storedNGrams, str):
        bagOfWords = set(re.findall(r"""['"]([^'"]+)['"]""", storedNGrams))
    else:
        bagOfWords = set() #an empty cell is not read back as text

    #Now add the bag of words to the features:
    featuresDF.at[rowLabel, "BagOfWords"] = bagOfWords

    #Friendly message to the user: (Plural - authors)
    print("Finished with " + str(featuresDF.at[rowLabel,"authors"]) + "!")

#Push data to a CSV to read from later:
featuresDF.to_csv(path_or_buf=os.path.join(os.getcwd(), "features.csv"), header=True, index=True)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Mukund\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mukund\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mukund\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Finished with Dickens!
Finished with Tolstoy!
Finished with Twain!


#### Numerical Stuff

In [53]:
import os
import re #Regular expression operations
import string
import numpy as np
import pandas as pd #To create a dataframe of data
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
#NLTK is an interesting library that was used in a Kaggle kernel and helps with a bunch of NLP stuff
import nltk
nltk.download("averaged_perceptron_tagger")
nltk.download("punkt")
nltk.download("stopwords")
from nltk import word_tokenize
from nltk import sent_tokenize
from nltk import FreqDist
from nltk.corpus import stopwords #for removing stopwords
from collections import OrderedDict
from _collections_abc import dict_keys

######################################Get part of speech for each token in the chapter##########################
def tokenToPos(ch):
    """Return the part-of-speech tag of every token of ch, in token order.

    ch is tokenized with nltk.word_tokenize and tagged with nltk.pos_tag; only the
    tag (second element) of each tagged pair is kept.
    """
    taggedTokens = nltk.pos_tag(nltk.word_tokenize(ch))
    return [tokenAndTag[1] for tokenAndTag in taggedTokens]
######################################End of Function###########################################################

#Get all the Authors to go through:
#Only the directory names are collected (not their paths); the loop below opens them
#relative to the current working directory.
dirList = list()
for root, subDirs, files in os.walk("../Processed", topdown=False):
    for name in subDirs:
        if (name != ".ipynb_checkpoints"):
            dirList.append(name)

#First create the dataFrame that will hold the results (one row per author).
#Every feature column starts out as NaN; the loop below fills in what it computes.
emptyColumn = [np.nan] * len(dirList)
numFeaturesDF = pd.DataFrame(OrderedDict([("authors", dirList),
                                          ("avgStopWordPerSent", list(emptyColumn)),
                                          ("avgWordsPerSent", list(emptyColumn)),
                                          ("sentLengthVar", list(emptyColumn)),
                                          ("lexicalDiv", list(emptyColumn)),
                                          ("avgStopWordCountPerBook", list(emptyColumn))]))
#TODO: "avgStopWordPerSent" is declared but nothing computes it yet.

#These do not change between books, so build them once:
stop_words = set(stopwords.words("english"))
posList = ['NN', 'NNP', 'DT', 'IN', 'JJ', 'NNS'] #Common POS types to count

#Iterate through directory and process the data:
for rowIndex, author in enumerate(dirList):
    totalStopWordCount = 0
    totalAvgNumWords = 0
    totalSentLengthVar = 0
    totalLexicalDiv = 0
    totalCommaPerSent = 0
    totalSemiPerSent = 0
    totalColonsPerSent = 0
    bookCount = 0 #Number of non-empty books that contributed to the totals
    for fileName in os.listdir(author):
        print("This book will be processed: " + author + " " + str(fileName))

        #Read in the raw data as a string; the with-block closes the file again
        with open(os.path.join(author, fileName), "r", encoding="utf-8") as rawDataFile:
            rawData = rawDataFile.read()
        words = word_tokenize(rawData)
        allSentences = sent_tokenize(rawData)
        if not words or not allSentences:
            continue #An empty book would only cause divisions by zero below
        bookCount += 1
        numSentences = float(len(allSentences))

        #Number of words in each sentence
        wordsPerSentence = np.array([len(word_tokenize(sent)) for sent in allSentences])

        #Avg number of words per sentence
        totalAvgNumWords += wordsPerSentence.mean()

        #Sentence length variation (standard deviation of the sentence lengths)
        totalSentLengthVar += wordsPerSentence.std()

        #Lexical diversity: distinct tokens / all tokens of the book
        totalLexicalDiv += len(set(words)) / float(len(words))

        #Count num of stopwords among the lower-cased, whitespace-separated words:
        totalStopWordCount += sum(1 for w in rawData.lower().split() if w in stop_words)

        #Commas, semicolons and colons per sentence
        #TODO: these three totals are accumulated but not written to the dataFrame yet.
        totalCommaPerSent += words.count(',') / numSentences
        totalSemiPerSent += words.count(';') / numSentences
        totalColonsPerSent += words.count(':') / numSentences

        #TODO: the bag-of-words feature vector (CountVectorizer over the most common
        #tokens of the book, rows normalised by their Euclidean norm) is not implemented.

        #Get POS for each token in the book
        bookPos = tokenToPos(rawData)

        #Count frequencies for common POS types, counting the tags themselves
        fvsSyntax = np.array([bookPos.count(pos) for pos in posList]).astype(np.float64)

        #Normalise by dividing by the number of tokens in the book
        fvsSyntax /= len(bookPos)
        print(fvsSyntax)

    #Average the per-book values over the author's books; the row stays NaN without books
    if bookCount:
        numFeaturesDF.at[rowIndex, "avgWordsPerSent"] = totalAvgNumWords / bookCount
        numFeaturesDF.at[rowIndex, "sentLengthVar"] = totalSentLengthVar / bookCount
        numFeaturesDF.at[rowIndex, "lexicalDiv"] = totalLexicalDiv / bookCount
        numFeaturesDF.at[rowIndex, "avgStopWordCountPerBook"] = totalStopWordCount / bookCount

    #Friendly message to the user:
    print("Finished with " + str(author) + "'s books!")

#Push data to a CSV to read from later:
numFeaturesDF.to_csv(path_or_buf=os.path.join(os.getcwd(), "numFeatures.csv"), header=True, index=True)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Mukund\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mukund\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mukund\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
This book will be processed: Dickens Bleak_House.txt




KeyboardInterrupt: 