# Split the whole text into a list of pages

In [9]:
# Module-level accumulator: read_Harry_Potter() appends one string per page to it.
# It is re-created on every run of this cell, so re-running does not append the pages twice.
pageList = []

def read_Harry_Potter(file_name="book.txt"):
    """Split a text file into pages and append them to the module-level ``pageList``.

    The file is read as UTF-8. Every line is split on single spaces and the
    resulting tokens are handled as follows:

    * ``'|'`` closes the current page: the text collected so far is appended
      to ``pageList`` and a new, empty page is started.
    * ``'Page'``, empty tokens (produced by consecutive spaces) and tokens made
      only of digits are dropped. Note that this removes *every* purely
      numeric token, not just the ones that follow ``'Page'``.
    * Any other token is kept, preceded by a single space, so a non-empty
      page always starts with a space.

    Whatever has been collected after the last line is appended as the final
    page, even when it is empty. The resulting length of ``pageList`` is
    printed.

    Parameters
    ----------
    file_name : str, optional
        Path of the text file to read. Defaults to ``"book.txt"``, the path
        that used to be hard-coded.

    Returns
    -------
    None
        Results are delivered through ``pageList``; nothing is returned, so a
        bare call at the end of a cell does not echo the whole list.
    """
    words = []  # tokens kept for the page currently being assembled

    def page_text():
        # Same layout as incremental `text + " " + token` concatenation:
        # one leading space per token, empty string for an empty page.
        return "".join(" " + word for word in words)

    with open(file_name, encoding="utf8") as f:
        # Stream the file line by line instead of loading every line up front.
        for line in f:
            for token in line.replace("\n", "").split(' '):
                if token == '|':
                    pageList.append(page_text())
                    words = []
                elif token != 'Page' and token != '' and not token.isdigit():
                    words.append(token)

    # The text after the last '|' has no closing separator; store it as well.
    pageList.append(page_text())
    print(f'List size = {len(pageList)}')
        

# Fill pageList from the book file; the function prints the resulting number of pages.
read_Harry_Potter()

List size = 348


# Explore each page

In [12]:
import pandas
from pandas import DataFrame

# Put the page strings into a one-column table ('Page') so they can be browsed row by row.
df = DataFrame({'Page': pageList})

# Preview the first five pages.
df.head()




Unnamed: 0,Page
0,"/ THE BOY WHO LIVED Mr. and Mrs. Dursley, of ..."
1,Harry Potter and the Philosophers Stone - J.K...
2,Harry Potter and the Philosophers Stone - J.K...
3,Harry Potter and the Philosophers Stone - J.K...
4,Harry Potter and the Philosophers Stone - J.K...


# Create a CountVectorizer to extract features

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Ignore English stop words and learn the vocabulary of the pages:
# every distinct remaining token is mapped to one column index.
# fit() returns the fitted vectorizer itself, so it can be chained onto the constructor.
vectorizer = CountVectorizer(stop_words='english').fit(pageList)

# Display the learned token -> column-index mapping.
vectorizer.vocabulary_


{'boy': 509,
 'lived': 2692,
 'mr': 2974,
 'mrs': 2975,
 'dursley': 1398,
 'number': 3112,
 'privet': 3499,
 'drive': 1352,
 'proud': 3526,
 'say': 3885,
 'perfectly': 3313,
 'normal': 3089,
 'thank': 4792,
 'people': 3302,
 'expect': 1534,
 'involved': 2433,
 'strange': 4545,
 'mysterious': 3004,
 'just': 2493,
 'didn': 1209,
 'hold': 2255,
 'nonsense': 3085,
 'director': 1231,
 'firm': 1677,
 'called': 637,
 'grunnings': 2076,
 'drills': 1345,
 'big': 368,
 'beefy': 317,
 'man': 2789,
 'hardly': 2149,
 'neck': 3035,
 'did': 1208,
 'large': 2573,
 'mustache': 2995,
 'blonde': 426,
 'nearly': 3030,
 'twice': 5023,
 'usual': 5114,
 'came': 643,
 'useful': 5109,
 'spent': 4366,
 'time': 4856,
 'craning': 1009,
 'garden': 1896,
 'fences': 1630,
 'spying': 4412,
 'neighbors': 3045,
 'small': 4217,
 'son': 4311,
 'dudley': 1379,
 'opinion': 3154,
 'finer': 1659,
 'dursleys': 1399,
 'wanted': 5199,
 'secret': 3960,
 'greatest': 2030,
 'fear': 1614,
 'somebody': 4307,
 'discover': 1242,
 'thi

# Show the word frequencies of each page

In [16]:
# Count how often every token occurs on every page.
# NOTE: no stop-word filtering is applied in this cell. An earlier
# CountVectorizer(stop_words='english') pass used to be computed here as well, but
# all of its results were immediately overwritten by the unfiltered pass below, so
# it only cost time and memory and has been removed.
count_vectorizer = CountVectorizer()

# fit_transform => learn the vocabulary and return the (sparse) document-term matrix.
sparse_matrix = count_vectorizer.fit_transform(pageList)

# Expand the sparse counts into a dense matrix (one row per page, one column per term).
doc_term_matrix = sparse_matrix.todense()

# vocabulary_ maps each term to its column index, so ordering the terms by that index
# gives the column labels. This replaces get_feature_names(), which was removed in
# scikit-learn 1.2, and works on older releases too.
df = DataFrame(
    doc_term_matrix,
    columns=sorted(count_vectorizer.vocabulary_, key=count_vectorizer.vocabulary_.get),
)
df

Unnamed: 0,1473,1709,1945,31,382b,aaaaaaaaaaargh,aaaaarrrgh,aaaargh,aaah,aargh,...,yourselves,youth,yvonne,zabini,zigzagging,zombie,zoo,zoom,zoomed,zooming
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Create a TfidfVectorizer with stop words removed

In [18]:
# Weight every term of every page by TF-IDF, ignoring English stop words.
vectorizer = TfidfVectorizer(stop_words='english')
sparse_matrix = vectorizer.fit_transform(pageList)

# Expand the sparse weights into a dense matrix (one row per page, one column per term).
doc_term_matrix = sparse_matrix.todense()

# Sanity check replacing a bare `doc_term_matrix.shape` expression, which displayed
# nothing because it was not the last line of the cell.
assert doc_term_matrix.shape[0] == len(pageList), "expected one row per page"

# vocabulary_ maps each term to its column index, so ordering the terms by that index
# gives the column labels. This replaces get_feature_names(), which was removed in
# scikit-learn 1.2, and works on older releases too.
df = DataFrame(
    doc_term_matrix,
    columns=sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get),
)
df

Unnamed: 0,1473,1709,1945,31,382b,aaaaaaaaaaargh,aaaaarrrgh,aaaargh,aaah,aargh,...,youngsters,youth,yvonne,zabini,zigzagging,zombie,zoo,zoom,zoomed,zooming
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Show the most important word for each page

In [19]:
import pandas
import numpy as np
# Raise the display limit so the one-row-per-page table below is shown in full.
pandas.set_option('display.max_rows', len(df) + 1)

# For every row of df (one row per page), take the label of the column holding the
# row's largest value, i.e. the highest-scoring term of that page.
important_word_df = df.idxmax(axis=1)

# Two-column result: 1-based page number, then that page's top term.
result_df = important_word_df.to_frame(name='Word')
result_df.insert(0, 'Page', range(1, len(result_df) + 1))
result_df

Unnamed: 0,Page,Word
0,1,dursley
1,2,dursley
2,3,dursley
3,4,cloaks
4,5,dursley
5,6,owls
6,7,dursley
7,8,cat
8,9,street
9,10,celebrating
