In [62]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from collections import Counter
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np

import plotly.express as px

nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [63]:
res = requests.get('https://www.gutenberg.org/files/215/215-h/215-h.htm')
res

<Response [200]>

In [64]:
soup = BeautifulSoup(res.content, 'html.parser')

In [65]:
raw = soup.get_text()
t = raw.find('Contents')
chapters = re.findall(r'\n{1,3}Chapter\s\w{1,3}\.(.*)\n{1,3}', raw)

len(chapters)

7

In [66]:
pos = raw.find('Chapter I.')
pos = raw.find('Chapter I.', pos+1)
raw = raw[pos:]

In [67]:
raw_chapter = []
for i in range(0, len(chapters)-1):
  start_pos = raw.find(chapters[i])
  end_pos = raw.find(chapters[i+1])
  raw_chapter.append(raw[start_pos:end_pos])
end_chapter = raw.find(chapters[-1])
end = raw.rfind('which is the song of the pack')
raw_chapter.append(raw[end_chapter:end])
raw_chapter[:100]

['Into the Primitive\n\r\n“Old longings nomadic leap,\r\nChafing at custom’s chain;\r\nAgain from its brumal sleep\r\nWakens the ferine strain.”\r\n\n\r\nBuck did not read the newspapers, or he would have known that trouble was\r\nbrewing, not alone for himself, but for every tide-water dog, strong of muscle\r\nand with warm, long hair, from Puget Sound to San Diego. Because men, groping\r\nin the Arctic darkness, had found a yellow metal, and because steamship and\r\ntransportation companies were booming the find, thousands of men were rushing\r\ninto the Northland. These men wanted dogs, and the dogs they wanted were heavy\r\ndogs, with strong muscles by which to toil, and furry coats to protect them\r\nfrom the frost.\r\n\n\r\nBuck lived at a big house in the sun-kissed Santa Clara Valley. Judge\r\nMiller’s place, it was called. It stood back from the road, half hidden\r\namong the trees, through which glimpses could be caught of the wide cool\r\nveranda that ran around its four sid

In [68]:
words_per_chapter = np.array([len(word_tokenize(chapter)) for chapter in raw_chapter])
sent_per_chapter = np.array([len(sent_tokenize(chapter)) for chapter in raw_chapter])
sent_per_chapter

array([183, 165, 275, 176, 321, 236, 265])

In [69]:
data = {
    'chapter': range(1, 8),
    'chapter_name': chapters,
    'text': raw_chapter,
    'word_count': words_per_chapter,
    'sent_count': sent_per_chapter,
    'word_per_sent': words_per_chapter / sent_per_chapter
}
df = pd.DataFrame.from_dict(data)
df

Unnamed: 0,chapter,chapter_name,text,word_count,sent_count,word_per_sent
0,1,Into the Primitive,Into the Primitive\n\r\n“Old longings nomadic ...,4425,183,24.180328
1,2,The Law of Club and Fang,The Law of Club and Fang\n\r\nBuck’s first day...,3823,165,23.169697
2,3,The Dominant Primordial Beast,The Dominant Primordial Beast\n\r\nThe dominan...,5904,275,21.469091
3,4,Who Has Won to Mastership,Who Has Won to Mastership\n\r\n“Eh? Wot I say?...,3739,176,21.244318
4,5,The Toil of Trace and Trail,The Toil of Trace and Trail\n\r\nThirty days f...,6320,321,19.688474
5,6,For the Love of a Man,For the Love of a Man\n\r\nWhen John Thornton ...,5640,236,23.898305
6,7,The Sounding of the Call,The Sounding of the Call\n\r\nWhen Buck earned...,7068,265,26.671698


In [70]:
# Tokenize words in each chapter
df['tokenized'] = df['text'].apply(lambda t: word_tokenize(t))
df.head()

Unnamed: 0,chapter,chapter_name,text,word_count,sent_count,word_per_sent,tokenized
0,1,Into the Primitive,Into the Primitive\n\r\n“Old longings nomadic ...,4425,183,24.180328,"[Into, the, Primitive, “, Old, longings, nomad..."
1,2,The Law of Club and Fang,The Law of Club and Fang\n\r\nBuck’s first day...,3823,165,23.169697,"[The, Law, of, Club, and, Fang, Buck, ’, s, fi..."
2,3,The Dominant Primordial Beast,The Dominant Primordial Beast\n\r\nThe dominan...,5904,275,21.469091,"[The, Dominant, Primordial, Beast, The, domina..."
3,4,Who Has Won to Mastership,Who Has Won to Mastership\n\r\n“Eh? Wot I say?...,3739,176,21.244318,"[Who, Has, Won, to, Mastership, “, Eh, ?, Wot,..."
4,5,The Toil of Trace and Trail,The Toil of Trace and Trail\n\r\nThirty days f...,6320,321,19.688474,"[The, Toil, of, Trace, and, Trail, Thirty, day..."


Count words in each chapter

In [71]:
characters = ['Buck', 'John|Thornton', 'Hans', 'Spitz', 'François', 'Perrault', 'Mercedes', 'Charles', 'Dave', 'Sol-leks', 'Hal']
list_counter = [Counter(token) for token in df['tokenized']]

In [72]:
from numpy.ma.core import cumsum
character_freq = dict()
for character in characters:
  freq = []
  for counter in list_counter:
    l = character.split('|')
    total = 0
    for e in l:
      total += counter[e]
    freq.append(total)
  character_freq[character] = cumsum(freq)

character_freq

{'Buck': masked_array(data=[ 43,  76, 152, 188, 220, 295, 358],
              mask=False,
        fill_value=999999),
 'Charles': masked_array(data=[ 0,  0,  0,  0, 17, 17, 17],
              mask=False,
        fill_value=999999),
 'Dave': masked_array(data=[ 1, 10, 16, 25, 25, 25, 25],
              mask=False,
        fill_value=999999),
 'François': masked_array(data=[ 6, 17, 38, 58, 59, 60, 60],
              mask=False,
        fill_value=999999),
 'Hal': masked_array(data=[ 0,  0,  0,  0, 37, 37, 37],
              mask=False,
        fill_value=999999),
 'Hans': masked_array(data=[ 0,  0,  0,  0,  0, 15, 18],
              mask=False,
        fill_value=999999),
 'John|Thornton': masked_array(data=[  0,   0,   0,   0,  24, 109, 142],
              mask=False,
        fill_value=999999),
 'Mercedes': masked_array(data=[ 0,  0,  0,  0, 21, 21, 21],
              mask=False,
        fill_value=999999),
 'Perrault': masked_array(data=[ 6, 16, 28, 37, 38, 39, 39],
              mask

In [73]:
df_character_freq = pd.DataFrame.from_dict(character_freq)
df_character_freq.index = range(1, len(df_character_freq)+1)

In [74]:
px.line(df_character_freq, title='Cumsum of freq of characters', markers=True)

1.   Spitz is the main rival of Buck when he comes to Alaska. He makes his first appreance in chapter 2, and last until chapter 4 (he died). In chapter 3, he apprear more frequently indicate the tension between Buck and him.
2.   The last and most beloveed owner of Buck, John Thornton makes his first appreance in chapter 5 and last until the last chapter. His frequency increases dramatically in chapter 6, which indicate their relationship strengthen.



## Let see how buck changes

We compare the frequency of word ('dog' or 'dogs') and ('wolf' or 'wolves')

In [99]:
natures = ['dog|dogs', 'wolf|wolves', 'John|Thornton']
buck_dog_wolf = dict()
for nature in natures:
  freq = []
  for counter in list_counter:
    l = nature.split('|')
    total = 0
    for e in l:
      total += counter[e]
    freq.append(total)
  buck_dog_wolf[nature] = freq
df_buck = pd.DataFrame.from_dict(buck_dog_wolf)
df_buck

Unnamed: 0,dog|dogs,wolf|wolves,John|Thornton
0,23,0,0
1,17,3,0
2,34,3,0
3,21,0,0
4,41,0,24
5,17,1,85
6,13,26,33


In [100]:
px.line(data_frame=df_buck)

As expected, throughout the story, most of the time, Buck is more like a dog, expecially after the John appears. The word dog appear more than wolf. However, he slowly comes back to his nature while he is with John, we can see a spike along with John's appearance. However, he finally becomes a wolf after John's death. We can see it by wolf curve above the dog curve.

As 