# Loading the book

In [1]:
# Read the whole book into memory as a single string. The encoding is given
# explicitly so decoding does not depend on the platform's locale default.
# NOTE(review): assumes the text file is UTF-8 encoded — confirm.
with open('miracle_in_the_andes.txt', encoding='utf-8') as book_file:
    book = book_file.read()

# The most used words (excluding stopwords)

In [3]:
# Record the interpreter version the notebook was run with; as the last
# expression of the cell, the version string is shown as the cell output.
from platform import python_version
python_version()

'3.11.1'

In [5]:
!pip3.11 install nltk

Collecting nltk
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
[?25hCollecting click
  Using cached click-8.1.3-py3-none-any.whl (96 kB)
Collecting joblib
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting regex>=2021.8.3
  Downloading regex-2022.10.31-cp311-cp311-macosx_11_0_arm64.whl (287 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m287.2/287.2 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting tqdm
  Downloading tqdm-4.64.1-py2.py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.5/78.5 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: tqdm, regex,

In [13]:
from nltk import download
from nltk.corpus import stopwords

# Fetch the NLTK stopwords corpus into the local nltk_data directory; when the
# corpus is already present the call only reports that it is up to date.
download('stopwords')

# English stopword list from the corpus (the preview below shows lowercase
# entries such as 'i', 'me', 'my').
english_stopwords = stopwords.words('english')
# Last expression of the cell: show the first ten entries as a quick check.
english_stopwords[:10]

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/weiyilee/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

# The most used non-stopword words

In [14]:
from collections import Counter
from re import compile, findall

# Tokenise the lower-cased book into runs of ASCII letters. Apostrophes,
# digits and punctuation act as separators, so "don't" yields "don" and "t".
word_pattern = compile('[A-Za-z]+')
word_query = word_pattern.findall(book.lower())

# Occurrence count for every distinct token.
word_freq_map = Counter(word_query)

# Set membership is O(1); testing against the stopword list directly would
# rescan the whole list for every distinct token.
stopword_set = set(english_stopwords)

# (word, frequency) pairs for tokens that are not stopwords, in first-seen order.
non_stopwords_list = [
    (word, freq)
    for word, freq in word_freq_map.items()
    if word not in stopword_set
]

# Ten most frequent non-stopword tokens. sorted() is stable, so tokens with
# equal counts keep their first-seen order.
sorted(non_stopwords_list, key=lambda word_freq_pair: word_freq_pair[1], reverse=True)[:10]

[('would', 575),
 ('us', 519),
 ('said', 292),
 ('roberto', 284),
 ('could', 252),
 ('one', 249),
 ('snow', 227),
 ('mountain', 183),
 ('time', 182),
 ('like', 165)]

# Sentiment Analysis: What is the most positive and the most negative chapter?

In [11]:
from nltk.sentiment import SentimentIntensityAnalyzer

In [15]:
# Fetch the 'vader_lexicon' NLTK resource before constructing the analyzer
# (`download` is the function imported from nltk in an earlier cell).
download('vader_lexicon')
# Single analyzer instance reused by the sentiment cells that follow.
analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/weiyilee/nltk_data...


In [17]:
# Sanity-check the analyzer on a short, clearly positive sentence.
test_sentence = 'Hey, look how beautiful the trees are. I love them.'
test_sentence_score = analyzer.polarity_scores(test_sentence)

# Last expression of the cell, so the notebook renders the score dictionary
# (keys: 'neg', 'neu', 'pos', 'compound').
test_sentence_score

### `neg`, `neu`, and `pos` each range from 0 to 1; `compound` ranges from -1 to 1.

### Chapter-level sentiment analysis

In [19]:
from re import compile, split

In [21]:
# Chapter headings look like "Chapter 12". The pattern is a raw string so that
# `\d` reaches the regex engine untouched; in a plain string literal `\d` is an
# invalid escape sequence that Python warns about.
chapter_pattern = compile(r'Chapter \d+')

# Splitting on the headings leaves whatever precedes the first heading (an
# empty string or front matter) as element 0; it is not a chapter, so drop it.
chapters = chapter_pattern.split(book)[1:]

In [23]:
# Print the VADER polarity scores of every chapter, numbered from 1.
# NOTE(review): the compound values printed for whole chapters all sit near
# +/-1, so they may not rank chapters well; `neg`/`pos` or sentence-level
# averaging might be more informative — confirm.
for chapter_number, chapter_text in enumerate(chapters, start=1):
    chapter_scores = analyzer.polarity_scores(chapter_text)
    print(f'Chapter {chapter_number}: {chapter_scores}')

Chapter 1: {'neg': 0.061, 'neu': 0.779, 'pos': 0.16, 'compound': 1.0}
Chapter 2: {'neg': 0.12, 'neu': 0.726, 'pos': 0.154, 'compound': 0.9991}
Chapter 3: {'neg': 0.145, 'neu': 0.751, 'pos': 0.105, 'compound': -0.9999}
Chapter 4: {'neg': 0.141, 'neu': 0.721, 'pos': 0.138, 'compound': -0.9963}
Chapter 5: {'neg': 0.118, 'neu': 0.742, 'pos': 0.141, 'compound': 0.9997}
Chapter 6: {'neg': 0.124, 'neu': 0.761, 'pos': 0.115, 'compound': -0.9979}
Chapter 7: {'neg': 0.136, 'neu': 0.761, 'pos': 0.103, 'compound': -0.9999}
Chapter 8: {'neg': 0.12, 'neu': 0.786, 'pos': 0.094, 'compound': -0.9998}
Chapter 9: {'neg': 0.097, 'neu': 0.824, 'pos': 0.079, 'compound': -0.9996}
Chapter 10: {'neg': 0.086, 'neu': 0.733, 'pos': 0.181, 'compound': 1.0}
