# Loading the book

In [1]:
# Read the whole book into memory as a single string.
# The encoding is pinned so the text decodes the same way on every platform;
# without it, open() falls back to the locale's default encoding.
with open("miracle_in_the_andes.txt", 'r', encoding="utf-8") as file:
    book = file.read()

# The most used words (excluding stop words)

In [2]:
import re

# A "word" here is any run of ASCII letters; digits, punctuation and
# apostrophes act as separators. Lower-casing first makes the later
# counting case-insensitive.
pattern = re.compile("[a-zA-Z]+")
findings = pattern.findall(book.lower())
findings[:5]

['chapter', 'before', 'it', 'was', 'friday']

In [3]:
# Build a word -> occurrence-count mapping over every token in `findings`.
d = {}
for word in findings:
    # get() yields 0 for a word seen for the first time.
    d[word] = d.get(word, 0) + 1

In [4]:
# Turn the counts into (word, count) pairs ordered from most to least frequent.
# sorted() is stable, so words with equal counts keep their first-seen order.
d_list = sorted(d.items(), key=lambda pair: pair[1], reverse=True)
d_list[:5]

[('the', 5346), ('and', 2795), ('i', 2729), ('to', 2400), ('of', 2060)]

### Install nltk (`pip install nltk`)

In [5]:
import nltk

from nltk.corpus import stopwords
# English stop words from NLTK's stopwords corpus; used below to drop
# very common words from the frequency ranking.
# NOTE(review): presumably the corpus must already be available locally
# (e.g. via nltk.download("stopwords")) — confirm for a fresh environment.
english_stopwords = stopwords.words("english")

In [6]:
# Keep only the (word, count) pairs whose word is not an English stop word.
# Membership is tested against a set rather than the original list, so each
# lookup is constant-time instead of a scan of the whole stop-word list.
stopword_set = set(english_stopwords)

# d_list is already sorted by count, so the filtered result stays sorted too.
filtered_words = [
    (word, count)  # Note the order is (word, count)
    for word, count in d_list
    if word not in stopword_set
]

In [7]:
# Show the ten most frequent words that survived the stop-word filter.
filtered_words[:10]

[('would', 575),
 ('us', 519),
 ('said', 292),
 ('roberto', 284),
 ('could', 252),
 ('one', 249),
 ('snow', 227),
 ('mountain', 183),
 ('time', 182),
 ('like', 165)]

# Sentiment analysis: What is the most positive and the most negative chapter?

### An example

In [8]:
from nltk.sentiment import SentimentIntensityAnalyzer

In [9]:
# One analyzer instance is created here and reused by every scoring cell below.
# NOTE(review): presumably needs the NLTK "vader_lexicon" resource to be
# available locally — confirm for a fresh environment.
analyzer = SentimentIntensityAnalyzer()

In [10]:
# Score a deliberately negative sample sentence to see what the analyzer returns.
scores = analyzer.polarity_scores(
    "Hey, look how bad the trees are. I hate them. I really hate them! HATE, hate, hate!"
)

In [11]:
# Label the text by comparing the positive and negative proportions.
# A tie (pos == neg) is reported as negative.
print(
    "It is a positive text"
    if scores["pos"] > scores["neg"]
    else "It is a negative text"
)

It is a negative text


In [12]:
# Score the entire book in a single call. The result is a dict with the keys
# 'neg', 'neu', 'pos' and 'compound' (see the output below).
analyzer.polarity_scores(book)

{'neg': 0.116, 'neu': 0.76, 'pos': 0.125, 'compound': 1.0}

### Chapter-by-chapter sentiment analysis

In [13]:
import re

# Chapter headings have the form "Chapter <number>". Splitting on them yields
# the text before the first heading followed by one piece per chapter.
# Note that this rebinds `pattern`, which earlier held the word-matching regex.
pattern = re.compile("Chapter [0-9]+")
chapters = pattern.split(book)

In [17]:
# Drop the first piece of the split: it is whatever precedes the first
# "Chapter N" heading, not a chapter.
# The split is recomputed from `book` rather than slicing `chapters` in place,
# so re-running this cell cannot strip off an additional real chapter each time.
chapters = re.split("Chapter [0-9]+", book)[1:]

In [19]:
# Print the sentiment scores of every chapter, one line per chapter.
# `nr` is the zero-based position in `chapters`, so chapter 1 is printed as 0.
for nr, chapter in enumerate(chapters):
    scores = analyzer.polarity_scores(chapter)
    print(f"{nr} {scores}")

0 {'neg': 0.145, 'neu': 0.751, 'pos': 0.105, 'compound': -0.9999}
1 {'neg': 0.141, 'neu': 0.721, 'pos': 0.138, 'compound': -0.9963}
2 {'neg': 0.118, 'neu': 0.742, 'pos': 0.141, 'compound': 0.9997}
3 {'neg': 0.124, 'neu': 0.761, 'pos': 0.115, 'compound': -0.9979}
4 {'neg': 0.136, 'neu': 0.761, 'pos': 0.103, 'compound': -0.9999}
5 {'neg': 0.12, 'neu': 0.786, 'pos': 0.094, 'compound': -0.9998}
6 {'neg': 0.097, 'neu': 0.824, 'pos': 0.079, 'compound': -0.9996}
7 {'neg': 0.086, 'neu': 0.733, 'pos': 0.181, 'compound': 1.0}
