# Load the book

In [1]:
with open("miracle_in_the_andes.txt", 'r') as file:
    book = file.read()

In [3]:
type(book)

str

# How many chapters?

### With string methods

In [23]:
book.count("Chapter") # 10 chapters, one chapter embedded in the content

11

### With regular express (regex)

In [5]:
import re

In [27]:
pattern = re.compile("Chapter [0-9]+") #
pattern

re.compile(r'Chapter [0-9]+', re.UNICODE)

In [33]:
re.findall(pattern, book)[7:10]

['Chapter 8', 'Chapter 9', 'Chapter 10']

In [31]:
findings = re.findall(pattern, book)
len(findings)

10

# Find all sentences containing "love"

In [46]:
pattern = re.compile("[^.]* love [^.]*.") # ^除了, [^.]*. 所有开头不是.但是末尾是句号.
finding = re.findall(pattern, book)
len(finding) #结果不对，因为还有一种情况，love跟着一个,

49

In [49]:
# Fine tune the above pattern.
pattern = re.compile("[^.]* love[^a-zA-Z]+[^.]*.")
findings = re.findall(pattern, book)
len(findings)
findings[:1] #there is a space in the beginning.

[' As a young man, of course, I could not put these things into words, but I knew, and my teammates knew, that there was something special about the game, and under the guidance of the Christian Brothers we developed a passionate love for the sport that shaped our friendships and our lives.']

In [52]:
# Finer tuning
pattern = re.compile("[A-Z]{1}[^.]* love[^a-zA-Z]+[^.]*.") #开头大写,{1}只有一次
findings = re.findall(pattern, book)
findings[:1]

['As a young man, of course, I could not put these things into words, but I knew, and my teammates knew, that there was something special about the game, and under the guidance of the Christian Brothers we developed a passionate love for the sport that shaped our friendships and our lives.']

# Find the paragraphs containing "love"

In [85]:
pattern = re.compile("[^\n]+ love[^\n]+")
findings = re.findall(pattern, book)
findings[:1]

['To me, this is the essence of rugby. No other sport gives you such an intense sense of selflessness and unified purpose. I believe this is why rugby players all over the world feel such a passion for the game and such a feeling of brotherhood. As a young man, of course, I could not put these things into words, but I knew, and my teammates knew, that there was something special about the game, and under the guidance of the Christian Brothers we developed a passionate love for the sport that shaped our friendships and our lives. For eight years we played our hearts out for the Christian Brothers—a brotherhood of young boys with Latin names, playing a game with deep Anglo roots under Uruguay’s sunny skies, and proudly wearing the bright green shamrock on our uniforms. The game became so much a part of our lives, in fact, that when we graduated from Stella Maris at the age of sixteen, many of us could not bear the thought that our playing days were over. Our salvation came in the form of

# Find all chapter titles

### Method 1

In [86]:
pattern = re.compile("[a-zA-Z ,]+\n\n")
findings = re.findall(pattern, book)
findings = [item.strip("\n\n") for item in findings]
findings

['Before',
 'Everything Precious',
 'A Promise',
 'Breathe Once More',
 'Abandoned',
 'Tomb',
 'East',
 'The Opposite of Death',
 'I See a Man',
 'After']

### Method 2

In [88]:
pattern = re.compile("([a-zA-Z ,]+)\n\n") # ()只返回括号里的
findings = re.findall(pattern, book)
findings

['Before',
 'Everything Precious',
 'A Promise',
 'Breathe Once More',
 'Abandoned',
 'Tomb',
 'East',
 'The Opposite of Death',
 'I See a Man',
 'After']

# The most used words in book

In [67]:
pattern = re.compile("[a-zA-Z]+")
findings = re.findall(pattern, book.lower())
len(findings)
findings[:8]

['chapter', 'before', 'it', 'was', 'friday', 'the', 'thirteenth', 'of']

In [68]:
# Count words
dic = {}
for word in findings:
    if word not in dic:
        dic[word] = 1
    else:
        dic[word] += 1
    

In [78]:
dic_list = [(val, key) for key, val in dic.items()]
sorted(dic_list, reverse=True)[:10]

[(5346, 'the'),
 (2795, 'and'),
 (2729, 'i'),
 (2400, 'to'),
 (2060, 'of'),
 (1566, 'a'),
 (1430, 'was'),
 (1419, 'in'),
 (1226, 'we'),
 (1169, 'my')]

# Define function to find the occurrence of the word

In [89]:
def find(w):
    pattern = re.compile("[a-zA-Z]+")
    findings = re.findall(pattern, book.lower())
    dic = {}
    for word in findings:
        if word not in dic:
            dic[word] = 1
        else:
            dic[word] += 1
    try:
        return dic[w]
    except:
        return f"The book does not contain the word {w}"


In [90]:
find("love")

83

In [91]:
find("hate")

'The book does not contain the word hate'

# NLP Intelligent Method

## The most used words (non-articles)

In [111]:
# or pip install nltk library

# First check python version Jupyter lab is using
from platform import python_version
python_version()

# pip3.9 install nltk

'3.9.7'

In [121]:
import nltk
print(nltk.__version__)
nltk.download("stopwords")

3.7


[nltk_data] Downloading package stopwords to /Users/zhou/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [120]:
from nltk.corpus import stopwords
eng_stopwords = stopwords.words("english")
eng_stopwords[:8]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves']

# Filter book words by the eng_stopwords

In [127]:
filtered_words = []
for count, word in dic_list:
    if word not in eng_stopwords:
        filtered_words.append((word, count))


In [129]:
filtered_words[:3]

[('chapter', 11), ('friday', 1), ('thirteenth', 1)]

# Sentiment Analysis: What is the most positive and the most negative chapter

### An example

In [130]:
from nltk.sentiment import SentimentIntensityAnalyzer

In [133]:
nltk.download("vader_lexicon")
analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/zhou/nltk_data...


In [135]:
dir(analyzer)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_amplify_ep',
 '_amplify_qm',
 '_but_check',
 '_idioms_check',
 '_least_check',
 '_never_check',
 '_punctuation_emphasis',
 '_sift_sentiment_scores',
 'constants',
 'lexicon',
 'lexicon_file',
 'make_lex_dict',
 'polarity_scores',
 'score_valence',
 'sentiment_valence']

In [136]:
analyzer.polarity_scores("Hey, the scenery is so beautiful, I really love here")

{'neg': 0.0, 'neu': 0.418, 'pos': 0.582, 'compound': 0.8945}

### Chapters Sentiment Analyzer

In [137]:
pattern = re.compile("Chapter [0-9]+")
chapters = re.split(pattern, book)

In [141]:
print(len(chapters))
#chapters

11


In [142]:
chapters = chapters[1:]

In [144]:
for num, chapter in enumerate(chapters):
    score = analyzer.polarity_scores(chapter)
    print(f"Chapter {num+1} Polarity Score is {score}")

Chapter 1 Polarity Score is {'neg': 0.061, 'neu': 0.779, 'pos': 0.16, 'compound': 1.0}
Chapter 2 Polarity Score is {'neg': 0.12, 'neu': 0.726, 'pos': 0.154, 'compound': 0.9991}
Chapter 3 Polarity Score is {'neg': 0.145, 'neu': 0.751, 'pos': 0.105, 'compound': -0.9999}
Chapter 4 Polarity Score is {'neg': 0.141, 'neu': 0.721, 'pos': 0.138, 'compound': -0.9963}
Chapter 5 Polarity Score is {'neg': 0.118, 'neu': 0.742, 'pos': 0.141, 'compound': 0.9997}
Chapter 6 Polarity Score is {'neg': 0.124, 'neu': 0.761, 'pos': 0.115, 'compound': -0.9979}
Chapter 7 Polarity Score is {'neg': 0.136, 'neu': 0.761, 'pos': 0.103, 'compound': -0.9999}
Chapter 8 Polarity Score is {'neg': 0.12, 'neu': 0.786, 'pos': 0.094, 'compound': -0.9998}
Chapter 9 Polarity Score is {'neg': 0.097, 'neu': 0.824, 'pos': 0.079, 'compound': -0.9996}
Chapter 10 Polarity Score is {'neg': 0.086, 'neu': 0.733, 'pos': 0.181, 'compound': 1.0}
