<a href="https://colab.research.google.com/github/ynebin/class2024Spring/blob/main/W11_0517.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h1>Week 11.
<br>
Natural Language Processing 2</h1>

In [None]:
# Fetch the book text files; a later cell globs
# "Harry-Potter-Text-Mining/Book Text/*.txt" from this checkout.
!git clone https://github.com/ErikaJacobs/Harry-Potter-Text-Mining.git

## Word count

<h3>Load data</h3>

In [None]:
import glob

# Every .txt file under the cloned repository's "Book Text" directory.
# glob returns matches in arbitrary order; the loading cell sorts them.
book_text_pattern = "Harry-Potter-Text-Mining/Book Text/*.txt"
text_paths = glob.glob(book_text_pattern)

In [None]:
import pandas as pd

# Read every book file ("@"-separated fields) in sorted path order.
# The frames are collected first and concatenated once: calling pd.concat
# inside the loop re-copies all earlier rows on each iteration, and
# concatenating onto an empty DataFrame() seed triggers pandas'
# empty-entry FutureWarning.
book_frames = [pd.read_csv(text_path, sep="@") for text_path in sorted(text_paths)]

# ignore_index=True yields a fresh 0..n-1 row index, replacing the former
# reset_index().drop("index", axis=1) round trip.
# pd.concat raises on an empty list, so fall back to an empty frame when no
# files were found (the same result the original loop produced).
df = pd.concat(book_frames, ignore_index=True) if book_frames else pd.DataFrame()

<h3>Tokenize</h3>

In [None]:
from nltk.tokenize import RegexpTokenizer

# Tokenizer whose tokens are the runs of word characters matched by the
# pattern, so punctuation and whitespace are left out.
# The pattern is a raw string: in a plain literal "\w" is an invalid escape
# sequence, which Python warns about at compile time. The regex itself is
# unchanged.
retokenize = RegexpTokenizer(r"[\w]+")

In [None]:
# Lower-case each text first so tokens compare case-insensitively, then
# split it into a list of tokens with the regex tokenizer.
lowercased_text = df["Text"].str.lower()
df["Tokens"] = lowercased_text.map(retokenize.tokenize)

<h3>Number of words in each book</h3>

In [None]:
# "Tokens" holds a list per row, and .str.len() returns the length of each
# element, so this is the number of tokens in each row.
df["WordCount"] = df["Tokens"].str.len()

In [None]:
import matplotlib.pyplot as plt

# Display names for the bars. Tick labels are applied by position, so this
# list must be in the same order as the sorted "Book" groups.
titles = [
    "Philosopher's Stone",
    "Chamber of Secrets",
    "Prisoner of Azkaban",
    "Goblet of Fire",
    "Order of the Phoenix",
    "Half Blood Prince",
    "Deathly Hallows",
]

# Total word count per book: sum the per-row counts within each "Book" group.
total_result = df.groupby("Book")["WordCount"].sum().reset_index()

# Work on the Axes pandas returns instead of pyplot's implicit current axes,
# and give the figure a title and y-axis label so it can be read on its own.
ax = total_result.plot(
    x="Book",
    y="WordCount",
    kind="bar",
    figsize=(15, 10),
    color=['#DC8458', '#950702', '#8E067D', '#2E8C44', '#395196', '#60A619', '#ECA10A'],
    legend=False,
)
# Tick count follows the label list rather than a separate hard-coded 7.
ax.set_xticks(range(len(titles)))
ax.set_xticklabels(titles)
ax.set_title("Number of words in each book")
ax.set_ylabel("Word count")
plt.show()

<h3>Average number of words per chapter of each book</h3>

In [None]:
# Mean of the per-row word counts within each "Book" group.
mean_result = df.groupby("Book")["WordCount"].mean().reset_index()

# Work on the Axes pandas returns instead of pyplot's implicit current axes,
# and give the figure a title and y-axis label so it can be read on its own.
ax = mean_result.plot(
    x="Book",
    y="WordCount",
    kind="bar",
    figsize=(15, 10),
    color=['#DC8458', '#950702', '#8E067D', '#2E8C44', '#395196', '#60A619', '#ECA10A'],
    legend=False,
)
# Labels are positional: `titles` must match the sorted "Book" group order.
# Tick count follows the label list rather than a separate hard-coded 7.
ax.set_xticks(range(len(titles)))
ax.set_xticklabels(titles)
ax.set_title("Average number of words per chapter of each book")
ax.set_ylabel("Mean word count")
plt.show()

<h3>Shortest chapter of each book</h3>

In [None]:
# Smallest per-row word count within each "Book" group.
min_result = df.groupby("Book")["WordCount"].min().reset_index()

# Work on the Axes pandas returns instead of pyplot's implicit current axes,
# and give the figure a title and y-axis label so it can be read on its own.
ax = min_result.plot(
    x="Book",
    y="WordCount",
    kind="bar",
    figsize=(15, 10),
    color=['#DC8458', '#950702', '#8E067D', '#2E8C44', '#395196', '#60A619', '#ECA10A'],
    legend=False,
)
# Labels are positional: `titles` must match the sorted "Book" group order.
# Tick count follows the label list rather than a separate hard-coded 7.
ax.set_xticks(range(len(titles)))
ax.set_xticklabels(titles)
ax.set_title("Shortest chapter of each book")
ax.set_ylabel("Word count")
plt.show()

<h3>Longest chapter of each book</h3>

In [None]:
# Largest per-row word count within each "Book" group.
max_result = df.groupby("Book")["WordCount"].max().reset_index()

# Work on the Axes pandas returns instead of pyplot's implicit current axes,
# and give the figure a title and y-axis label so it can be read on its own.
ax = max_result.plot(
    x="Book",
    y="WordCount",
    kind="bar",
    figsize=(15, 10),
    color=['#DC8458', '#950702', '#8E067D', '#2E8C44', '#395196', '#60A619', '#ECA10A'],
    legend=False,
)
# Labels are positional: `titles` must match the sorted "Book" group order.
# Tick count follows the label list rather than a separate hard-coded 7.
ax.set_xticks(range(len(titles)))
ax.set_xticklabels(titles)
ax.set_title("Longest chapter of each book")
ax.set_ylabel("Word count")
plt.show()

<h3>Shortest chapter of each book (DataFrame)</h3>

In [None]:
min_book = min_result["Book"]
min_word_count = min_result["WordCount"]

# For each book, select the row(s) of df whose word count equals that book's
# minimum; ties keep every matching row, and original row labels are kept.
# The slices are collected and concatenated once instead of growing a frame
# with pd.concat inside the loop, which re-copies earlier rows each pass and
# triggers pandas' empty-entry FutureWarning on the empty seed frame.
min_frames = [
    df[(df["WordCount"] == wc) & (df["Book"] == b)]
    for b, wc in zip(min_book, min_word_count)
]

# pd.concat raises on an empty list; with no books, keep the empty frame the
# original loop would have produced.
min_df = pd.concat(min_frames) if min_frames else pd.DataFrame()

In [None]:
# Show the rows of df that hold each book's minimum WordCount.
min_df

<h3>Longest chapter of each book (DataFrame)</h3>

In [None]:
max_book = max_result["Book"]
max_word_count = max_result["WordCount"]

# For each book, select the row(s) of df whose word count equals that book's
# maximum; ties keep every matching row, and original row labels are kept.
# The slices are collected and concatenated once instead of growing a frame
# with pd.concat inside the loop, which re-copies earlier rows each pass and
# triggers pandas' empty-entry FutureWarning on the empty seed frame.
max_frames = [
    df[(df["WordCount"] == wc) & (df["Book"] == b)]
    for b, wc in zip(max_book, max_word_count)
]

# pd.concat raises on an empty list; with no books, keep the empty frame the
# original loop would have produced.
max_df = pd.concat(max_frames) if max_frames else pd.DataFrame()

In [None]:
# Show the rows of df that hold each book's maximum WordCount.
max_df

## Sentiment analysis

<h3>Load data</h3>

In [None]:
# Load the "@"-separated sentence table; later cells read its "Sentence",
# "Book" and "Chapter" columns.
# NOTE(review): no cell visible in this notebook creates "sent_df.csv" —
# presumably it must already be in the working directory; confirm its source.
sent_df = pd.read_csv("sent_df.csv", sep="@")

In [None]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Download the VADER lexicon resource before creating the analyzer.
nltk.download('vader_lexicon')

# Same class as nltk.sentiment.vader.SentimentIntensityAnalyzer, referenced
# through the name imported above.
analyzer = SentimentIntensityAnalyzer()

<h3>Sentiment score</h3>

In [None]:
# Score every sentence; each cell of "Score" holds the analyzer's result,
# from which the next cell reads the "compound", "pos", "neu" and "neg" keys.
sentences = sent_df["Sentence"]
sent_df["Score"] = sentences.map(analyzer.polarity_scores)

In [None]:
# Spread the per-sentence score mapping into one numeric column per key.
# `key=score_key` binds the current key at definition time for each lambda.
for column_name, score_key in (
    ("TotalScore", "compound"),
    ("PosScore", "pos"),
    ("NeuScore", "neu"),
    ("NegScore", "neg"),
):
    sent_df[column_name] = sent_df["Score"].apply(
        lambda scores, key=score_key: scores[key]
    )

In [None]:
# Drop the original score mapping now that each component has its own column.
# errors="ignore" keeps this cell re-runnable after "Score" is already gone
# (a plain drop raises KeyError on the second run).
sent_df = sent_df.drop(columns=["Score"], errors="ignore")

# Flag each sentence from its compound score with +/-0.05 cut-offs:
# >= 0.05 positive, <= -0.05 negative, strictly in between neutral.
# Vectorised comparisons give the same booleans as the former per-element
# apply(lambda ...) calls without a Python-level call per row.
total_score = sent_df["TotalScore"]
sent_df["PosFlag"] = total_score >= 0.05
sent_df["NeuFlag"] = (total_score > -0.05) & (total_score < 0.05)
sent_df["NegFlag"] = total_score <= -0.05

In [None]:
# Mean compound score of the sentences in each book.
scores_by_book = sent_df.groupby("Book")["TotalScore"]
scores_by_book.mean()

In [None]:
# Summing a boolean flag column counts its True rows.
positive_count = sent_df['PosFlag'].sum()
negative_count = sent_df['NegFlag'].sum()
neutral_count = sent_df['NeuFlag'].sum()

print(f"There are {positive_count} positive sentences.")
print(f"There are {negative_count} negative sentences.")
print(f"There are {neutral_count} neutral sentences.")

<h3>Visualization</h3>

In [None]:
# Mean compound score for every (Chapter, Book) pair.
chapter_means = sent_df.groupby(["Chapter", "Book"])["TotalScore"].mean()

# unstack() moves the innermost index level ("Book") into columns, leaving
# "Chapter" as the row index; subplots=True then draws one panel per book.
chapter_by_book = chapter_means.unstack()
chapter_by_book.plot(subplots=True, figsize=(15, 10), ylim=(-0.3, 0.3))
plt.show()