# Extracting Sentiment Scores and Preparing Datasets

In [None]:
import os
import oseti
from nltk.sentiment import SentimentIntensityAnalyzer
import statistics
from tqdm import tqdm
import pandas as pd
import numpy as np

In [2]:
sia = SentimentIntensityAnalyzer()
analyzer = oseti.Analyzer()

The collection consists of 42 authors, among which 32 are males and 10 females. <br>
<br>
Male authors:<br>
Abe Kazue<br>
Fujimoto Hitoshi<br>
Fumizawa Ryuichi<br>
Hara Tamiki<br>
Hashioka Takeshi<br>
Hosoda Tamiki<br>
Hotta Yoshie<br>
Iida Momo<br>
Ikuguchi Juro<br>
Inoue Mitsuhara<br>
Ishida Koji<br>
Iwasaki Seiichiro<br>
Kajiyama Tohiyuki<br>
Kamezawa Miyuki<br>
Kanai Toshihiro<br>
Katsura Yoshihisa<br>
Kawakami Sokun<br>
Kokubo Hitoshi<br>
Kurita Tohei<br>
Kyo Kusao<br>
Nakai Masafumi<br>
Nakayama Shiro<br>
Nakazato Kisho<br>
Natsubori Masamoto<br>
Nishihara Kei<br>
Ochi Michio<br>
Oda Katsuzo<br>
Oda Makoto<br>
Oe Kenzaburo<br>
Saiki Hisao<br>
Takeda Taijun<br>
Tsukuda Jitsuo<br>
<br>
Female authors:<br>
<br>
Ariyoshi Sawako<br>
Hayashi Kyoko<br>
Hironaka Toshio<br>
Inada Mihoko<br>
Kora Chihoko<br>
Mikawa Kiyo<br>
Nakamoto Takako<br>
Ota Yoko<br>
Sata Ineko<br>
Takenishi Hiroko<br>
<br>
Among which there are 76 pieces written by males and 41 by females.

In [3]:
authors = os.listdir("preprocessed_texts")
male = []
female = []
male_folders = []
female_folders = []
for author in authors:
    data = author.split(" - ")
    if data[1] == "M":
        male.append(data[0])
        male_folders.append(author)
    elif data[1] == "F":
        female.append(data[0])
        female_folders.append(author)

## Sentiment analysis of the Pieces

Below, in the commented cells follows the code for SA. As the Oseti package seems to be written in pure Python, the process consumes some time, so the results are written in the files which can be located in "data" folder. <br>
The sentiment scores were found separately for the compund texts, the character's speech and author's speech for each work. The code, used for the extraction of the author's and direct speech is placed in "drafts" folder.

In [40]:
def analyze_piece(input_text:str):
    """employs sentiment analysis of the input text,
    returns list of sa score for every sentence, mean value of all sentiment scores,
    and mean value with dropped zero scores"""
    sa_results = analyzer.analyze(input_text)
    non_zero_score = [score for score in sa_results if score != 0]
    return sa_results, statistics.mean(sa_results), statistics.mean(non_zero_score)

In [None]:
# folders = os.listdir("preprocessed_texts") #overall sentiment
# for folder in tqdm(folders):
#     files = os.listdir(f"preprocessed_texts\\{folder}")
#     for doc in files:
#         with open(f"preprocessed_texts\\{folder}\\{doc}", encoding="utf-8") as file:
#             text = file.read()
#         results = analyze_piece(text)
#         text = str(results[0])
#         output_data = text[1:-1]
#         with open(f"data\\SA scores\\{doc}", mode="w", encoding="utf-8") as file:
#             file.write(output_data)

In [None]:
# folders = os.listdir("author's speech") # authors speech sentiment
# for folder in tqdm(folders):
#     files = os.listdir(f"author's speech\\{folder}")
#     for doc in files:
#         with open(f"author's speech\\{folder}\\{doc}", encoding="utf-8") as file:
#             text = file.read()
#         results = analyze_piece(text)
#         text = str(results[0])
#         output_data = text[1:-1]
#         with open(f"data\\SA scores author's speech\\{doc}", mode="w", encoding="utf-8") as file:
#             file.write(output_data)

In [None]:
# folders = os.listdir("direct speech") # direct speech sentiment
# for folder in tqdm(folders):
#     files = os.listdir(f"direct speech\\{folder}")
#     for doc in files:
#         with open(f"direct speech\\{folder}\\{doc}", encoding="utf-8") as file:
#             text = file.read()
#         results = analyze_piece(text)
#         text = str(results[0])
#         output_data = text[1:-1]
#         with open(f"data\\SA scores direct speech\\{doc}", mode="w", encoding="utf-8") as file:
#             file.write(output_data)

In [41]:
lengthes = {}
folders = os.listdir("texts")
male_works = []
female_works = []
for folder in folders:
    subfolders = os.listdir(f"texts\\{folder}")
    if folder in male_folders:
        male_works += subfolders
    elif folder in female_folders:
        female_works += subfolders
    for subfolder in subfolders:
        with open(f"texts\\{folder}\\{subfolder}", encoding="utf-8") as file:
            text = file.read()
        lengthes[subfolder] = len(text)

In [44]:
len(male_works)

76

In [45]:
len(female_works)

41

In [42]:
all_works_raw = female_works + male_works
all_works = [item[:-4] for item in all_works_raw]
ordered_lengthes = [lengthes[item] for item in all_works_raw]

In [43]:

gender= ["F" for i in female_works] + ["M" for i in male_works]

In [44]:
all_data = pd.DataFrame(columns=["Title", "Gender", "Length in characters", "Overall Sentiment", "Direct Speech Sentiment", "Author's Speech Sentiment",
                                  "Difference Value", "Exceeds?"])

In [45]:
docs = os.listdir("data\\SA scores") #extract numbers of overall sentiment for dataframe
ordered_overall_sent = []
for item in all_works_raw:
    with open (f"data\\SA scores\\{item}", encoding="utf-8") as file:
        file_data = file.read()
    file_data = file_data.split(', ')
    file_data = [float(item) for item in file_data]
    ordered_overall_sent.append(statistics.mean(file_data))

In [46]:
#extract numbers of the direct speech sentiment for dataframe
ordered_ds_sent = []
for item in all_works_raw:
    with open (f"data\\SA scores direct speech\\{item}", encoding="utf-8") as file:
        file_data = file.read()
    file_data = file_data.split(', ')
    file_data = [float(item) for item in file_data]
    ordered_ds_sent.append(statistics.mean(file_data))

In [47]:
#extract numbers of the author's speech sentiment for dataframe
ordered_aus_sent = []
for item in all_works_raw:
    with open (f"data\\SA scores author's speech\\{item}", encoding="utf-8") as file:
        file_data = file.read()
    file_data = file_data.split(', ')
    file_data = [float(item) for item in file_data]
    ordered_aus_sent.append(statistics.mean(file_data))

In [48]:
more_intensive = []
for i in range(len(ordered_ds_sent)):
    if ordered_ds_sent[i] > ordered_aus_sent[i]:
        more_intensive.append("Yes")
    else:
        more_intensive.append("No")

In [49]:
all_data["Title"] = all_works
all_data["Gender"] = gender
all_data["Length in characters"] = ordered_lengthes
all_data["Overall Sentiment"] = ordered_overall_sent
all_data["Direct Speech Sentiment"] = ordered_ds_sent
all_data["Author's Speech Sentiment"] = ordered_aus_sent
all_data["Difference Value"] = all_data["Direct Speech Sentiment"] - all_data["Author's Speech Sentiment"]
all_data["Exceeds?"] = more_intensive
all_data

Unnamed: 0,Title,Gender,Length in characters,Overall Sentiment,Direct Speech Sentiment,Author's Speech Sentiment,Difference Value,Exceeds?
0,有吉 佐和子 - 祈禱,F,30102,0.007671,0.065030,-0.035530,0.100560,Yes
1,林 京子 - ギヤマン ビードロ,F,112867,-0.011454,-0.054455,-0.010385,-0.044071,No
2,林 京子 - 二人の墓標,F,22251,-0.110104,-0.139089,-0.098741,-0.040348,No
3,林 京子 - 同期会,F,16390,0.016105,0.142857,0.017386,0.125471,Yes
4,林 京子 - 昭和二十年の夏,F,18826,-0.013255,0.500000,-0.015090,0.515090,Yes
...,...,...,...,...,...,...,...,...
112,大江 健三郎 - ヒロシマ・ノ—卜 7,M,10756,-0.119351,-0.257576,-0.111784,-0.145792,No
113,大江 健三郎 - 核状況のカナリア理論,M,16313,0.006504,-0.204678,0.023987,-0.228665,No
114,斎木寿夫 - 死者は裁かない,M,17833,-0.130481,-0.121918,-0.128818,0.006900,Yes
115,武田 泰淳 - 第一のボタン,M,78975,0.004374,0.045368,-0.023118,0.068487,Yes


In [50]:
all_data.replace(0, np.nan, inplace=True)
all_data.dropna(inplace=True)
all_data

Unnamed: 0,Title,Gender,Length in characters,Overall Sentiment,Direct Speech Sentiment,Author's Speech Sentiment,Difference Value,Exceeds?
0,有吉 佐和子 - 祈禱,F,30102,0.007671,0.065030,-0.035530,0.100560,Yes
1,林 京子 - ギヤマン ビードロ,F,112867,-0.011454,-0.054455,-0.010385,-0.044071,No
2,林 京子 - 二人の墓標,F,22251,-0.110104,-0.139089,-0.098741,-0.040348,No
3,林 京子 - 同期会,F,16390,0.016105,0.142857,0.017386,0.125471,Yes
4,林 京子 - 昭和二十年の夏,F,18826,-0.013255,0.500000,-0.015090,0.515090,Yes
...,...,...,...,...,...,...,...,...
112,大江 健三郎 - ヒロシマ・ノ—卜 7,M,10756,-0.119351,-0.257576,-0.111784,-0.145792,No
113,大江 健三郎 - 核状況のカナリア理論,M,16313,0.006504,-0.204678,0.023987,-0.228665,No
114,斎木寿夫 - 死者は裁かない,M,17833,-0.130481,-0.121918,-0.128818,0.006900,Yes
115,武田 泰淳 - 第一のボタン,M,78975,0.004374,0.045368,-0.023118,0.068487,Yes


In [56]:
import pandas as pd
import numpy as np

df = pd.read_excel('data\\results with dropped zero scores.xlsx')
df.replace(0, np.nan, inplace=True)
df.dropna(inplace=True)
df.to_excel('data\\results with dropped zero scores without zero scores.xlsx', index=False)

In [33]:
all_data.to_excel("data\\results without zero values.xlsx")

## Dataset with dropped zero sentiment scores

As the Oseti sentiment analysis is a rough instruments, it cannot identify the sentiment of a number of sentences as its sentiment dictionary is limited. In this case, it marks these sentences having zero scores. As these zero define only average score, we can skip them, modifying our observations.

In [14]:
 #extract numbers of overall sentiment for dataframe drop zero scores
ordered_overall_sent_n0 = []
for item in all_works_raw:
    with open (f"data\\SA scores\\{item}", encoding="utf-8") as file:
        file_data = file.read()
    file_data = file_data.split(', ')
    file_data = [float(item) for item in file_data]
    file_data_n0 = [item for item in file_data if item != 0]
    ordered_overall_sent_n0.append(statistics.mean(file_data_n0))
    

In [15]:
 #extract numbers of direct speech sentiment for dataframe drop zero scores
ordered_ds_sent_n0 = []
for item in all_works_raw:
    with open (f"data\\SA scores direct speech\\{item}", encoding="utf-8") as file:
        file_data = file.read()
    file_data = file_data.split(', ')
    file_data = [float(item) for item in file_data]
    file_data_n0 = [item for item in file_data if item != 0]
    ordered_ds_sent_n0.append(statistics.mean(file_data))

In [16]:
 #extract numbers of author's speech sentiment for dataframe drop zero scores
ordered_aus_sent_n0 = []
for item in all_works_raw:
    with open (f"data\\SA scores author's speech\\{item}", encoding="utf-8") as file:
        file_data = file.read()
    file_data = file_data.split(', ')
    file_data = [float(item) for item in file_data]
    file_data_n0 = [item for item in file_data if item != 0]
    ordered_aus_sent_n0.append(statistics.mean(file_data_n0))

In [18]:
more_intensive_n0 = []
for i in range(len(ordered_ds_sent_n0)):
    if ordered_ds_sent_n0[i] > ordered_aus_sent_n0[i]:
        more_intensive_n0.append("Yes")
    else:
        more_intensive_n0.append("No")

In [53]:
all_data_n0 = pd.DataFrame(columns=["Title", 'Gender',"Length in characters", "Overall Sentiment", "Direct Speech Sentiment", "Author's Speech Sentiment",
                                  "Difference Value", "Exceeds?"])

In [54]:
all_data_n0["Title"] = all_works
all_data_n0['Gender'] = gender
all_data_n0["Length in characters"] = ordered_lengthes
all_data_n0["Overall Sentiment"] = ordered_overall_sent_n0
all_data_n0["Direct Speech Sentiment"] = ordered_ds_sent_n0
all_data_n0["Author's Speech Sentiment"] = ordered_aus_sent_n0
all_data_n0["Difference Value"] = all_data_n0["Direct Speech Sentiment"] - all_data_n0["Author's Speech Sentiment"]
all_data_n0["Exceeds?"] = more_intensive_n0
all_data_n0

Unnamed: 0,Title,Gender,Length in characters,Overall Sentiment,Direct Speech Sentiment,Author's Speech Sentiment,Difference Value,Exceeds?
0,有吉 佐和子 - 祈禱,F,30102,0.016435,0.065030,-0.063263,0.128293,Yes
1,林 京子 - ギヤマン ビードロ,F,112867,-0.027478,-0.054455,-0.025238,-0.029218,No
2,林 京子 - 二人の墓標,F,22251,-0.230500,-0.139089,-0.203634,0.064546,Yes
3,林 京子 - 同期会,F,16390,0.041546,0.142857,0.045203,0.097654,Yes
4,林 京子 - 昭和二十年の夏,F,18826,-0.031272,0.500000,-0.035756,0.535756,Yes
...,...,...,...,...,...,...,...,...
112,大江 健三郎 - ヒロシマ・ノ—卜 7,M,10756,-0.197422,-0.257576,-0.188040,-0.069536,No
113,大江 健三郎 - 核状況のカナリア理論,M,16313,0.009970,-0.204678,0.037979,-0.242658,No
114,斎木寿夫 - 死者は裁かない,M,17833,-0.353362,-0.121918,-0.351449,0.229531,Yes
115,武田 泰淳 - 第一のボタン,M,78975,0.008466,0.045368,-0.041310,0.086679,Yes


In [55]:
all_data_n0.to_excel("data/results with dropped zero scores.xlsx")

In [92]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder


In [None]:

encoder = LabelEncoder()

All these datasets will be used in main.ipynb in which we analyse the extracted data.