In [None]:
import re
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
tqdm.pandas()

# Text Pre-processing

In [None]:
from nltk import word_tokenize, sent_tokenize
import pandas as pd
import spacy
import gutenbergpy.textget

In [None]:
# Load the per-book metadata table; bytes that cannot be decoded are dropped.
df = pd.read_csv('./data/metadata_added.csv', encoding_errors='ignore')

# Removing non-English works, works without year of first publication, renewed works, and works with various authors
# (author_id 116 is the id excluded for "various authors").
keep_mask = (
    (df['lang'] == 'English')
    & df['year'].notnull()
    & (df['renewed'] == False)
    & (df['author_id'] != 116)
)

# .copy() makes metadata_df an independent frame. A later cell adds a
# 'community' column to it; without the copy that assignment targets a slice
# of df and pandas emits SettingWithCopyWarning.
metadata_df = df[keep_mask].copy()

In [None]:
# Preamble, postamble patterns
#
# Regular expressions for Project Gutenberg boilerplate. get_bow passes each of
# them to re.sub with a count of 1, so only the first match of each is removed.
# `(.|\n)` is used wherever a match must be able to span several lines, because
# '.' alone does not match a newline.
# NOTE(review): `(.|\n)*` is presumably slow on book-length texts; `[\s\S]*`
# looks equivalent here since the captured groups are never used - confirm
# before changing.

# Everything from the start of the text through the "*** START OF ... ***"
# marker. The leading part is greedy, so the match ends at the last such marker.
foreword_pattern = r'(.|\n)*\*\*\* ?START OF(.|\n)*?\*\*\*'
# A bracketed "[Transcriber ...]" note, up to the first closing bracket.
transc_pattern = r'\[Transcriber((.|\n)*?)\]'
# "Produced by ..." credit, up to the first run of three newlines.
prod_pattern = r'Produced by(.|\n)*?\n\n\n'
# "E-text prepared by ..." credit, up to the first blank line.
prep_pattern = r'E-text prepared by(.|\n)*?\n\n'
# "This etext was ..." credit, up to the first blank line.
prep_pattern2 = r'This etext was(.|\n)*?\n\n'
# "Note: ..." paragraph, up to the first blank line.
note_pattern = r'Note: (.|\n)*?\n\n'
# "Illustrated by ..." through the end of that line.
illust1_pattern = r'Illustrated by.*?\n'
# "[Illust..." through the end of that line.
illust2_pattern = r'\[Illust.*?\n'
# "Transcriber's Note" paragraph (either capitalisation of T and N, any single
# character or nothing where the apostrophe goes), up to the first blank line.
transc_note_pattern = r'(T|t)ranscriber(.)?s? (N|n)ote(.|\n)*?\n\n'
# Upper-case variant of the transcriber's note paragraph.
transc_note_pattern2 = r'TRANSCRIBER\'S NOTE(.|\n)*?\n\n'
# Everything from the "*** END OF" marker to the end of the text.
license_pattern = r'\*\*\* ?END OF(?:.|\n)*'
# A single-line "End of ... Gutenberg ..." footer ('.+' does not cross newlines).
end_pattern = r'End of(.+)Gutenberg(.+)'

In [None]:
# stopwords from spaCy

# Small English spaCy pipeline; get_bow calls it on every sentence and reads
# each token's pos_ and lemma_.
nlp = spacy.load('en_core_web_sm')
# spaCy's English stop-word collection, consulted by _bad_word.
# NOTE(review): this attribute path presumably resolves only because loading the
# English model above imports spacy.lang.en - confirm before reordering these lines.
stop_words = spacy.lang.en.stop_words.STOP_WORDS

In [None]:
def _bad_word(word):
    '''
    input: a single token (str)
    output: True if the token must be dropped from the bag-of-words, i.e. it is
            shorter than two characters, contains a non-alphabetic character,
            or is a spaCy stop word; False otherwise
    '''
    too_short = len(word) < 2
    # Checks run in the same order as before: length, alphabetic, stop word.
    return too_short or not word.isalpha() or word in stop_words

def get_bow(book_txt):
    '''
    input: Text of the SF novel (raw Project Gutenberg text, str)
    output: bag-of-words for given novel, one list of lower-cased lemmas per
            sentence (list of list of str); sentences in which no word
            survives the filtering are left out
    '''
    # Boilerplate is stripped in this fixed order and only the first match of
    # each pattern is removed.
    boilerplate_patterns = (
        foreword_pattern, transc_pattern, prod_pattern,
        illust1_pattern, illust2_pattern,
        transc_note_pattern, transc_note_pattern2,
        prep_pattern, prep_pattern2,
        note_pattern, license_pattern, end_pattern,
    )
    for pattern in boilerplate_patterns:
        # count is passed by keyword: passing it positionally to re.sub is
        # deprecated since Python 3.13.
        book_txt = re.sub(pattern, "", book_txt, count=1)

    # removing newline character
    book_txt = book_txt.replace('\n', ' ')

    # Tags treated as proper nouns and excluded from the bag-of-words.
    # NOTE(review): token.pos_ is spaCy's coarse tag, so presumably only
    # 'PROPN' can ever match; the other entries are kept unchanged.
    proper_noun_tags = ('PROPN', 'NNP', 'NNPS', 'NE', 'NNE', 'NR', 'pnc')

    book_bow = []

    # tokenization: sentences are approximated by splitting on '.'
    for sent in book_txt.split('.'):
        # A whitespace-only fragment cannot yield any alphabetic lemma, so skip
        # the (expensive) pipeline call for it.
        if not sent.strip():
            continue
        doc = nlp(sent)
        # lemmatization, removing proper noun
        lemmas = [token.lemma_.lower() for token in doc if token.pos_ not in proper_noun_tags]
        sent_bow = [w for w in lemmas if not _bad_word(w)]
        if sent_bow:
            book_bow.append(sent_bow)

    return book_bow

In [None]:
# Pre-process every selected book and cache the result on disk: one sentence
# per line, lemmas separated by single spaces (the format process_bow reads).
# The ids are taken from metadata_df: `tobe_used` was not defined anywhere in
# this notebook, and the Word2Vec cells read these cache files for every
# book_id in metadata_df.
for book_id in tqdm(metadata_df['book_id'].tolist()):

    # The with-block closes the file; no explicit close() is needed.
    with open(f'./data/original/{book_id}.txt', 'r') as src:
        book_txt = src.read()

    book_bow = get_bow(book_txt)

    with open(f'./data/processed_by_sent_propn_filtered/{book_id}.txt', 'w') as dst:
        dst.writelines(' '.join(line) + '\n' for line in book_bow)

# Word2Vec

In [None]:
from gensim.models import Word2Vec, KeyedVectors
from gensim.corpora import Dictionary

In [None]:
def process_bow(book_id):
    '''
    input: book_id of a novel whose pre-processed text is stored under
           ./data/processed_by_sent_propn_filtered/
    output: sentences of the novel, each a list of words (list of list of str);
            blank lines in the file are skipped
    '''
    # The with-block closes the file; no explicit close() is needed.
    with open(f'./data/processed_by_sent_propn_filtered/{book_id}.txt', 'r') as f:
        lines = f.readlines()

    # split() without an argument drops the trailing newline and never yields
    # empty strings, so a blank line becomes [] and is filtered out below.
    # split(' ') would turn a blank line into [''], which is a non-empty list
    # and would reach Word2Vec as a sentence made of one empty "word".
    sentences = [line.split() for line in lines]
    return [sent for sent in sentences if sent]

In [None]:
# Training corpus: the sentences of every selected book, in metadata_df order.
corpus = [
    sentence
    for book_id in metadata_df['book_id'].tolist()
    for sentence in process_bow(book_id)
]

# Word embeddings with 300 dimensions, trained on the whole corpus.
model = Word2Vec(sentences=corpus, vector_size=300)

In [None]:
def get_doc_vec(book_id):
    '''
    input: book_id of a pre-processed novel
    output: document embedding (1-D np.ndarray with model.wv.vector_size
            entries): the mean of the Word2Vec vectors of every word occurrence
            of the novel that is in the model vocabulary. If no word of the
            novel is in the vocabulary, an all-zero vector is returned.
    '''
    # Size the accumulator from the model rather than a hard-coded 300, so the
    # function keeps working if the model is trained with another vector_size.
    vec = np.zeros(model.wv.vector_size)
    length = 0

    for sentence in process_bow(book_id):
        for word in sentence:
            try:
                vec += model.wv[word]
                length += 1
            except KeyError:
                # Word is not in the model vocabulary: it does not contribute.
                continue

    # Without this guard an out-of-vocabulary-only book gives 0/0, i.e. a NaN
    # vector plus a RuntimeWarning.
    if length == 0:
        return vec
    return vec / length

In [None]:
# One document embedding per selected book, in metadata_df row order.
doc_emb = [get_doc_vec(book_id) for book_id in tqdm(metadata_df['book_id'].tolist())]

len(doc_emb)

# Network Construction

In [None]:
import networkx as nx
from networkx.algorithms.components import connected_components
from numpy.linalg import norm

In [None]:
doc_num = len(doc_emb)

# Pairwise cosine similarity of the document embeddings. The embeddings are
# L2-normalised once and multiplied as a matrix, instead of evaluating
# dot/(norm*norm) in a doc_num x doc_num Python loop that recomputed both norms
# for every pair.
emb_mat = np.asarray(doc_emb, dtype=float)
emb_norms = norm(emb_mat, axis=1, keepdims=True)
# An all-zero embedding has no direction: divide it by 1 instead of 0 so its
# similarities come out as 0 rather than NaN with a division warning.
unit_emb = emb_mat / np.where(emb_norms == 0, 1.0, emb_norms)
sim_mat = unit_emb @ unit_emb.T

print(len(sim_mat))
print(sim_mat)

In [None]:
# Minimum cosine similarity for two works to be linked.
threshold = 0.85
graph = nx.DiGraph()

# Node i is the work in row i (by position) of metadata_df.
for i in range(doc_num):
    work = metadata_df.iloc[i]  # fetch the row once instead of once per attribute
    graph.add_node(i, title=work['title'], year=work['year'], author=work['author'], author_id=work['author_id'])

# Read the years once: metadata_df.iloc[...] inside the doc_num x doc_num loop
# below would build a row Series twice for every pair of works.
years = metadata_df['year'].tolist()

# Directed edge i -> j when the two works are similar enough and i was
# published strictly earlier than j; works of the same year are not linked.
for i in range(doc_num):
    for j in range(doc_num):
        if sim_mat[i][j] > threshold and years[i] < years[j]:
            graph.add_edge(i, j, weight=1)

In [None]:
# Export the directed similarity network, including the node attributes set at
# construction, to a GEXF file in the working directory.
# NOTE(review): a graph read back from GEXF presumably has string node ids
# ('0', '1', ...) instead of the ints used here - confirm before mixing a
# reloaded graph with the in-memory one.
nx.write_gexf(graph, './SF_network.gexf')

# Author Distinctiveness

In [None]:
# Author lookup tables prepared outside this notebook. The cell that follows
# iterates author_dict as author -> list of work indices.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('./author_dict.pkl', 'rb') as author_file:
    author_dict = pickle.load(author_file)

with open('./author_id_dict.pkl', 'rb') as author_id_file:
    author_id_dict = pickle.load(author_id_file)

In [None]:
mod_list = []
author_list = []
doc_num = len(graph.nodes)

# Undirected view of the network, built locally so this cell does not depend on
# a graph object that is only created in a later cell.
author_graph = nx.Graph(graph)

# Node ids are ints when the graph was built in this notebook and may be
# strings when it was reloaded from a file. Resolving them through their
# string form makes the partition below use the graph's actual node objects
# in both cases.
node_by_key = {str(node): node for node in author_graph.nodes}

# For every author with more than one work, measure the modularity of the
# partition "this author's works form one community, every other work is its
# own singleton community".
for auth, auth_list in author_dict.items():
    if len(auth_list) > 1:
        auth_keys = {str(idx) for idx in auth_list}  # set: O(1) membership tests
        auth_comm = [{node_by_key[key] for key in auth_keys}]
        auth_comm += [{node} for key, node in node_by_key.items() if key not in auth_keys]

        author_list.append(auth)
        mod_list.append(nx.community.modularity(author_graph, auth_comm))

In [None]:
# Cumulative probability plot

plt.figure(figsize=(15,10))
# Rescale the modularity values for the log x-axis: multiply by 10000 and shift
# so that the smallest value maps to 1.
modified = np.add(np.multiply(mod_list, 10000), -10000*min(mod_list)+1)

# Complementary cumulative distribution on a 1000-point grid:
# y[k] is the fraction of authors whose rescaled value is >= x[k].
x = np.linspace(min(modified), max(modified), 1000)
y = [np.sum(np.greater_equal(modified, x_val))/len(modified) for x_val in x]

# Keep only the grid points where the curve drops, so that each step of the
# distribution gets exactly one marker.
new_x = []
new_y = []
y_prev = 1

for x_elem, y_elem in zip(x, y):
    if y_elem < y_prev:
        new_x.append(x_elem)
        new_y.append(y_elem)
        y_prev = y_elem

# Reference grid lines.
plt.vlines([1, 2, 4, 8], ymin=0, ymax=2, linestyles='dashdot', color='grey', alpha=0.6)
plt.hlines([1, 0.1, 0.01], xmin=-5, xmax=16, linestyles='dashdot', color='grey', alpha=0.6)
plt.xlim((0.9, 16))
plt.ylim((0.002, 2))

# Blue line with large blue markers, then smaller white markers on top, which
# renders as hollow circles.
plt.plot(new_x, new_y, linewidth=1, color='blue', marker='o', markersize=12)
plt.plot(new_x, new_y, linewidth=0, color='white', marker='o', markersize=7)
plt.xscale('log', base=2)
plt.yscale('log', base=10)

plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.xlabel(r'Distinctiveness $D$', fontsize=25, fontweight='bold')
plt.ylabel(r'Cumulative Probability $Pr (X \geq x)$', fontsize=25, fontweight='bold')

plt.tight_layout()
# plt.savefig('./distinctiveness_cumul.eps', transparent=True)

# Community Structure

In [None]:
# Louvain community detection on the undirected version of the network; the
# seed is fixed so the partition is the same on every run.
undi_graph = nx.Graph(graph)
comm = nx.community.louvain_communities(undi_graph, seed=42)

# Store on every node the position of its community in `comm`.
community_of_node = {
    node_id: comm_idx
    for comm_idx, members in enumerate(comm)
    for node_id in members
}
nx.set_node_attributes(undi_graph, community_of_node, 'community')

In [None]:
def _works_per_bin(community_nodes, bin_edges):
    '''
    input: community_nodes - node ids of one community
           bin_edges - increasing sequence of year bin edges
    output: number of works of the community in each bin (np.ndarray of float,
            len(bin_edges) - 1 entries); years outside the edges are ignored
    '''
    years = [undi_graph.nodes[nod]['year'] for nod in community_nodes]
    counts, _ = np.histogram(years, bins=bin_edges)
    return counts.astype(float)


# 5-year bins covering 1870-1970. The counts are computed with np.histogram:
# the seaborn module (`sns`) is not imported in this notebook, and drawing a
# throwaway figure per community only to read the bar heights back is unnecessary.
bin_starts = list(range(1870, 1970, 5))
bin_edges = bin_starts + [1970]

# Display name -> position of the community in `comm`.
major_communities = {
    'Community I': 12,
    'Community II': 30,
    'Community III': 19,
    'Community IV': 32,
    'Community V': 35,
}

comm_df = pd.DataFrame(
    index=[f'{year}~{year+5}' for year in bin_starts],
    data={
        label: _works_per_bin(comm[comm_idx], bin_edges)
        for label, comm_idx in major_communities.items()
    },
)

comm_df

In [None]:
# Reference: https://stackoverflow.com/questions/41778964/using-both-log-and-stack-on-a-pandas-bar-plot

# Segment heights for a stacked bar chart on a logarithmic y-axis. For a row
# with total T and cumulative counts c_k, the k-th segment ends at T**(c_k / T):
# the whole bar reaches T, and on the log axis each segment's share of the bar
# is proportional to its share of the counts. d holds the heights (differences
# of consecutive segment ends), one row per period.
d = np.zeros(comm_df.shape)
for j in range(len(comm_df)):
    row = comm_df.iloc[j, :]
    total = np.sum(row)
    if total == 0:
        # Empty period: keep the row at zero. Without this guard the formula
        # evaluates 0/0 and log10(0), filling the row with NaN.
        continue
    cum_counts = np.zeros(len(row)+1)
    cum_counts[1:] = np.cumsum(row)
    segment_tops = 10**(cum_counts/total*np.log10(total))
    segment_tops[0] = 0  # the first segment starts at the bottom of the bar
    d[j, :] = np.diff(segment_tops)

pd.DataFrame(d, index=comm_df.index, columns=comm_df.columns)

In [None]:
# Stacked bar chart (log y-axis) of the number of works per 5-year period,
# split by community. d holds the per-segment heights computed in the previous cell.
data_df = pd.DataFrame(d, index=comm_df.index, columns=comm_df.columns)

fig, ax = plt.subplots()
fig.set_figheight(15)
fig.set_figwidth(30)

# NOTE(review): the bars are drawn from a frame indexed by period labels, so
# this year-based x-limit is presumably replaced by the bar plot - confirm.
ax.set_xlim(1865, 1975)
# Lower limit just below 1 so that a bar of total height 1 is still visible on the log axis.
ax.set_ylim(0.999, 1000)
ax.set_xlabel('Year', fontsize=30, fontweight='bold')
ax.set_ylabel('The total number of works', fontsize=30, fontweight='bold')
ax.set_yscale('log')

# comm_ratio_df.plot(kind='bar', stacked=True, ax=ax, color=['red', 'blue', 'orange', 'green', 'purple'], alpha=0.8, align='center', width=0.6)
data_df.plot(kind='bar', stacked=True, ax=ax, color=['red', 'blue', 'orange', 'green', 'purple'], alpha=0.8, align='center', width=0.6)
plt.xticks(fontsize=25, rotation=75)
plt.yticks(fontsize=25)

# Drop the legend created by the bar plot; a restyled one is added below.
ax.get_legend().remove()
# The triple-quoted string below is disabled code (connector lines between
# neighbouring bars); as a bare string expression it has no effect.
'''
for i in range(len(d)-1):
    for j in range(5):
        ax.plot([i+0.3, i+0.7], [sum(data_df.iloc[i, :j+1]), sum(data_df.iloc[i+1, :j+1])], color='black', ls='--', zorder=1)
'''

ax.legend(loc='upper left', fontsize=35, ncols=3)

plt.tight_layout()
# plt.savefig('./community_ratio_stacked_log.eps', transparent=True)

## Prominent Tags

In [None]:
# Subject tags per novel; later cells index it by str(book_id) and read a
# collection of tags for each book.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('./novel_subjects.pkl', 'rb') as subjects_file:
    fiction_subject = pickle.load(subjects_file)

In [None]:
# Attach to every work the community of its network node.
# Node i was created from row i (by position) of metadata_df, so the lookup
# uses the row position, not the index label, which differs from the position
# once rows have been filtered out. Node ids are resolved through their string
# form so the lookup works for int ids (graph built in this notebook) and for
# string ids (graph reloaded from a file).
_node_by_key = {str(node): node for node in undi_graph.nodes}
# assign() returns a new frame, so no column is written into a slice of df.
metadata_df = metadata_df.assign(
    community=[
        undi_graph.nodes[_node_by_key[str(pos)]]['community']
        for pos in range(len(metadata_df))
    ]
)

In [None]:
# Every subject tag that occurs for any novel.
tags = list(fiction_subject.values())
tags_set = {tag for tag_list in tags for tag in tag_list}

# One row per community id (0 .. largest id in metadata_df) instead of a
# hard-coded 37, so the table always matches the detected partition.
n_communities = int(metadata_df['community'].max()) + 1

# comm_sbj_matrix.loc[c, t] = number of works of community c carrying tag t.
comm_sbj_matrix = pd.DataFrame(data=0, index=list(range(n_communities)), columns=list(tags_set), dtype=int)
# The loop variables are deliberately not named `comm`: that name holds the
# list of Louvain communities, and overwriting it with an int breaks any cell
# that indexes it when re-run.
for book_id, comm_id in zip(metadata_df['book_id'].tolist(), metadata_df['community'].tolist()):
    for tag in fiction_subject[str(book_id)]:
        comm_sbj_matrix.loc[comm_id, tag] += 1

In [None]:
# Inverse-document-frequency weight per tag, where the "document frequency" is
# the number of communities in which the tag occurs at least once.
# NOTE(review): 884 is presumably the total number of works - confirm.
communities_with_tag = comm_sbj_matrix.astype(bool).sum(axis=0)
idf = np.log(884 / (1 + communities_with_tag))

# Weight every tag count (column) by the idf of its tag.
tag_idf = comm_sbj_matrix.mul(idf, axis=1)

In [None]:
# Community number and name
# Community I: 12
# Community II: 30
# Community III: 19
# Community IV: 32
# Community V: 35

# Top 20 prominent tags of Community II

community_ii_scores = tag_idf.loc[30]
community_ii_scores.sort_values(ascending=False).head(20)

## Diversity

In [None]:
def get_period_diversity(G, start_year, end_year):
    '''
    input: G - graph whose nodes carry 'year' and 'community' attributes
           start_year, end_year - bounds of the half-open period
                                  [start_year, end_year)
    output: Shannon entropy, in bits, of the distribution of the period's works
            over communities; NaN if the period contains no work
    '''
    # Imported here because scipy is not imported at the top of the notebook.
    from scipy.stats import entropy

    # Number of works per community among the works published in the period.
    comm_dict = {}
    for nod in G.nodes:
        attrs = G.nodes[nod]
        if start_year <= attrs['year'] < end_year:
            comm_dict[attrs['community']] = comm_dict.get(attrs['community'], 0) + 1

    # No work in the period: the distribution is undefined. Return NaN directly
    # rather than dividing zeros by zero.
    if not comm_dict:
        return np.nan

    # Sized from the data instead of a fixed 5 slots, so a period with more
    # than 5 communities no longer fails; communities with no work would
    # contribute 0 to the entropy anyway.
    comm_dist = np.array(list(comm_dict.values()), dtype=float)
    comm_dist /= np.sum(comm_dist)
    return entropy(comm_dist, base=2)

In [None]:
# For every year from 1810 to 1999, look at the 10-year window
# [year-5, year+5): community diversity of the works in it and their number.
years = list(range(1810, 2000))
diversity = [get_period_diversity(undi_graph, year-5, year+5) for year in years]

node_years = [undi_graph.nodes[nod]['year'] for nod in undi_graph.nodes]
works = [
    sum(1 for node_year in node_years if node_year >= year-5 and node_year < year+5)
    for year in years
]

len(works)

In [None]:
# Line plot of the community diversity computed per year in the previous cell.
fig, ax = plt.subplots()
fig.set_figheight(15)
fig.set_figwidth(30)

# Red line with large red markers, then smaller white markers drawn on top,
# which renders as hollow circles.
ax.plot(years, diversity, color='r', marker='o', linewidth=1, markersize=16)
ax.plot(years, diversity, color='w', marker='o', linewidth=0, markersize=10)
ax.set_xlabel('Year', fontsize=30)
ax.set_ylabel('Diversity $H$', color='r', fontsize=30)
ax.tick_params(axis='y', colors='red', labelsize=20)
ax.tick_params(axis='x', labelsize=20)

# Show only 1865-1975 although the series was computed for 1810-1999.
ax.set_xlim([1865, 1975])

plt.tight_layout()
# plt.savefig('./comm_diversity.eps', transparent=True, format='eps')