# Requirements

In [None]:
import ast
import re
import string
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from joblib import Parallel, delayed
from PIL import Image
from stop_words import get_stop_words
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
# from nltk.corpus import stopwords  # I DONT KNOW WHY BUT THIS IS NOT WORKING :(

from proj_funs import extract_subjects

In [None]:
# Show how many CPU cores this machine reports; the joblib calls further down
# use n_jobs=-1.
import multiprocessing
multiprocessing.cpu_count()

In [None]:
# Load the English stop-word list and preview its first ten entries.
STOP_WORDS = get_stop_words('english') 
STOP_WORDS[:10]

In [None]:
# Make sure the stop-word list does not contain "non": remove every
# occurrence and log each removal.
for _ in range(STOP_WORDS.count("non")):
    STOP_WORDS.remove("non")
    print("non removed")

# Data Import

In [None]:
# Read the works table from CSV and print its column / dtype / memory summary.
df = pd.read_csv("openlibrary_works.csv")
df.info()

In [None]:
# Preview the first rows of the loaded table.
df.head()

# Subject extraction (EDA)

## Non-Truncated

In [None]:
# Number of works to process (currently the whole table).
N_WORKS = len(df.index) # 50000

print("Beginning subject extraction...")
# One delayed extract_subjects call per row, executed on all available cores.
extraction_jobs = (
    delayed(extract_subjects)(k, row, N_WORKS)
    for k, row in df[:N_WORKS].iterrows()
)
# Each job returns an array-like of subjects; stitch them into a single array.
subjects = np.concatenate(Parallel(n_jobs=-1)(extraction_jobs), axis=0)
print("Subject extraction complete.")

print("\nSample result:")
subjects[:10]

In [None]:
# Flatten the extracted subjects and join them into one space-separated string.
subjects_str = ' '.join(np.ravel(subjects))
print("Complete.")

# Save as a text file. The context manager closes the handle even if the
# write raises, and the explicit encoding keeps non-ASCII subjects from
# depending on the platform default.
with open("allsubj_notrunc_100ksample.txt", "wt", encoding="utf-8") as text_file:
    n = text_file.write(subjects_str)

### Word cloud

In [None]:
# Build the word cloud from whole-subject counts so that multi-word subjects
# stay together:
# https://stackoverflow.com/questions/59148244/keeping-words-together-in-wordcloud

word_could_dict = Counter(subjects)
cloud_builder = WordCloud(background_color='white')
wordcloud = cloud_builder.generate_from_frequencies(word_could_dict)
wordcloud.to_file("word_whole.png")

# Render the cloud inline without axes.
plt.imshow(wordcloud, interpolation='bilInear')
plt.axis('off')
plt.show()


In [None]:
# Raw subject counts, reused from the word-cloud cell.
sub_dictionary = word_could_dict
# Same counts ordered from most to least frequent.
word_freq = dict(
    sorted(sub_dictionary.items(), key=lambda item: item[1], reverse=True)
)

# Relative frequencies as exposed by the fitted word cloud (`words_`).
rel_freq = wordcloud.words_

In [None]:
# Two-column table: relative frequency plus the subject it belongs to,
# re-indexed 0..n-1.
rel_freq_df = (
    pd.DataFrame.from_dict(rel_freq, orient='index', columns=["Freqency"])
    .assign(Subject=lambda frame: frame.index)
    .reset_index(drop=True)
)
rel_freq_df


## Truncated

In [None]:
# Number of works to process (currently the whole table).
N_WORKS = len(df.index) # 50000

print("Beginning subject extraction...")
# Same extraction as the non-truncated section, but with collated=False.
run_parallel = Parallel(n_jobs=-1)
per_work_subjects = run_parallel(
    delayed(extract_subjects)(k, row, N_WORKS, collated = False)
    for k, row in df[:N_WORKS].iterrows()
)
# Merge the per-work results into one flat array.
subjects = np.concatenate(per_work_subjects, axis=0)
print("Subject extraction complete.")

print("\nSample result:")
subjects[:10]

In [None]:
# Join the extracted subject tokens into one space-separated string.
subjects_str = ' '.join(subjects)
print("Complete.")

# Save as a text file. The context manager closes the handle even if the
# write raises, and the explicit encoding keeps non-ASCII subjects from
# depending on the platform default.
with open("allsubj_split.txt", "wt", encoding="utf-8") as text_file:
    n = text_file.write(subjects_str)

In [None]:
# Peek at the first 100 characters of the joined subject string.
subjects_str[:100]

### Word cloud

In [None]:
# Build the word cloud from the counts of the extracted subject tokens:
# https://stackoverflow.com/questions/59148244/keeping-words-together-in-wordcloud

word_could_dict = Counter(subjects)
cloud_builder = WordCloud(background_color='white')
wordcloud = cloud_builder.generate_from_frequencies(word_could_dict)
wordcloud.to_file("word_trunc.png")

# Render the cloud inline without axes.
plt.imshow(wordcloud, interpolation='bilInear')
plt.axis('off')
plt.show()

In [None]:
# create a dictionary of word frequencies
sub_dictionary = wordcloud.process_text(subjects_str)
# sort the dictionary
word_freq = {k: v for k, v in sorted(
    sub_dictionary.items(), reverse=True, key=lambda item: item[1])}

# use words_ to print relative word frequencies
rel_freq = wordcloud.words_


In [None]:
# Two-column table: relative frequency plus the subject it belongs to,
# re-indexed 0..n-1.
rel_freq_df = (
    pd.DataFrame.from_dict(rel_freq, orient='index', columns=["Freqency"])
    .assign(Subject=lambda frame: frame.index)
    .reset_index(drop=True)
)
rel_freq_df


### Bar charts

In [None]:
# Histogram of the relative frequencies, with the x-axis fixed to [0, 1].
sns.set_style("whitegrid")
freq_hist_ax = sns.histplot(data=rel_freq_df, x="Freqency")
freq_hist_ax.set(xlim=(0, 1))


In [None]:
# A4-sized figure (inches) for the horizontal bar chart.
fig, ax = plt.subplots(figsize=(11.7, 8.27))

# Only subjects whose relative frequency exceeds 0.1 are plotted.
sns.barplot(
    x="Freqency",
    y="Subject",
    data=rel_freq_df[rel_freq_df["Freqency"] > 0.1],
    palette="Blues_r",
    ax=ax
)

# Figure.show() needs a GUI backend and only emits a warning under the
# notebook's inline backend; pyplot.show() is what the other plotting cells use.
plt.show()


In [None]:
# Persist the relative-frequency table to disk.
rel_freq_df.to_csv("freq.csv")

# Subject Selection

Preliminary subject genres are based on:
Paper: http://cs231n.stanford.edu/reports/2017/pdfs/814.pdf
Literature guide: https://blog.reedsy.com/book-genres/

In [None]:
# Genre label -> keywords. A work is assigned a genre when any of the genre's
# keywords is found among the work's extracted subjects (see map_1subject).
# Keyword lists must not contain empty strings: an empty keyword would match
# any empty subject token and wrongly tag the work with that genre.
# NOTE(review): the key "grapic_novel" is misspelled but is emitted as a label
# into the saved CSVs, so it is left unchanged here; rename it only together
# with every consumer of those files.
subj_maps = {
    "fiction" : ["fiction","fictitious"],   # Fiction
    "fantasy" : ["fantasy"],   # Fantasy/ Magical Realism
    "mystery" : ["mystery","detective"],   # Mystery/ Horror/ Thriller & Suspense - crime
    "romance" : ["romance"],   # Romance
    "womens" : ["women"],   # Women’s
    "grapic_novel" : ["graphic","pictorial","picture"],   # Graphic Novel/ Comics
    "nonfiction" : ["nonfiction"],   # Nonfiction
    "biography" : ["biography"],   # Memoir & Autobiography/ Biography
    "dining" : ["dining","cooking","food"],   # Food & Drink
    "art" : ["art","poetry","music","architecture","design","arts","picture","photography"],   # Art & Photography
    "history" : ["history","war","historical","histoire"],   # History
    "humor" : ["humor"],   # Humor
    "religion" : ["religion","church","religious","bible","catholic","christian","christianity","theology"],   # Religion & Spirituality
    "social_sci" : ["culture","sociology","communication","religious","civilization","public","religion","customs","social","criticism","government","politics","war","economic","management","world","administration"],   # Social Sciences
    "humanities" : ["ethics","art","arts","customs","culture","behavior","criticism","language","psychology","philosophy","therapy"],   # Humanities
    "business" : ["business", "economic","economics","finance","trade","administration","management"],   # Business
    "politics" : ["government","legislation","military","politics","war","economic","law","congresses","political","policy","international"],   # Politics & Law
    "sci_tech" : ["physics","science","chemistry","mathematics","research","mathematical","sciences","computer","computers","industrial","health","medical","engineering","environmental","architecture","technology"],   # Science & Technology
    "medicine" : ["medical","diseases","health","disease","medicine","nursing","animals"],   # health & medicine
    "educational" : ["education","guidebooks","teaching","study","medical","bibliography","textbooks","research","manuals","handbooks","dictionaries","theory","animals"],   # Education/ guide/ how-to
    "childrens" : ["childrens","children","family","child","nursery"],   # Childrens
    "young_adult" : ["juvenile"]   # YA
}

# Data Cleaning

In [None]:
def get_cover(row):
    """Parse the stringified cover-ID list of one work into an integer array.

    Parameters
    ----------
    row : mapping
        Must provide ``row["covers"]`` as a bracketed, comma-separated string
        such as ``"[123, 456]"``. ``None`` or a float (e.g. NaN for a missing
        cell) is treated as "no covers".

    Returns
    -------
    numpy.ndarray
        The cover IDs as integers; an empty integer array when there are none.

    Raises
    ------
    ValueError
        If an entry between the brackets is not an integer literal.
    """
    raw_covers = row["covers"]
    # A missing cell arrives as None/NaN rather than a string.
    if raw_covers is None or isinstance(raw_covers, float):
        return np.array([], dtype=int)

    # Drop the surrounding brackets, then split on commas; int() tolerates the
    # whitespace that follows each comma.
    cover_tokens = [tok for tok in raw_covers[1:-1].split(",") if tok.strip()]
    if not cover_tokens:
        return np.array([], dtype=int)

    # A plain comprehension replaces the former joblib pool: int() is far
    # cheaper than dispatching each conversion to a worker.
    return np.array([int(tok) for tok in cover_tokens])

In [None]:
def map_1subject(curr_subjects, subj_map):
    """Return the genre label if any of its keywords occurs in the subjects.

    Parameters
    ----------
    curr_subjects : container
        Subjects of one work; only ``in`` membership tests are performed on it.
    subj_map : sequence
        ``(genre_label, keywords)`` pair, e.g. one item of ``subj_maps.items()``.

    Returns
    -------
    The genre label when at least one keyword is found, otherwise ``None``.
    """
    genre_label = subj_map[0]
    for keyword in subj_map[1]:
        if keyword in curr_subjects:
            return genre_label
    return None

In [None]:
def map_subjects(curr_subjects):
    """Map the subjects of one work onto the genre labels of ``subj_maps``.

    Parameters
    ----------
    curr_subjects : container
        Subjects of one work, passed unchanged to ``map_1subject``.

    Returns
    -------
    list
        Every genre label whose keywords match, in ``subj_maps`` order; empty
        when nothing matches.
    """
    # Evaluated serially: each check is a handful of membership tests, so a
    # joblib pool per work costs far more than the work itself (and this
    # function already runs inside an outer Parallel call).
    candidate_genres = (
        map_1subject(curr_subjects, subj_map) for subj_map in subj_maps.items()
    )
    # map_1subject yields None for genres that did not match.
    return [genre for genre in candidate_genres if genre is not None]


In [None]:
def clean_subj_cover_data(k, row, n_works = 4301727, report_freq=1*10**4):
    """Build the cleaned ``[key, covers, genres]`` record for one work.

    Parameters
    ----------
    k : int
        Row label of the work; only used to decide when to print progress.
    row : mapping
        One row of the works table; ``row["key"]`` and ``row["covers"]`` are read.
    n_works : int
        Total shown on the right-hand side of the progress line.
    report_freq : int
        A progress line is printed whenever ``k`` is a multiple of this value.

    Returns
    -------
    list
        ``[row["key"], get_cover(row), map_subjects(extracted subjects)]``.
    """
    subject_tokens = extract_subjects(row = row, collated=False)
    work_key = row["key"]
    cover_ids = get_cover(row)
    genres = map_subjects(subject_tokens)

    # === PROGRESS REPORT ===
    if not k % report_freq:
        print('{0:-8} / {1:}'.format(k, n_works))

    return [work_key, cover_ids, genres]

In [None]:
# Pass the real number of rows so the progress lines show "k / <actual total>"
# instead of the hard-coded default of clean_subj_cover_data.
n_total = len(df.index)
df_clean_data = Parallel(n_jobs=-1)(
    delayed(clean_subj_cover_data)(k, row, n_works=n_total) for k, row in df.iterrows())

# One row per work: its key, its array of cover IDs and its list of genres.
df_clean = pd.DataFrame(df_clean_data, columns= ["key","cover", "subjects"])
df_clean

In [None]:
# Persist the cleaned table to disk.
df_clean.to_csv("df_clean.csv")

In [None]:
# keep only works that have cover and subject
df = df_clean
df=df[df['subjects'].astype(bool)]
# df_clean = df_clean[df_clean.all(1)]

In [None]:
# Summary of the filtered table.
df.info()

In [None]:
# create new row for each cover
df = df.reset_index(drop=True)
lstcol = df.cover.values
lstcollist = []
indexlist = []
countlist = []
for ii in range(len(lstcol)):
    lstcollist.extend(lstcol[ii])
    indexlist.extend([ii]*len(lstcol[ii]))
    countlist.extend([jj for jj in range(len(lstcol[ii]))])
df = pd.merge(
    df.drop("cover",axis=1),
    pd.DataFrame({"cover":lstcollist,"lstcol_num":countlist},index=indexlist),
    left_index=True,right_index=True).reset_index(drop=True)

df

#REF
# what is extend: https://stackoverflow.com/questions/252703/what-is-the-difference-between-pythons-list-methods-append-and-extend
# ravel method from https://stackoverflow.com/questions/27263805/pandas-column-of-lists-create-a-row-for-each-list-element

In [None]:
# Drop rows whose cover number is negative (one work carries -1 as a cover ID).
# Filtering on the value instead of a hard-coded row label keeps this cell
# correct when the input data or the row order changes, and makes it safe to
# re-run.
df = df[df["cover"] >= 0]

In [None]:
# Persist the one-row-per-cover table to disk.
df.to_csv("df_clean_extended.csv")

**Note:** we will not use the extended (one-row-per-cover) table because of recurring cover images.

In [None]:
# Reload the one-row-per-cover table from disk.
# NOTE(review): list-valued columns such as `subjects` presumably come back as
# plain strings after the CSV round-trip — confirm before iterating over them.
df = pd.read_csv("df_clean_extended.csv")

In [None]:
# Keep only the rows with lstcol_num == 0, i.e. the first cover listed for each work.
df = df[df["lstcol_num"]==0]

In [None]:
# Drop the cover-position helper column and the "Unnamed: 0" column that the
# CSV reload produced.
df = df.drop(columns=["lstcol_num","Unnamed: 0"])
df

In [None]:
# Persist the one-cover-per-work table to disk.
df.to_csv("df_clean_uniqkey.csv")

## Subject visualization

In [None]:
# `subjects` holds real lists when df comes straight from the cleaning cells,
# but string reprs such as "['fiction', 'art']" once df has been reloaded from
# CSV. Parse the latter so that genres are counted rather than the single
# characters of each string.
subject_lists = [
    ast.literal_eval(entry) if isinstance(entry, str) else entry
    for entry in df.subjects.values
]
# Flatten to one genre label per (work, genre) pair and count them.
subjs = [item for sublist in subject_lists for item in sublist]
word_could_dict = Counter(subjs)
wordcloud = WordCloud().generate_from_frequencies(word_could_dict)
sub_dictionary = word_could_dict
# Same counts ordered from most to least frequent.
word_freq = {k: v for k, v in sorted(
    sub_dictionary.items(), reverse=True, key=lambda item: item[1])}

# Relative frequencies as exposed by the fitted word cloud (`words_`).
rel_freq = wordcloud.words_

In [None]:
# Two-column table: relative frequency plus the genre it belongs to,
# re-indexed 0..n-1.
rel_freq_df = (
    pd.DataFrame.from_dict(rel_freq, orient='index', columns=["Freqency"])
    .assign(Subject=lambda frame: frame.index)
    .reset_index(drop=True)
)
rel_freq_df
