# Import Necessary Libraries

In [None]:
import os, re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from pandas import DataFrame, Series

# Set adjustable parameters

In [None]:
# Punctuation and symbols that are stripped from every text before
# counting. To add one, write it in quotation marks, separate it
# from its neighbours with a comma, and keep it inside the square
# brackets.
characters_to_remove = [
    '』', '。', '！', '，', '：', '、', '（', '）', '；', '？',
    '〉', '〈', '」', '「', '『', '“', '”',
    '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',',
    '-', '.', '/',
    '《', '》', '·',
]

# What should be counted?
#   "common" -> the most frequent characters found in the corpus
#   "custom" -> only the entries listed in custom_vocabulary
common_custom = "common"

# Number of most-frequent characters to keep when common_custom is
# "common". None keeps every character. Ignored for "custom".
number_of_features = 100

# The vocabulary to count when common_custom is "custom".
# Ignored for "common".
custom_vocabulary = ["之", "乎", "者", "也"]

# Range of n-gram sizes to count, as (smallest, largest):
#   (1, 1) -> 1-grams only (often a good starting place)
#   (1, 3) -> 1-grams, 2-grams and 3-grams
#   (2, 2) -> 2-grams only
ngrams = (1, 1)

# Number of principal components to calculate.
# Generally, 2 is plenty.
components = 2

# Name of the folder that holds the texts.
# "corpus" is the recommended name.
corpusfolder = "corpus"

# Name of the metadata file.
# "metadata.txt" is the recommended name.
metadatafilename = "metadata.txt"

# Should every text be cut into sections of equal length?
# True splits the texts, False keeps each text whole.
break_apart = False

# Length of each section, in characters, when texts are split.
# Has no effect while break_apart is False.
divlimit = 10000

# Whole texts differ in length, so when the texts are not being
# divided their counts are rescaled to a rate per 1,000 counted
# characters. Normalization is therefore always switched on when
# break_apart is False and off otherwise. Leaving this as is
# is recommended.
normalize = not break_apart

# Set up plot parameters

In [None]:
# Which metadata field decides the color of each point?
# Valid choices are "genre", "author", or "era".
# The plotting code further down defines a list of 15 colors,
# so with more than 15 unique categories not every category
# can get a color of its own. Either add more colors to that
# list or reduce the number of categories.
plot_label_color = "genre"

# Should each point be labelled with the title of its text?
# True adds the labels, False leaves the points bare.
annotate = False

# Size of the plot in inches, as (width, height).
# (7, 7) is 7 by 7 inches, (11, 8.5) is 11 inches wide
# by 8.5 inches tall.
plot_size = (11, 11)

# File name under which the plot is saved to disk. The
# extension selects the format: pdf, png, tif, svg, jpg,
# raw, eps, ps, pgf, bmp, gif, rgba, or svgz. The loadings
# plot is saved under a second name derived from this one.
plot_name = "text.pdf"


# Set Up Functions (YOU DON'T NEED TO CHANGE ANYTHING AFTER THIS)

In [None]:
def clean(instring, unwanted_chars=None):
    """Return *instring* reduced to the characters that should be counted.

    The following are removed, in this order:

    1. ``~~~START|...|START~~~`` header markers (the marker has to sit
       on a single line, because ``.`` does not match a newline here),
    2. ASCII letters and digits,
    3. all whitespace,
    4. every string listed in *unwanted_chars*.

    Parameters
    ----------
    instring : str
        The raw text.
    unwanted_chars : iterable of str, optional
        Strings to delete from the text. When omitted, the module-level
        ``characters_to_remove`` list is used.

    Returns
    -------
    str
        The cleaned text.
    """
    if unwanted_chars is None:
        unwanted_chars = characters_to_remove
    instring = re.sub(r'~~~START\|.+?\|START~~~', "", instring)
    instring = re.sub(r'[a-zA-Z0-9]', "", instring)
    # Raw string: "\s" in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions.
    instring = re.sub(r'\s+', "", instring)
    # str.replace (rather than a translation table) so that entries
    # longer than one character keep working.
    for char in unwanted_chars:
        instring = instring.replace(char, "")
    return instring

def textBreak(inputstring, divlim=None):
    """Cut *inputstring* into consecutive sections of equal length.

    Only complete sections are returned: a remainder shorter than
    *divlim* at the end of the text is discarded, and a text shorter
    than *divlim* yields an empty list.

    Parameters
    ----------
    inputstring : str
        The text to divide.
    divlim : int, optional
        Length of each section. When omitted, the module-level
        ``divlimit`` setting is used.

    Returns
    -------
    list of str
        The sections, in their original order.

    Raises
    ------
    ValueError
        If the section length is smaller than 1.
    """
    if divlim is None:
        divlim = divlimit
    if divlim < 1:
        raise ValueError(
            "section length must be at least 1, got {0!r}".format(divlim))
    # The last start position that still leaves room for a full section
    # is len(inputstring) - divlim, hence the "+ 1" on the stop value.
    last_start = len(inputstring) - divlim
    return [inputstring[start:start + divlim]
            for start in range(0, last_start + 1, divlim)]

def info_for_graph(input_list):
    """Translate a list of category names into integer class labels.

    Categories are numbered in the order in which they first appear in
    *input_list*. (Numbering them by iterating over a ``set`` would make
    the category-to-number assignment, and with it the plot colors and
    legend order, change from one run to the next.)

    Parameters
    ----------
    input_list : list
        One hashable category value per document.

    Returns
    -------
    unique_labels : list of int
        The class numbers ``0 .. k-1``.
    class_array : numpy.ndarray
        The class number of every entry of *input_list*, in order.
    unique_values : list
        The distinct categories; ``unique_values[i]`` is the category
        that was given class number ``i``.
    """
    # dict.fromkeys removes duplicates while keeping first-seen order.
    unique_values = list(dict.fromkeys(input_list))
    unique_labels = list(range(len(unique_values)))
    label_of = dict(zip(unique_values, unique_labels))
    # dtype=int keeps the array integer-typed even for empty input.
    class_array = np.array([label_of[item] for item in input_list],
                           dtype=int)
    return unique_labels, class_array, unique_values

# Set Up Empty Variables For Information

In [None]:
# Lookup tables. `metadata` maps the first column of the metadata
# file to the remaining columns of that row; the title_* tables
# map a title to its author, era and genre.
metadata = {}
title_author, title_era, title_genre = {}, {}, {}

# Parallel lists with one entry per document (or per section when
# the texts are broken apart): the cleaned text and its metadata.
info_list = []
title_list, author_list, era_list, genre_list = [], [], [], []

# Position of each section within its text; only filled in when
# the texts are broken apart.
section_number = []

# Load in Metadata

In [None]:
# Read the metadata table into the `metadata` dictionary.
# This should be a tab separated file with the columns
# filename (minus .txt), title, author, era, genre
# The first column becomes the key, the remaining columns the value.
#
# "utf-8-sig" reads ordinary UTF-8 and additionally drops a leading
# byte-order mark, which would otherwise end up inside the first key.
# The with-statement closes the file even if reading fails.
with open(metadatafilename, "r", encoding="utf-8-sig") as metadatafile:
    metadatastring = metadatafile.read()

lines = metadatastring.split("\n")
for line in lines:
    # Skip blank lines (such as the one after a trailing newline) so
    # they do not create an entry under an empty key.
    if not line.strip():
        continue
    cells = line.split("\t")
    metadata[cells[0]] = cells[1:]

# Load in Texts

In [None]:
# Read every text in the corpus folder (including sub-folders), clean
# it, and record it together with its metadata. Each file name without
# its extension must be a key of `metadata`, whose value holds at
# least title, author, era and genre, in that order.
for root, dirs, files in os.walk(corpusfolder):
    # Sorting makes the order of the documents independent of the
    # order in which the file system happens to list them.
    dirs.sort()
    for filename in sorted(files):
        # Hidden files (e.g. .DS_Store) are not texts.
        if filename.startswith("."):
            continue

        with open(os.path.join(root, filename), "r", encoding="utf8") as f:
            c = clean(f.read())

        text_id = os.path.splitext(filename)[0]
        try:
            metainfo = metadata[text_id]
        except KeyError:
            raise KeyError(
                "No metadata entry for '{0}' (file '{1}') in '{2}'".format(
                    text_id, os.path.join(root, filename),
                    metadatafilename)) from None
        if len(metainfo) < 4:
            raise ValueError(
                "Metadata entry for '{0}' needs title, author, era and "
                "genre but has only {1} column(s)".format(
                    text_id, len(metainfo)))
        title, author, era, genre = metainfo[:4]

        title_author[title] = author
        title_era[title] = era
        title_genre[title] = genre

        if not break_apart:
            sections = [c]
        else:
            sections = textBreak(c)
            if not sections:
                # A text shorter than one section contributes nothing;
                # say so instead of dropping it silently.
                print("Note: '{0}' is shorter than divlimit ({1}) and "
                      "yields no sections; it is left out of the "
                      "analysis.".format(filename, divlimit))
            section_number.extend(range(len(sections)))

        # One metadata entry per section keeps all lists parallel.
        count = len(sections)
        info_list.extend(sections)
        title_list.extend([title] * count)
        author_list.extend([author] * count)
        era_list.extend([era] * count)
        genre_list.extend([genre] * count)

# Set Up Vectorizer

In [None]:
# Build the character n-gram counter. "common" keeps the
# number_of_features most frequent n-grams of the corpus, "custom"
# counts exactly the entries of custom_vocabulary.
if common_custom == "common":
    vectorizer = CountVectorizer(analyzer="char", ngram_range=ngrams,
                                 max_features=number_of_features)
elif common_custom == "custom":
    vectorizer = CountVectorizer(analyzer="char", ngram_range=ngrams,
                                 vocabulary=custom_vocabulary)
else:
    # Fail here with a clear message rather than later with a
    # NameError because `vectorizer` was never created.
    raise ValueError(
        'common_custom must be "common" or "custom", got {0!r}'.format(
            common_custom))

# Vectorize the Documents

In [None]:
# Count the features of every document. The result has one row per
# document and one column per entry of `vocab`.
word_count_matrix = vectorizer.fit_transform(info_list)

# get_feature_names() was replaced by get_feature_names_out() in newer
# scikit-learn releases; use whichever this installation provides.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()

dense_words = word_count_matrix.toarray()

if normalize:
    # Rescale every row to a rate per 1,000. The row total is the sum
    # of the counted features, so the rate is relative to the selected
    # vocabulary rather than to the full length of the text.
    corpus_dataframe = DataFrame(dense_words, columns=vocab)
    doclengths = corpus_dataframe.sum(axis=1)
    # A document without any counted feature has a total of 0; dividing
    # by 1 instead leaves its row at all zeros and avoids inf/NaN.
    adjusteddoclengths = 1000 / doclengths.replace(0, 1)
    per_thousand = corpus_dataframe.multiply(adjusteddoclengths, axis=0)
    # .values is available in every pandas version (as_matrix() is not).
    dense_words = per_thousand.values

# Prepare the PCA object

In [None]:
# Create the (not yet fitted) PCA model. `components` is the number of
# principal components to keep; the plotting code later in this file
# reads the first two of them.
pca = PCA(n_components=components)

# Perform the PCA

In [None]:
# Fit the model on the document-feature matrix, then project that same
# matrix onto the fitted components. my_pca is indexed below as
# [document, component].
my_pca = pca.fit(dense_words).transform(dense_words)

# Set up and create plot

In [None]:
# Draw the documents in the plane of the first two principal
# components, colored by the metadata field named in plot_label_color,
# save the figure as plot_name, and show it.
from sys import platform

# Pick a font that contains CJK glyphs on the current platform.
if platform.startswith("linux"):
    print("Sorry, I can't see the appropriate fonts, defaulting to Japanese")
    matplotlib.rc('font', family="TakaoPGothic")
elif platform.startswith("win"):
    matplotlib.rc('font', family="SimHei")
elif platform == "darwin":
    matplotlib.rc('font', family='STHeiti')

# Validate the settings before any figure is created, so that a typo
# produces a clear message instead of a NameError or IndexError.
label_sources = {"genre": genre_list, "author": author_list,
                 "era": era_list}
if plot_label_color not in label_sources:
    raise ValueError(
        'plot_label_color must be "genre", "author" or "era", '
        'got {0!r}'.format(plot_label_color))
if my_pca.shape[1] < 2:
    raise ValueError(
        "The plot needs at least 2 principal components, but only {0} "
        "were calculated".format(my_pca.shape[1]))

plt.figure(figsize=plot_size)

unique_labels, info_labels, unique_genres = info_for_graph(
    label_sources[plot_label_color])

colors = ["red", "blue", "magenta", "cyan", "black", "gray", "pink",
          "orange", "yellow", "green", "brown", "beige", "purple",
          "lavender", "lightblue"]

# This code is partially adapted from brandonrose.org/clustering
for each_class, label in zip(unique_labels, unique_genres):
    # Re-use the colors from the start when there are more categories
    # than colors, so that every category is still drawn.
    color = colors[each_class % len(colors)]
    # The boolean mask selects the documents of this category.
    in_class = info_labels == each_class
    plt.scatter(my_pca[in_class, 0], my_pca[in_class, 1],
                label=label, color=color)


annotate_plot = annotate
if annotate_plot:
    # title_list is parallel to the rows of my_pca.
    for i, text_label in enumerate(title_list):
        plt.annotate(text_label, xy=(my_pca[i, 0], my_pca[i, 1]),
                     xytext=(my_pca[i, 0], my_pca[i, 1]),
                     size=8)

# The axis labels report the share of variance explained, in percent.
plt.title("Principal Component Analysis")
plt.xlabel("PC1: " + "{0:.2f}".format(pca.explained_variance_ratio_[0] * 100)+"%")
plt.ylabel("PC2: " + "{0:.2f}".format(pca.explained_variance_ratio_[1] * 100)+"%")
plt.legend()

# Save before show(): the figure may be empty once show() has returned.
plt.savefig(plot_name)
plt.show()


# Draw the loadings: every feature is written at the position given by
# its weight on the first and on the second principal component. The
# figure is saved next to the main plot with "_loadings" inserted
# before the file extension, and then shown.
plt.figure(figsize=plot_size)

# pca.components_ is indexed as [component, feature].
loadings = pca.components_

# Invisible points (alpha=0) make the axes scale to the loadings; the
# visible marks are the text annotations added below.
plt.scatter(loadings[0], loadings[1], alpha=0)

plt.title("Principal Component Loadings")
plt.xlabel("PC1: " + "{0:.2f}".format(pca.explained_variance_ratio_[0] * 100)+"%")
plt.ylabel("PC2: " + "{0:.2f}".format(pca.explained_variance_ratio_[1] * 100)+"%")

for i, txt in enumerate(vocab):
    plt.annotate(txt, (loadings[0, i], loadings[1, i]), horizontalalignment='center',
                 verticalalignment='center', size=8)

# splitext copes with extensions of any length (".ps", ".svgz", ...),
# unlike slicing a fixed four characters off the end of the name.
plot_stem, plot_extension = os.path.splitext(plot_name)
plt.savefig(plot_stem + "_loadings" + plot_extension)
plt.show()
