In [1]:
import os, re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import PCA
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from pandas import DataFrame, Series

def clean(instring):
    """Return *instring* with markup, ASCII alphanumerics and punctuation removed.

    Three passes are applied in order:

    1. every ``~~~START|...|START~~~`` marker (non-greedy, single line) is
       deleted;
    2. every ASCII letter and digit is deleted;
    3. every character in the punctuation set below is deleted.

    Characters that are not listed (for example an ASCII colon) are kept.
    """
    # Decide on unwanted characters
    punctuation = (
        '』。！，：、（）；？〉〈」「『“”'
        '!"#$%&\'()*+,-./'
        '《》·'
    )
    # Mapping each unwanted character to None deletes it during translate().
    deletion_table = str.maketrans("", "", punctuation)

    without_markers = re.sub(r'~~~START\|.+?\|START~~~', "", instring)
    without_ascii = re.sub(r'[a-zA-Z0-9]', "", without_markers)

    return without_ascii.translate(deletion_table)

def textBreak(inputstring, divlim=10000):
    """Split *inputstring* into consecutive sections of exactly *divlim* characters.

    Only complete sections are returned: any trailing remainder shorter than
    *divlim* is dropped, so an input shorter than *divlim* yields an empty
    list.

    Parameters
    ----------
    inputstring : str
        The text to split.
    divlim : int, optional
        Decide how long each section should be (default 10000).

    Returns
    -------
    list of str
        The full-length sections, in order of appearance.

    Raises
    ------
    ValueError
        If *divlim* is not a positive number.
    """
    if divlim <= 0:
        raise ValueError("divlim must be a positive integer")

    # Floor division counts only the sections that can be filled completely.
    loops = len(inputstring) // divlim

    return [inputstring[i * divlim:(i + 1) * divlim] for i in range(loops)]

def info_for_graph(input_list):
    """Encode the values of *input_list* as integer class ids for plotting.

    Returns a 3-tuple:

    * a list of the integer ids ``0 .. k-1``, one per distinct value;
    * a NumPy array with the id of every item of *input_list*, in order;
    * the set of distinct values.

    Ids are handed out in the iteration order of the set, so iterating the
    returned set alongside the returned id list pairs each value with its id.
    """
    unique_values = set(input_list)
    unique_labels = list(range(len(unique_values)))

    # Number the distinct values in the order the set yields them.
    id_by_value = {value: idx for idx, value in enumerate(unique_values)}

    class_ids = np.array([id_by_value[item] for item in input_list])

    return unique_labels, class_ids, unique_values

# Parallel lists: entry i of each list describes the same text (or section).
info_list = []        # cleaned text content
title_list = []       # metadata column 0
author_list = []      # metadata column 1
era_list = []         # metadata column 2
genre_list = []       # metadata column 3
section_number = []   # index of the section within its text (only when breaking apart)

# Lookups keyed by title (metadata column 0).
title_author = {}
title_era = {}
title_genre = {}

# Decide whether or not to break the text apart
break_apart = False

# Decide whether or not to normalize
normalize = True

# Maps the first tab-separated cell of each metadata line to the remaining cells.
metadata = {}

# Add the name of the metadata file
# The context manager closes the file even if reading raises.
with open("metadata.txt", "r", encoding="utf8") as metadatafile:
    metadatastring = metadatafile.read()

lines = metadatastring.split("\n")
for line in lines:
    # Skip blank lines (e.g. the one produced by a trailing newline) so they
    # do not create an empty-key entry.
    if not line.strip():
        continue
    cells = line.split("\t")
    metadata[cells[0]] = cells[1:]

# Here you will set the name of the directory the files are stored in.
# Every non-hidden file below the directory is read, stripped of whitespace,
# cleaned, and recorded together with its metadata row.
for root, dirs, files in os.walk("corpus"):
    for filename in files:
        # Ignore files whose name starts with a dot.
        if filename.startswith("."):
            continue

        # The context manager closes the file even if reading raises.
        with open(os.path.join(root, filename), "r", encoding="utf8") as f:
            c = f.read()

        # Raw string: "\s" in a plain string literal is an invalid escape.
        c = re.sub(r"\s+", "", c)
        c = clean(c)

        # Metadata is keyed by the file name without its extension
        # (splitext handles extensions of any length, not only three characters).
        metainfo = metadata[os.path.splitext(filename)[0]]

        title_author[metainfo[0]] = metainfo[1]
        title_era[metainfo[0]] = metainfo[2]
        title_genre[metainfo[0]] = metainfo[3]

        if not break_apart:
            # One entry per file.
            info_list.append(c)
            title_list.append(metainfo[0])
            author_list.append(metainfo[1])
            era_list.append(metainfo[2])
            genre_list.append(metainfo[3])
        else:
            # One entry per section; the metadata is repeated for each section
            # so the parallel lists stay aligned.
            broken_sections = textBreak(c)
            section_count = len(broken_sections)

            info_list.extend(broken_sections)

            title_list.extend([metainfo[0]] * section_count)
            author_list.extend([metainfo[1]] * section_count)
            era_list.extend([metainfo[2]] * section_count)
            genre_list.extend([metainfo[3]] * section_count)
            section_number.extend(range(section_count))

# Decide how to construct the vectorizer
vectorizer = CountVectorizer(analyzer="char", ngram_range=(1, 1),
                             max_features=100)

# Fit once; the same counts serve both the raw and the normalized path.
word_count_matrix = vectorizer.fit_transform(info_list)

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back to the old name on releases that predate it.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()

dense_words = word_count_matrix.toarray()

if normalize:
    # Rescale every row so its counts are expressed per 1000 counted characters.
    corpus_dataframe = DataFrame(dense_words, columns=vocab)

    doclengths = corpus_dataframe.sum(axis=1)

    thousand = Series([1000] * len(doclengths))

    # A document with no counted characters has length 0; dividing by 1
    # instead keeps its row at zero rather than producing NaN, which PCA
    # rejects.
    adjusteddoclengths = thousand.divide(doclengths.replace(0, 1))

    per_thousand = corpus_dataframe.multiply(adjusteddoclengths, axis=0)

    print(per_thousand)

    # DataFrame.as_matrix() was removed in pandas 1.0; .values is available
    # in old and new releases alike.
    word_count_matrix = per_thousand.values
    dense_words = word_count_matrix

pca = PCA(n_components=2)

my_pca = pca.fit(dense_words).transform(dense_words)

unique_labels, info_labels, unique_genres = info_for_graph(genre_list)

# Make a color list, the same length as unique labels
colors = ["red", "magenta", "blue"]

# zip() below stops at its shortest input, so a color list shorter than the
# number of genres would silently leave genres off the plot. In that case
# sample one evenly spaced color per genre from a colormap instead.
if len(colors) < len(unique_labels):
    fallback_cmap = plt.get_cmap("hsv")
    colors = [fallback_cmap(i / len(unique_labels)) for i in unique_labels]

plt.figure()

# This code is partially adapted from brandonrose.org/clustering
# One scatter call per genre so each genre gets its own color and legend entry.
for color, each_class, label in zip(colors, unique_labels, unique_genres):
    in_class = info_labels == each_class
    plt.scatter(my_pca[in_class, 0], my_pca[in_class, 1],
                label=label, color=color)

# Decide whether or not to annotate the plot
annotate_plot = False
if annotate_plot:
    # Write each title at the position of its point.
    for i, text_label in enumerate(title_list):
        plt.annotate(text_label, xy=(my_pca[i, 0], my_pca[i, 1]),
                     xytext=(my_pca[i, 0], my_pca[i, 1]),
                     size=8)

# Axis labels report the percentage of variance explained by each component.
plt.title("Principal Component Analysis")
plt.xlabel("PC1: {0:.2f}%".format(pca.explained_variance_ratio_[0] * 100))
plt.ylabel("PC2: {0:.2f}%".format(pca.explained_variance_ratio_[1] * 100))
plt.legend()

plt.show()

# Loadings plot: every vocabulary entry is written at the coordinates given by
# its weight on the first component (x) and on the second component (y).
loadings = pca.components_
pc1_loadings = loadings[0]
pc2_loadings = loadings[1]

# Fully transparent markers: nothing is drawn by this call itself
# (presumably it is here so the axes scale to the data; the labels are added below).
plt.scatter(pc1_loadings, pc2_loadings, alpha=0)

plt.title("Principal Component Loadings")

pc1_share = pca.explained_variance_ratio_[0] * 100
pc2_share = pca.explained_variance_ratio_[1] * 100
plt.xlabel("PC1: " + "{0:.2f}".format(pc1_share) + "%")
plt.ylabel("PC2: " + "{0:.2f}".format(pc2_share) + "%")

for position, entry in enumerate(vocab):
    anchor = (loadings[0, position], loadings[1, position])
    plt.annotate(entry, anchor, horizontalalignment='center',
                 verticalalignment='center', size=8)

plt.show()

            一          三          上          下          不          与  \
0   27.877143   6.845936  26.273591  11.841618  34.353028   4.687307   
1   27.817947  10.084481  20.016744  18.190121  43.001751   3.957683   
2   33.331367   6.725267  25.721196  14.453425  39.407705   5.781370   
3   29.895996   8.547728  20.337698  13.979536  36.885764   2.863278   
4   38.371217   7.975542  15.722156  12.288242  29.767971   4.083773   
5   42.140946   4.799286  10.776093   6.458519  63.157895   8.474576   
6   41.840849   4.217528  11.221968   6.539955  55.385244   8.546532   
7   33.805693   9.313420  19.724803  16.507965  28.553450   7.159230   
8   32.400955   3.266042  10.132487   7.366650  39.909375   2.819335   
9   32.770973   8.023090   9.766867   8.821963  60.276255  11.699624   
10  31.887596  13.490596  14.924652  12.000145  35.605667   8.982989   
11  30.403190   6.492741  15.408553  12.897865  34.868650  10.731604   
12  25.658167   9.200013  13.090122  17.731512  42.880336  11.94

  if self._edgecolors == str('face'):
