In [1]:
# top-level dependencies
import os
import re
import string
from collections import Counter
from os import path

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import spacy
from scipy.spatial.distance import squareform

# notebook config
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Let pandas display up to 999 rows and 100 columns of a frame.
pd.options.display.max_rows = 999
pd.options.display.max_columns = 100
# Floats are shown with thousands separators and 4 decimals; the format spec
# also pads every value to a minimum field width of 40 characters.
pd.options.display.float_format = '{:40,.4f}'.format

In [2]:
# Name of the corpus text file; it is passed to read_txt_data, which resolves
# it against the current working directory.
file='volume1.txt'

def read_txt_data(fname, encoding='utf-8'):
    """Read a text file into a single string.

    Parameters
    ----------
    fname : str
        File name or relative path, resolved against the current working
        directory. An absolute path is used as-is.
    encoding : str, default 'utf-8'
        Text encoding used to decode the file. It is set explicitly so that
        non-ASCII characters (e.g. the ʻokina and kahakō vowels) decode the
        same way on every platform instead of depending on the locale.

    Returns
    -------
    str
        The complete file content as one string (not a list of lines).

    Raises
    ------
    OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If the file content is not valid in ``encoding``.
    """
    file_path = os.path.join(os.getcwd(), fname)
    # The ``with`` block closes the file on exit, so no explicit close() is needed.
    with open(file_path, 'r', encoding=encoding) as f:
        return f.read()

In [3]:
# Load the whole corpus file into one string.
text = read_txt_data(file)

In [4]:
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(nltk.corpus.stopwords.words('english'))
    return _STOP_WORDS_CACHE


def tokenize_and_stem(text):
    """Normalise raw text and split it into lowercase word tokens.

    Processing steps, in order:

    1. line breaks become spaces;
    2. ASCII punctuation (``string.punctuation``) is deleted;
    3. every remaining character that is not an ASCII letter, a Hawaiian
       kahakō vowel, the ʻokina or whitespace (digits, typographic quotes,
       dashes, ...) is replaced by a space;
    4. the text is lower-cased and tokenised with ``nltk.word_tokenize``;
    5. English stop words are dropped.

    Despite the name, no stemming is applied: the stemming step is disabled.

    Parameters
    ----------
    text : str
        Raw text of one document.

    Returns
    -------
    list of str
        The remaining tokens, in document order. Empty input gives ``[]``.

    Notes
    -----
    Presumably requires the NLTK tokenizer models and the ``stopwords``
    corpus to be downloaded beforehand -- verify on a fresh environment.
    """
    if not text:
        return []
    # line breaks become spaces so words on adjacent lines stay separate
    text = text.replace('\n', ' ')
    # remove ASCII punctuation first
    text = text.translate(str.maketrans('', '', string.punctuation))
    # str.replace() treats its argument as a literal string, so the character
    # class has to go through re.sub to have any effect. Whitespace is kept
    # (otherwise all words would be glued together) and ō/Ō are included so
    # Hawaiian words are not truncated.
    text = re.sub(r'[^a-zA-ZʻāēīōūĀĒĪŌŪ\s]', ' ', text)
    # convert to all lower
    text = text.lower()
    # tokenize by word
    tokens = nltk.word_tokenize(text)
    # The stop-word set is cached: the vectorizer calls this function once
    # per document, and reloading the corpus each time is wasteful.
    stop_words = _english_stop_words()
    return [word for word in tokens if word not in stop_words]

In [5]:
# Tokenise the full corpus string into one flat list of word tokens.
tokens = tokenize_and_stem(text)

In [9]:
from scipy.cluster.hierarchy import ward, dendrogram
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# create a TF-IDF vectorizer: word 1- to 3-grams, vocabulary capped at 500
# features, tokenised with tokenize_and_stem
# (min_df / max_df pruning is intentionally left disabled)
tfidf_vectorizer = TfidfVectorizer(
    max_features=500,
    tokenizer=tokenize_and_stem,
    # A custom tokenizer makes the default token regex unused; None silences
    # scikit-learn's "token_pattern will not be used" warning.
    token_pattern=None,
    ngram_range=(1, 3),
    use_idf=True,
)

In [10]:
# fit_transform expects one raw string per document. Passing the flat `tokens`
# list would make every single word its own "document", which makes the
# n-grams and the document clustering below meaningless.
# NOTE(review): assumes passages in the corpus file are separated by blank
# lines -- TODO confirm; use text.splitlines() if one line is one document.
documents = [passage for passage in text.split('\n\n') if passage.strip()]
vectors = tfidf_vectorizer.fit_transform(documents)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement. tolist() keeps `terms` a plain list.
terms = tfidf_vectorizer.get_feature_names_out().tolist()

In [None]:
# now calculate the tf-idf cosine distance between every pair of documents
# this helps us cluster documents that might be similar
dist = 1 - cosine_similarity(vectors)
# floating-point error can leave tiny negative values and non-zero self-distances
dist = np.clip(dist, 0, None)
np.fill_diagonal(dist, 0)

# use ward clustering to find similar docs: an agglomerative algorithm that
# starts at the leaves (single documents) and merges them into branches, limbs
# and finally the trunk, choosing merges by their effect on within-cluster variance.
# scipy's ward() treats a 2-D array as raw observation vectors, NOT as a
# distance matrix, so the pre-computed distances have to be passed in condensed
# (upper-triangle) form.
# NOTE(review): SciPy documents Ward linkage as well-defined for Euclidean
# distances only; with cosine distances the tree is a heuristic.
linkage_matrix = ward(squareform(dist, checks=False))

In [None]:
# Second batch of imports plus NLTK setup.
import sklearn as sk

from nltk.corpus import stopwords
import nltk
# Download the NLTK stop-word corpus; this must run before stopwords.words() below.
# NOTE(review): tokenize_and_stem is already called in an earlier cell, which
# presumably needs this corpus too -- consider moving the download to the top.
nltk.download('stopwords')
# English stop words.
# NOTE(review): tokenize_and_stem loads its own stop words, so this global
# looks unused in the visible cells.
stop_words = stopwords.words('english')

import string
from nltk.stem import PorterStemmer
import re
from nltk.stem.snowball import SnowballStemmer
# English Snowball stemmer.
# NOTE(review): only referenced from a commented-out line in tokenize_and_stem.
stemmer = SnowballStemmer("english")

from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.cluster import KMeans

In [None]:
def print_top_terms(k):
    """Print the leading terms of the first ``k`` clusters.

    Reads two module-level names: ``centroids``, a 2-D array with one row of
    term indices per cluster, and ``terms``, the sequence those indices refer
    to. For each cluster the first 50 indices of its row are mapped to terms
    and printed as a list (presumably the rows are pre-sorted by centroid
    weight -- that ordering is the caller's responsibility).

    Parameters
    ----------
    k : int
        Number of clusters (rows of ``centroids``) to print.
    """
    for cluster_id in range(k):
        header = "Cluster %d words:\n" % cluster_id
        print(header, end='')
        top_term_ids = centroids[cluster_id, :50]
        print([terms[term_id] for term_id in top_term_ids])
        print('\n')

In [None]:
# TF-IDF features for k-means: word 1- to 3-grams, vocabulary capped at 500
# features (min_df / max_df pruning is intentionally left disabled)
vectorizer = TfidfVectorizer(
    max_features=500,
    tokenizer=tokenize_and_stem,
    # A custom tokenizer makes the default token regex unused; None silences
    # scikit-learn's "token_pattern will not be used" warning.
    token_pattern=None,
    ngram_range=(1, 3),
    use_idf=True,
)

# `lower_nostop` is not defined in any cell shown here, so this cell raised a
# NameError on a fresh kernel. The documents are rebuilt from `text` instead;
# tokenize_and_stem already lower-cases and removes stop words.
# NOTE(review): assumes passages in the corpus file are separated by blank
# lines -- TODO confirm; use text.splitlines() if one line is one document.
documents = [passage for passage in text.split('\n\n') if passage.strip()]
vectors = vectorizer.fit_transform(documents)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement. tolist() keeps `terms` a plain list.
terms = vectorizer.get_feature_names_out().tolist()

In [None]:
# Display the vocabulary terms extracted by the vectorizer.
terms

In [None]:
# TF-IDF row of the first document. This name was used below but never
# defined in the notebook, so the cell failed with a NameError.
first_vector_tfidfvectorizer = vectors[0]
# place tf-idf values in a pandas data frame, one row per term
vec_tfidf_scores = pd.DataFrame(
    first_vector_tfidfvectorizer.T.todense(), index=terms, columns=["tfidf"]
)
# show only the highest-scoring terms instead of dumping the whole vocabulary
vec_tfidf_scores.sort_values(by=["tfidf"], ascending=False).head(20)

In [None]:
# Elbow method: fit k-means for every cluster count from 1 to 49 and chart the
# resulting inertia to help pick an appropriate number of clusters.
cluster_range = range(1, 50)
wcss = []
plt.figure(figsize=(16, 8))
for i in cluster_range:
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=42).fit(vectors)
    wcss.append(kmeans.inertia_)
plt.plot(cluster_range, wcss)
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.title('The Elbow Method')
plt.show()

In [None]:
# let's look at the clusterings for the k values around the elbow
ks = [10]
for k in ks:
    # Same initialisation and seed as the elbow-method cell, so the clusters
    # are reproducible from run to run.
    km = KMeans(n_clusters=k, init='k-means++', random_state=42).fit(vectors)
    # per cluster: feature indices ordered from highest to lowest centroid weight
    # (print_top_terms reads this module-level `centroids` array)
    centroids = km.cluster_centers_.argsort()[:, ::-1]
    print('#################\nCluster terms at k=%d:' % k)
    print_top_terms(k)

In [None]:
from scipy.cluster.hierarchy import ward, dendrogram
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity



# create a TF-IDF vectorizer: word 1- to 3-grams, vocabulary capped at 500
# features (min_df / max_df pruning is intentionally left disabled)
tfidf_vectorizer = TfidfVectorizer(
    max_features=500,
    tokenizer=tokenize_and_stem,
    # A custom tokenizer makes the default token regex unused; None silences
    # scikit-learn's "token_pattern will not be used" warning.
    token_pattern=None,
    ngram_range=(1, 3),
    use_idf=True,
)
# calculate TF-IDF
# this gives a weight to words based on their frequency in a document and the inverse frequency across all documents
# idea: words that are frequent in a document, but also very frequent in other documents, might just be noisy
# idea: words that are frequent in a document, and infrequent across other documents, can help determine the document's topic
#
# `lower_nostop` is not defined in any cell shown here, so this cell raised a
# NameError on a fresh kernel. The documents are rebuilt from `text` instead;
# tokenize_and_stem already lower-cases and removes stop words.
# NOTE(review): assumes passages in the corpus file are separated by blank
# lines -- TODO confirm; use text.splitlines() if one line is one document.
documents = [passage for passage in text.split('\n\n') if passage.strip()]
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) is its replacement. tolist() keeps `terms` a plain list.
terms = tfidf_vectorizer.get_feature_names_out().tolist()

# now calculate the tf-idf cosine distance between every pair of documents
# this helps us cluster documents that might be similar
dist = 1 - cosine_similarity(tfidf_matrix)
# floating-point error can leave tiny negative values and non-zero self-distances
dist = np.clip(dist, 0, None)
np.fill_diagonal(dist, 0)

# use ward clustering to find similar docs: an agglomerative algorithm that
# starts at the leaves (single documents) and merges them into branches, limbs
# and finally the trunk, choosing merges by their effect on within-cluster variance.
# scipy's ward() treats a 2-D array as raw observation vectors, NOT as a
# distance matrix, so the pre-computed distances have to be passed in condensed
# (upper-triangle) form.
# NOTE(review): SciPy documents Ward linkage as well-defined for Euclidean
# distances only; with cosine distances the tree is a heuristic.
linkage_matrix = ward(squareform(dist, checks=False))
# NOTE(review): the leaves are unlabeled. The old `titles` expression referred
# to a `df` that does not exist in this notebook; pass labels=[...] to
# dendrogram() once per-document titles are available.
fig, ax = plt.subplots(figsize=(15, 20))  # set size
# visualize the linkage matrix with a dendrogram, drawn on the axes created
# above; the returned layout dict gets its own name instead of overwriting `ax`
dendrogram_layout = dendrogram(
    linkage_matrix, orientation="right", leaf_font_size=9, ax=ax
)

# tick_params expects booleans: the legacy 'off' strings are truthy, so they
# no longer hide anything on current matplotlib versions
ax.tick_params(
    axis='x',           # changes apply to the x-axis
    which='both',       # both major and minor ticks are affected
    bottom=False,       # ticks along the bottom edge are off
    top=False,          # ticks along the top edge are off
    labelbottom=False)  # labels along the bottom edge are off

fig.tight_layout()  # show plot with tight layout

plt.show()
