In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import cluster

### Create corpus from the same author and same-size random sample

In [2]:
df = pd.read_csv('../../data/shakespare_william_works_preprocessed.tsv', sep = '\t')

In [5]:
au_list = ['AuburnRed',
 'rthstewart',
 'Jojoinabox',
 'literarypeerelief',
 'Lady_Loki',
 'Shayheyred',
 'literarypeerelief',
 'Mangaluva',
 'rthstewart',
 'RandomRavenclaw9']

In [6]:
df_all = []
for au in au_list:
    df_Au = df[df.Author == au]
    df_all.append(df_Au)   
df_all = pd.concat(df_all)

In [25]:
len(df_all)

217

In [21]:
doc_cnt = 0
word_cnt = 0
for au in au_list:
    df_Au = df[df.Author == au]
    df_Rand = df.sample(len(df_Au))

    for doc in df_Rand.Text.tolist():
        doc = process_sentence(doc)
        word_cnt += len(doc)
        doc_cnt += 1

### General functions

In [7]:
def bootstrap_resample(li):
    if len(li) > 0:
        ave_original = np.average(li)
        aves = []
        for i in range(1000):
            sample = []
            for i in range(len(li)):
                sample.append(random.choice(li))
            aves.append(np.average(sample))
        tail = sorted(aves)[24]
        head = sorted(aves)[975]
        return (ave_original, tail, head)

In [8]:
def process_sentence(sentence):
    # input: sentence as a string
    # output: a list of words in the sentence
    sentence = re.sub(r'aA|aa', 'a', sentence)
    sentence = re.sub(r'\\xe2........|\\xc|\\xa|\\n|[0123456789*_]', '', sentence).lower()
    sentence = re.findall(u'(?u)\\b\\w\\w+\\b', sentence)
    return sentence

In [12]:
def kmeans_acc(k_means, labels):
    prec = 0
    for i in range(len(k_means.labels_)):
        if k_means.labels_[i] == labels[i]:
            prec += 1
    prec = prec/len(k_means.labels_)
    print(prec)

### tfidf weighting

In [9]:
# create a corpus to give to sklearn
def create_corpus_for_voc(df):
    doc = []
    for i in df.Text.tolist():
        #Remove some non-ascii characters and 'aa's
        i = re.sub(r'aA|aa', 'a', i)
        i = re.sub(r'\\xe2........|\\xc|\\xa|\\n|[0123456789*_]', '', i)
        i = i.lower()
        doc.append(i)  
    return doc

In [26]:
vectorizer = TfidfVectorizer(min_df=1)
corpus_all = create_corpus_for_voc(df_all)
vectorizer.fit(corpus_all)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [29]:
data = []
labels = []
label = 0

for au in au_list:
    df_Au = df[df.Author == au]
    corpus = create_corpus_for_voc(df_Au)
    au_dist = vectorizer.transform(corpus).todense()
    data.extend(au_dist.tolist())
    labels.extend([label for i in range(len(au_dist))])
    label += 1
   

In [30]:
len(data)

217

In [31]:
k_means = cluster.KMeans(n_clusters=10)
k_means.fit_predict(data) 

kmeans_acc(k_means, labels)

0.07834101382488479


### LDA

In [32]:
from gensim import corpora, models, similarities
from nltk.corpus import stopwords

In [33]:
# create a corpus to give to sklearn
def create_sentence_list(df):
    doc = []
    stops = set(stopwords.words("english"))
    for line in df.Text.tolist():
        line = process_sentence(line)
        line = [word for word in line if word not in stops]
        doc.append(line)  
    return doc

In [34]:
def create_lda_input(sentences):
    id2word = corpora.dictionary.Dictionary(sentences)
    corpus = [id2word.doc2bow(sentence) for sentence in sentences]
    return id2word, corpus

In [35]:
doc = create_sentence_list(df_all)
id2word,corpus = create_lda_input(doc)
lda = models.ldamodel.LdaModel(corpus=corpus,id2word=id2word,num_topics=20)

In [37]:
data = []
labels = []
label = 0

for au in au_list:
    df_Au = df[df.Author == au]
    doc = create_sentence_list(df_Au)
    au_dist = []
    for i in range(len(doc)):
        dist = lda.get_document_topics(id2word.doc2bow(doc[i]), minimum_probability = 0)
        dist = [i[1] for i in dist]
        au_dist.append(dist)
    data.extend(au_dist)
    labels.extend([label for i in range(len(au_dist))])
    label += 1
   

In [39]:
k_means = cluster.KMeans(n_clusters=10)
k_means.fit_predict(data) 

kmeans_acc(k_means, labels)

0.15668202764976957


### Doc2vec

In [40]:
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import doc2vec

In [41]:
def read_corpus(doc):
    for i, line in enumerate(doc):
        yield doc2vec.TaggedDocument(line,[i])

In [45]:
doc = create_sentence_list(df_all)
corpus = list(read_corpus(doc))
model = doc2vec.Doc2Vec(corpus)

In [48]:
model

<gensim.models.doc2vec.Doc2Vec at 0x12a3b9160>

In [46]:
data = []
labels = []
label = 0

for au in au_list:
    df_Au = df[df.Author == au]
    doc = create_sentence_list(df_Au)
    au_dist = []
    for doc_id in range(len(corpus)):
        inferred_vector = model.infer_vector(corpus[doc_id].words)
        au_dist.append(inferred_vector)

    data.extend(au_dist)
    labels.extend([label for i in range(len(au_dist))])
    label += 1
   

In [47]:
k_means = cluster.KMeans(n_clusters=10)
k_means.fit_predict(data) 

kmeans_acc(k_means, labels)

0.10092165898617511
