# Disputed Author Project (dap)

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Tiny hand-written corpus used only to exercise the pipeline.
# Every record carries the same three keys: author, title and raw text.
d1 = dict(
    meta_author='Plato',
    meta_title='Symposium',
    meta_text='This is the text of the Symposium.  It is just an example for testing.',
)

d2 = dict(
    meta_author='Aristotle',
    meta_title='Nico. Ethics',
    meta_text='This is a second text, by a different author.  It is still just for testing.',
)

d3 = dict(
    meta_author='Xenophon',
    meta_title='Apology',
    meta_text='This is a third text.  It is also just used for testing.  Do not get excited.',
)

# One row per record; the column order is fixed explicitly.
dataset = pd.DataFrame(
    [d1, d2, d3],
    columns=['meta_author', 'meta_title', 'meta_text'],
)

In [None]:
# Vectorise the raw text column into TF-IDF weights
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer is built with library defaults; it is kept in `vec` because
# its learned vocabulary is needed later to label the columns.
vec = TfidfVectorizer()
tf_idf_vec = vec.fit_transform(dataset.loc[:, 'meta_text'])

In [None]:
# Convert the tf_idf matrix into a data frame with one labelled column per
# vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2; prefer its
# replacement and fall back to the old name only on older releases.
if hasattr(vec, 'get_feature_names_out'):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()

# toarray() densifies the matrix so pandas can hold it as ordinary columns.
tf_idf_df = pd.DataFrame(data=tf_idf_vec.toarray(), columns=feature_names)

In [None]:
# Attach the tf_idf columns to the metadata columns, row by row.
# The join aligns on the index (both frames were built without an explicit
# one); a metadata column whose name collides with a vocabulary term gets
# the '_meta' suffix.
cluster_df = dataset.join(
    other=tf_idf_df,
    lsuffix='_meta',
)

In [None]:
# Use the work title as the row label instead of the positional index.
cluster_df = cluster_df.set_index(keys='meta_title')

In [None]:
# These are our final data sets.  y is author, X contains the text data.
# Note that X_set still carries the raw 'meta_text' column; the modelling
# cells drop it before fitting.
y_set = cluster_df['meta_author']
# axis must be passed by keyword: pandas 2.0 no longer accepts it positionally.
X_set = cluster_df.drop('meta_author', axis=1)

In [None]:
# Cluster the model using a Gaussian Mixture Method.
# sklearn.mixture.GMM was removed in scikit-learn 0.20; GaussianMixture is
# its replacement.
from sklearn.mixture import GaussianMixture

# A single component with diagonal covariance mirrors what the old GMM()
# call used by default (GaussianMixture would otherwise default to 'full').
cluster = GaussianMixture(n_components=1, covariance_type='diag')

# Fit on the numeric tf_idf columns only: the raw text column is dropped.
# axis is passed by keyword because pandas 2.0 rejects the positional form.
cluster.fit(X_set.drop('meta_text', axis=1))

In [None]:
# Naive way to graph a cluster is just reduce the dimensions to 2 with PCA
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
# Project the numeric tf_idf columns (raw text removed) onto two components.
# axis is passed by keyword because pandas 2.0 rejects the positional form.
pca_plotdata = pca.fit_transform(X_set.drop('meta_text', axis=1))

In [None]:
# Graph the results on a 2D plane
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm

%matplotlib inline

plt.ylabel('PCA #1')
plt.xlabel('PCA #0')
plt.ylim(ymin=-1, ymax=1)
plt.title('Clusters of Texts')
plt.scatter(pca_plotdata[:,0], pca_plotdata[:,1])


plt.show()