In [44]:
import doctest
import numpy as np
import pandas as pd
import re
import sqlite3

import CleanData as cd

## Get Data

In [64]:
title = 'Roman_Imperial_Coinage'
fname = '../Data/' + title + '.sqlite'
conn = sqlite3.connect(fname)

In [65]:
query = '''
SELECT
  emperor,
  description,
  material,
  denomination,
  subject,
  mint,
  date
FROM
  allData JOIN emperors ON
    startDate >= emperors.start
    AND startDate <= emperors.end;
'''
pd.read_sql(query, conn).to_csv("temp", encoding='utf-8')
df = pd.read_csv("temp", encoding='utf-8')
df = df.replace(np.nan, '', regex=True)
# df.tail()

## Pre-Process Data

In [66]:
df['DupCheck'] = df.apply(lambda row: cd.makeDupCheckCol(
                                                         row['material'], 
                                                         row['denomination'], 
                                                         row['subject'], 
                                                         row['mint'],
                                                         row['date']
                                                        ), axis=1)
df.drop(['DupCheck'], axis=1, inplace=True)
df['cleanDescription'] = df['description'].apply(cd.cleanDescription)

## Actual Machine Learning

In [69]:
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

In [70]:
# Split data to training and testing data
X = df['cleanDescription']

y = df['emperor']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [71]:
# Create Machine Learning pipeline that creates "dictionary", converts to
# tf-idf, and puts the tf-idf into a Multinomial Naive Bayes
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, random_state=42,
                                           max_iter=5, tol=None)),
                    ])

In [72]:
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False))])

In [73]:
predicted = text_clf.predict(X_test)
np.mean(predicted == y_test) 

0.33288078065337295

In [30]:
word_vect = CountVectorizer()
word_vect.fit_transform(X_train)

<13456x1819 sparse matrix of type '<class 'numpy.int64'>'
	with 229708 stored elements in Compressed Sparse Row format>

In [31]:
word_freq = word_vect.vocabulary_
word_freq = sorted(word_freq.items(), key=lambda x: x[1])
word_freq.reverse()

In [32]:
word_freq

[('zones', 1818),
 ('zenonis', 1817),
 ('zenobia', 1816),
 ('zeno', 1815),
 ('youthful', 1814),
 ('youth', 1813),
 ('younger', 1812),
 ('young', 1811),
 ('yoke', 1810),
 ('xxxxxxx', 1809),
 ('xxxxxmvlfel', 1808),
 ('xxxxxavg', 1807),
 ('xxxxx', 1806),
 ('xxxx', 1805),
 ('xxxviii', 1804),
 ('xxx', 1803),
 ('xxv', 1802),
 ('xx', 1801),
 ('xvxxx', 1800),
 ('xvotxx', 1799),
 ('xv', 1798),
 ('xsic', 1797),
 ('xmvlxx', 1796),
 ('xiix', 1795),
 ('xcvit', 1794),
 ('xcvi', 1793),
 ('xcaess', 1792),
 ('xc', 1791),
 ('writing', 1790),
 ('wreath', 1789),
 ('wread', 1788),
 ('wrapped', 1787),
 ('woman', 1786),
 ('wolf', 1785),
 ('wings', 1784),
 ('winglike', 1783),
 ('winged', 1782),
 ('wing', 1781),
 ('wide', 1780),
 ('whip', 1779),
 ('wheel', 1778),
 ('wheatears', 1777),
 ('wearing', 1776),
 ('weapons', 1775),
 ('waves', 1774),
 ('waved', 1773),
 ('wave', 1772),
 ('waters', 1771),
 ('water', 1770),
 ('warrior', 1769),
 ('war', 1768),
 ('wand', 1767),
 ('wall', 1766),
 ('walking', 1765),
 ('waitin