In [1]:
import doctest
import numpy as np
import pandas as pd
import re
import sqlite3

## Get Data

In [2]:
title = 'Roman_Imperial_Coinage'
fname = '../Data/' + title + '.sqlite'
conn = sqlite3.connect(fname)

In [3]:
query = '''
SELECT
  emperor,
  description
FROM
  OCRE JOIN emperors ON
    startDate >= emperors.start
    AND startDate <= emperors.end;
'''
pd.read_sql(query, conn).to_csv("temp", encoding='utf-8')
df = pd.read_csv("temp", encoding='utf-8')
# df.tail()

## Pre-Process Data

In [4]:
def cleanDescription(desc):
    """
    Parameters
    ----------
    desc : str
        String of the original description
        
    Returns
    -------
    Returns a description with just the description of the 
    obverse and reversed with only letters and spaces.
    
    DocTest
    -------
    >>> cleanDescription('Struck Silver. (obverse) Bust of Gallienus. (reverse) Concordia, draped, standing left.')
    'Bust of Gallienus Concordia draped standing left'
    """
    lst = re.split(". \([a-z]+\)", desc)
    
    if len(lst) == 2:
        cleanedStr = lst[1]
    elif len(lst) == 3:
        cleanedStr = lst[1] + lst[2]
    else:
        cleanedStr = ""
    
    cleanedStr = re.sub(r'[^a-zA-Z ]+', '', cleanedStr).strip()
    
    return cleanedStr

# Run DocTest(s)
doctest.testmod()

TestResults(failed=0, attempted=1)

In [5]:
df['cleanDescription'] = df['description'].apply(cleanDescription)

## Actual Machine Learning

In [22]:
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [23]:
# Split data to training and testing data
X = df['cleanDescription']
y = df['emperor']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [25]:
# Create Machine Learning pipeline that creates "dictionary", converts to
# tf-idf, and puts the tf-idf into a Multinomial Naive Bayes
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
                    ])

In [26]:
text_clf = MultinomialNB().fit(X_train_tfidf, y_train)

In [28]:
predicted = text_clf.predict(X_test)
np.mean(predicted == y_test) 

ValueError: could not convert string to float: 'Head of Caracalla laureate bearded right Caracalla in military attire on horse galloping right holding raised spear in hand before fallen foeman'