# Pandas practice, with a glimpse at fuzzy matching

In [63]:
import csv, os, sys, math, ast, string
from collections import Counter
import pandas as pd
from difflib import SequenceMatcher

Let's import some optically scanned data; this happens to be derived from the index to the *Book Review Digest,* and has already been extensively processed by Wenyi Shang. I've further focused this list on historical fiction and mysteries, for reasons connected to a project of Madeleine McQuilling's.

In [62]:
# Load the tab-separated list of reviewed titles and preview the first five rows.
histmyst = pd.read_csv('histandmyst.tsv', delimiter='\t')
histmyst.head(n=5)

Unnamed: 0,year,genre,author,shorttitle
0,21,historical fiction,"Buchan, J:",Path of the king
1,21,historical fiction,"Williams, E. w.",Court of Belshazzar
2,21,historical fiction,"Kryshanovskaya, V. I.",Torchrbearers of Bohemia
3,21,historical fiction,Phillpotts. E.,Eudocia
4,21,historical fiction,Wilson. M.,Forging of the pikes


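As you can see, the scanning has left some errors: stray punctuation in author names (`Buchan, J:`, `Phillpotts. E.`) and misread characters in titles (`Torchrbearers of Bohemia`).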

Let's try to match this to library metadata. First, however, there's a sneaky problem: the values in the `shorttitle` column begin with a space. The table preview above hides it, but an exact match on titles would fail because of it.

In [64]:
# Show the raw first title by column name; the repr exposes the leading space.
histmyst['shorttitle'].iloc[0]

' Path of the king'

In [65]:
# Trim surrounding whitespace so titles can serve as exact merge keys.
# Re-running this cell is harmless: stripping an already-stripped string is a no-op.
histmyst['shorttitle'] = histmyst.shorttitle.str.strip()

In [57]:
# A two-row toy frame keyed on title, used to try out a merge on a small scale.
testdf = pd.DataFrame(
    [('Path of the king', 6), ('Court of Belshazzar', 9)],
    columns=['shorttitle', 'numcopies'],
)
testdf.head()

Unnamed: 0,shorttitle,numcopies
0,Path of the king,6
1,Court of Belshazzar,9


In [58]:
# Left merge: every row of testdf is kept, and the histmyst columns are attached
# wherever the (stripped) titles match exactly.
# The comma before `how` was missing, which made this cell a SyntaxError.
merged = testdf.merge(histmyst, left_on = 'shorttitle', right_on = 'shorttitle', how = 'left')
# Display (rows, columns) so the output below this cell is reproducible.
merged.shape


(2, 5)

In [29]:
# Load the tab-separated library metadata that the reviewed titles will be matched against.
hathi = pd.read_csv('hathific1930-49.tsv', delimiter='\t')

In [60]:
# Inner merge: keep only the titles that appear, character for character,
# in both the review list and the library metadata.
merged = pd.merge(histmyst, hathi, on='shorttitle', how='inner')

In [61]:
# (rows, columns) of the merged frame built in the previous cell.
merged.shape

(552, 26)

In [24]:
# All library records whose author field is exactly 'Christie, Agatha'.
hathi[hathi['author'].eq('Christie, Agatha')]

Unnamed: 0,docid,author,authordate,inferreddate,latestcomp,datetype,startdate,enddate,imprintdate,genres,...,recordid,instances,allcopiesofwork,copiesin25yrs,enumcron,volnum,title,parttitle,earlyedition,shorttitle
381,inu.30000020640763,"Christie, Agatha",1890-1976.,1930.0,1930.0,s,1930,,1930,,...,6059512,1,1.0,1.0,,,The mysterious Mr. Quin / | $c: Aghatha Christie.,,True,The mysterious Mr. Quin
2016,mdp.39015001153629,"Christie, Agatha",1890-1976.,1933.0,1933.0,s,1933,,1933,,...,628161,1,1.0,1.0,,,Thirteen at dinner.,,True,Thirteen at dinner
2280,pst.000011280453,"Christie, Agatha",1890-1976.,1933.0,1933.0,t,1954,1933.0,1954,,...,7031743,1,1.0,1.0,,,Lord Edgware dies / | $c: Agatha Christie.,,True,Lord Edgware dies
2510,mdp.39015008016746,"Christie, Agatha",1890-1976.,1934.0,1934.0,s,1934,,1934,,...,628122,1,1.0,1.0,,,Murder in the Calais coach / | $c: Agatha Chri...,,True,Murder in the Calais coach
2511,mdp.39015000682040,"Christie, Agatha",1890-1976.,1934.0,1934.0,s,1934,,1962,,...,628125,1,1.0,1.0,,,"Mr. Parker Pyne, detective / | $c: by Agatha C...",,True,"Mr. Parker Pyne, detective"
3715,mdp.39015020689215,"Christie, Agatha",1890-1976.,1936.0,1936.0,s,1936,,1936,,...,628099,1,1.0,1.0,,,The A. B. C. murders; | a new Poirot mystery.,,True,The A. B. C. murders; a new Poirot mystery
4097,mdp.39076005032938,"Christie, Agatha",1890-1976.,1936.0,1936.0,s,1936,,1936,,...,9904778,1,1.0,1.0,,,Surprise endings by Hercule Poirot.,,True,Surprise endings by Hercule Poirot
4301,mdp.39015046448521,"Christie, Agatha",1890-1976.,1937.0,1937.0,s,1937,,1937,,...,628148,1,1.0,1.0,,,Perilous Journeys of Hercule Poirot : | includ...,,True,Perilous Journeys of Hercule Poirot : includin...
4302,mdp.39015046391333,"Christie, Agatha",1890-1976.,1937.0,1937.0,s,1937,,1937,Mystery fiction,...,628153,1,2.0,1.0,,,Poirot loses a client / | $c: by Agatha Christie.,,True,Poirot loses a client
5150,uc1.32106001973962,"Christie, Agatha",1890-1976.,1938.0,1938.0,s,1938,,1938,,...,7127420,1,1.0,1.0,,,"Appointment with death, a Poirot mystery.",,True,"Appointment with death, a Poirot mystery"
