<a href="https://colab.research.google.com/github/syphax/CrounseCode/blob/master/Extract_AP_Latin_Vocab_Tables.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Script to extract vocab tables from ~`https://apcentral.collegeboard.org/media/pdf/ap-latin-draft-course-framework.pdf`~

https://apcentral.collegeboard.org/media/pdf/ap-latin-revised-framework-preview.pdf

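Save this file as `../files/ap-latin-revised-framework-preview.pdf` (relative to the notebook's working directory) — that is the path the code below reads. (These instructions previously said to store the file in the Colab workspace and rename it to `ap-vocab.pdf`; the code does not use that name.)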

In [9]:
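# Run the next line if these are not already installed
# (camelot is what this notebook imports below; tabula-py is only needed for the commented-out tabula import)
# !pip install PyPDF2 tabula-py camelot-py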


In [10]:
# Tabula crashes my kernel!
# from tabula import read_pdf

import camelot.io as camelot

#from unidecode import unidecode
import unicodedata

from pathlib import Path

import numpy as np
import pandas as pd

In [11]:

# Input: the framework PDF that the vocabulary tables are extracted from
fn = Path('..', 'files', 'ap-latin-revised-framework-preview.pdf')

# Output path stem (no extension); the export cell appends .csv / .xlsx to it
fn_list = Path('..', 'files', 'ap-latin-draft-course-framework-vocab-list-revised')

In [12]:
# Extract tables from PDF pages 94-122 using camelot's 'lattice' flavor.
# (An explicit table_areas=['50,50,600,700'] was considered but is not used.)
tables = camelot.read_pdf(
    fn,
    pages="94-122",
    flavor='lattice',
)

# Report how many tables camelot found
print(len(tables))

29


In [16]:
# The vocab tables are 4 columns wide (the four names below); tables of any
# other width are dropped. Deriving the width from this list keeps the filter
# and the column names from drifting apart.
vocab_columns = ['Vocabulary', 'Part of Speech', 'Definition', 'Suggested Reading']

# Stack the matching tables into one frame with a fresh 0..n-1 index.
# Each table's own header line is still present as a data row at this point.
df_vocab = pd.concat(
    [table.df for table in tables if table.df.shape[1] == len(vocab_columns)],
    ignore_index=True,
)

df_vocab.columns = vocab_columns

print(df_vocab.shape)

print(df_vocab)


(1029, 4)
                                  Vocabulary  Part of Speech  \
0                                 Vocabulary  Part of Speech   
1                                 a, ab, abs     preposition   
2                     abeo, -ire, -ii, -itum            verb   
3                        absum, abesse, afui            verb   
4     accedo (adc-), -ere, -cessi, \n-cessum            verb   
...                                      ...             ...   
1024                          votum, -i (n.)            noun   
1025                         vox, vocis (f.)            noun   
1026  vulnero (volnero), -are, -avi, \n-atum            verb   
1027             vulnus (volnus), -eris (n.)            noun   
1028               vultus (voltus), -us (m.)            noun   

                                             Definition  Suggested Reading  
0                                            Definition  Suggested Reading  
1               (with abl.) from, away from, out of, by            

In [6]:
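# # NOTE: this cell is disabled. It was written because a short table was not picked up by camelot,
# # so 2 words were added manually. In the extraction above both words (votum, vox) are already
# # present, and the column names used below ('Required Vocabulary', ...) do not match df_vocab's
# # current columns - presumably left over from an earlier version of the PDF; do not re-enable as-is.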

# new_rows = pd.DataFrame([
#     ['vōtum, -ī', 'a promise to a god, solemn pledge, religious engagement, vow', '5.1'],
#     ['vōx, vōcis', 'a voice, sound, tone, utterance, cry, call', '2.4']
# ], columns=['Required Vocabulary', 'Definition', 'Suggested Reading'])

# df_vocab = pd.concat([df_vocab, new_rows], ignore_index=True)

In [18]:
# Add clean column:

def remove_accents(text):
    """Return ``text`` with all combining marks (accents, macrons, ...) removed.

    The string is first NFKD-normalized, which splits each accented letter
    into its base character followed by combining marks; every character for
    which ``unicodedata.combining`` is non-zero is then discarded.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    base_chars = [ch for ch in decomposed if not unicodedata.combining(ch)]
    return ''.join(base_chars)

# Lookup key for each entry: the part of 'Vocabulary' before the first comma
# or whitespace character, with accents stripped by remove_accents.
# The pattern is a raw string so that `\s` is handed to the regex engine as
# written; in a plain string literal `\s` is an invalid escape sequence that
# newer Python versions warn about.
df_vocab['Base Word'] = (
    df_vocab['Vocabulary']
    .str.split(pat=r'[,\s]')
    .str[0]
    .apply(remove_accents)
)

In [21]:
# Remove table headers:

# Each extracted table's header line shows up as an ordinary row whose
# 'Vocabulary' cell is the literal text 'Vocabulary'; keep every other row.
# The index is intentionally left as-is (not renumbered).
df_vocab = df_vocab.loc[df_vocab['Vocabulary'].ne('Vocabulary')]

In [22]:
# Display the vocabulary table (truncated preview) to eyeball the result
df_vocab

Unnamed: 0,Vocabulary,Part of Speech,Definition,Suggested Reading,Base Word
1,"a, ab, abs",preposition,"(with abl.) from, away from, out of, by",1.1,a
2,"abeo, -ire, -ii, -itum",verb,"to go from, go away, go off, go forth, go, \nd...",1.1,abeo
3,"absum, abesse, afui",verb,"to be away from, be absent",5.3,absum
4,"accedo (adc-), -ere, -cessi, \n-cessum",verb,"to go to, come to, come near, draw near, \napp...",2.1,accedo
5,"accendo (adc-), -ere, -cendi, \n-censum",verb,"to kindle, set on fire, inflame",3.6,accendo
...,...,...,...,...,...
1024,"votum, -i (n.)",noun,"a promise to a god, solemn pledge, vow",5.1,votum
1025,"vox, vocis (f.)",noun,"a voice, sound, tone, utterance, cry, call",2.4,vox
1026,"vulnero (volnero), -are, -avi, \n-atum",verb,"to wound, harm, pain, distress; damage",6.2,vulnero
1027,"vulnus (volnus), -eris (n.)",noun,a wound,5.3,vulnus


In [23]:
# Inspect the last 40 rows of the table
df_vocab.tail(40)

Unnamed: 0,Vocabulary,Part of Speech,Definition,Suggested Reading,Base Word
988,"verbum, -i (n.)",noun,a word,3.6,verbum
989,vero,adverb,"in truth, in fact, certainly, truly, to be sur...",1.4,vero
990,"vertex (vortex), -icis (f.)",noun,"a whirl, whirlpool, vortex; the highest point,...",3.1,vertex
991,"verto (vorto), -ere, -i, versum",verb,"to turn, turn up, turn back, direct",1.4,verto
992,"verus, -a, -um",adjective,"true, real, actual, genuine",1.1,verus
993,"vester, -tra, -trum",adjective,"your, yours, of you (plural)",3.4,vester
994,"vestigium, -i (n.)",noun,"a footstep, step, footprint; a trace, mark, tr...",2.3,vestigium
995,"vestis, -is (f.)",noun,"clothes, clothing, attire",5.6,vestis
996,"vetus, -eris",adjective,"old, aged, advanced in years",4.4,vetus
997,"vexo, -are, -avi, -atum",verb,"to shake, jolt, toss violently; to annoy, trou...",5.3,vexo


In [24]:
# Save as CSV and XLSX

# Write the same table to both formats. index=False is passed to both writers:
# the row index is only a leftover position from the concatenated tables, and
# without it on to_excel the XLSX would carry an extra unnamed column that the
# CSV does not have.
df_vocab.to_csv(fn_list.with_suffix('.csv'), index=False)

df_vocab.to_excel(fn_list.with_suffix('.xlsx'), sheet_name='AP Vocab List', index=False)

In [26]:
# Count the distinct 'Vocabulary' entries behind each 'Base Word' and show the
# 12 largest counts; a count above 1 means several entries share a base word.
(
    df_vocab
    .groupby('Base Word')
    .agg({'Vocabulary': 'nunique'})
    .sort_values(by='Vocabulary', ascending=False)
    .head(12)
)

Unnamed: 0_level_0,Vocabulary
Base Word,Unnamed: 1_level_1
volo,2
liber,2
amicus,2
adversus,2
hic,2
eo,2
labor,2
tantum,2
princeps,2
quis,2
