In [7]:
import re
import pickle
from datetime import datetime, timedelta
import pdfplumber
import ebisu
import pandas as pd
# Open the source PDF and keep only its list of pages; the reading cell below
# indexes into `pdf` and calls extract_tables() on each page.
# NOTE(review): the object returned by pdfplumber.open() is not retained, so
# it is never explicitly closed anywhere in the visible cells.
pdf = pdfplumber.open("data/GRE1450.pdf").pages

In [46]:
# Patterns used by parse_row, compiled once because it runs for every row.
_CJK_TEXT = re.compile(r'[\u4e00-\u9fa5]+')
_DAY_HEADER = re.compile(r'^Day ')
# A newline that is immediately followed by "(" starts a new sense.
_SENSE_BREAK = re.compile(r'\n(?=\()')
# Kangxi radicals plus two CJK "extras" blocks, see
# https://jrgraphix.net/research/unicode_blocks.php
_STRAY_GLYPHS = re.compile(r'[\u2f00-\u2fdf\u3400-\u4dbf\uf900-\ufaff]')


def parse_row(row):
    """Classify one extracted table row and pull out its useful fields.

    Args:
        row: a list of exactly five cells. Cells are strings; a ``None``
            cell is treated as empty. The fifth cell is never read.

    Returns:
        ``None`` when the row should be skipped (empty first cell, a first
        cell containing Chinese text, or a first cell starting with "Day ").
        ``(True, text)`` when the row is an example sentence, where ``text``
        is the second cell.
        ``(False, (word, pron, meaning, syn))`` for a vocabulary entry, where
        ``meaning`` is the cleaned third cell and the other fields are the
        first, second and fourth cells unchanged.

    Raises:
        AssertionError: if ``row`` is not a list of length five.
    """
    assert isinstance(row, list)
    assert len(row) == 5

    head = row[0]
    if head is None:
        # A missing cell carries no word; skip it like an empty one instead
        # of letting re.search() fail with TypeError.
        return None

    # Example-sentence marker exactly as it appears in the extracted rows
    # (each character doubled, presumably an artifact of text extraction).
    if head == '例例句句':
        return True, row[1]
    if _CJK_TEXT.search(head):
        # First cell contains Chinese text: not a vocabulary entry, discard.
        return None
    if head == '' or _DAY_HEADER.search(head):
        return None

    # Vocabulary entry.
    meaning = (row[2] or '').replace('\u2028', '')
    # Keep one sense per line: split where a new "(" group starts, then drop
    # the remaining newlines, which only wrap text inside a sense.
    senses = [part.replace('\n', '') for part in _SENSE_BREAK.split(meaning)]
    meaning = _STRAY_GLYPHS.sub('', '\n'.join(senses))
    return False, (head, row[1], meaning, row[3])

In [4]:
# Gather every table row from the PDF into one flat list, page by page.
all_rows = []
for page_number, page in enumerate(pdf):
    print(f"Reading page {page_number}...")
    page_tables = page.extract_tables()
    # Each page is expected to yield exactly one table.
    assert len(page_tables) == 1
    all_rows += page_tables[0]

Reading page 0...
Reading page 1...
Reading page 2...
Reading page 3...
Reading page 4...
Reading page 5...
Reading page 6...
Reading page 7...
Reading page 8...
Reading page 9...
Reading page 10...
Reading page 11...
Reading page 12...
Reading page 13...
Reading page 14...
Reading page 15...
Reading page 16...
Reading page 17...
Reading page 18...
Reading page 19...
Reading page 20...
Reading page 21...
Reading page 22...
Reading page 23...
Reading page 24...
Reading page 25...
Reading page 26...
Reading page 27...
Reading page 28...
Reading page 29...
Reading page 30...
Reading page 31...
Reading page 32...
Reading page 33...
Reading page 34...
Reading page 35...
Reading page 36...
Reading page 37...
Reading page 38...
Reading page 39...
Reading page 40...
Reading page 41...
Reading page 42...
Reading page 43...
Reading page 44...
Reading page 45...
Reading page 46...
Reading page 47...
Reading page 48...
Reading page 49...
Reading page 50...
Reading page 51...
Reading page 52...
Rea

In [51]:
# Turn the raw table rows into one DataFrame with a row per
# (vocabulary entry, example sentence) pair; entries without an example
# keep a single row with a missing `ex`.
word_count = 0  # NOTE(review): never updated in this cell; kept in case another cell reads it
last_word = None

word_row = {'word': [], 'pron': [], 'mean': [], 'syn': []}
example = {'word': [], 'ex': []}
# Position (within word_row) of the entry each example follows. Joining on
# this position instead of on the word text keeps a headword that occurs
# more than once from receiving the examples of its other occurrences.
example_entry = []

for row in all_rows:
    parsed = parse_row(row)
    if parsed is None:
        continue
    is_example, data = parsed
    if is_example:
        # An example row must come after the entry it illustrates.
        assert last_word is not None
        example['word'].append(last_word)
        example['ex'].append(data)
        example_entry.append(len(word_row['word']) - 1)
    else:
        word, pron, mean, syn = data
        last_word = word
        word_row['word'].append(word)
        word_row['pron'].append(pron)
        word_row['mean'].append(mean)
        word_row['syn'].append(syn)

data_df = pd.DataFrame(word_row)
ex_df = pd.DataFrame(example)

# Left-join examples onto entries by entry position; the temporary key is
# dropped so the result keeps the columns word, pron, mean, syn, ex.
ex_by_entry = ex_df[['ex']].assign(
    _entry=pd.Series(example_entry, dtype='int64'))
data_df = (
    data_df.assign(_entry=pd.RangeIndex(len(data_df)))
    .merge(ex_by_entry, on='_entry', how='left')
    .drop(columns='_entry')
)

In [None]:
import pickle
# Persist the merged vocabulary table. The `with` block closes the file as
# soon as the dump finishes, so the data is flushed to disk deterministically
# instead of whenever the anonymous file object happens to be collected.
with open('data/GRE1450.fmknowledge', 'wb') as out_file:
    pickle.dump(data_df, out_file)

In [None]:
import ebisu

# Model tuple given to every fact in the example database below and passed
# as the first argument to ebisu.predictRecall.
defaultModel = (4., 4., 24.) # alpha, beta, and half-life in hours

In [None]:
from datetime import datetime, timedelta
# Reference timestamp for the example database.
date0 = datetime(2017, 4, 19, 22, 0, 0)

# Two example facts sharing the default model; the second one was last
# tested eleven hours after the first.
database = [
    {'factID': 1, 'model': defaultModel, 'lastTest': date0},
    {'factID': 2, 'model': defaultModel, 'lastTest': date0 + timedelta(hours=11)},
]

In [None]:
# Unit used to convert a timedelta into a number of hours.
oneHour = timedelta(hours=1)
now = date0 + timedelta(hours=11.1)

print("On {},".format(now))
for fact in database:
    # Time since this fact's last test, expressed in hours.
    hours_since_test = (now - fact['lastTest']) / oneHour
    recall = ebisu.predictRecall(fact['model'], hours_since_test, exact=True)
    percent = recall * 100
    print("Fact #{} probability of recall: {:0.1f}%".format(fact['factID'], percent))