In [1]:
import torch

import pyprob
from pyprob import Model
import pyprob.distributions as dists

import calendar
import string



In [2]:
class DateParser(Model):
    def __init__(self):
        super().__init__(name="Date with Unkown Format")
        
        self.possible_dividers = ['\\', '/', '-', ' ', '_', ':', '.']
        
        self.longest_string = len('31 / November / 2000')
        self.all_symbols = list(string.ascii_uppercase) + \
                           [str(d) for d in range(10)] + \
                           self.possible_dividers + \
                           [' ']
    
    def get_index(self, letter):
        return self.all_symbols.index(letter)
    
    def pad(self, date_string):
        return date_string + ' ' * (self.longest_string - len(date_string))

    def forward(self):

        # all dates are between 0 AD and 4000 AD
        # sanple each digit such that the year is usually close to 2019
        year_1 = int(pyprob.sample(dists.Categorical(torch.tensor(
            [0.05, 0.4, 0.4, 0.03, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02]
        ))).item())
        year_2 = int(pyprob.sample(dists.Categorical(torch.tensor(
            [0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.1, 0.5] if year_1 == 1 else
            [0.5, 0.1, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05] if year_1 == 2 else
            [0.1]*10
        ))).item())
        year_3 = int(pyprob.sample(dists.Categorical(torch.tensor([0.1]*10))).item())
        year_4 = int(pyprob.sample(dists.Categorical(torch.tensor([0.1]*10))).item())
        year = int("".join(str(d) for d in [year_1, year_2, year_3, year_4]))

        # sample month and day given the year
        month = int(pyprob.sample(dists.Categorical(torch.tensor([1/12]*12))).item()) +1
        num_days = calendar.monthrange(year, month)[1]             # number of days in this month
        day_probs = [1/num_days]*num_days + [0.]*(31-num_days)     # probs of which day it is (in fixed length vector)
        day = int(pyprob.sample(dists.Categorical(torch.tensor(day_probs))).item()) + 1

        # sample format used to write day, month and year
        yy = pyprob.sample(dists.Categorical(torch.tensor([0.5, 0.5]))).item()  # either yy or yyyy
        m = pyprob.sample(dists.Categorical(torch.tensor([0.25]*4))).item()   # either m, mm or e.g. 'JAN'
        d = pyprob.sample(dists.Categorical(torch.tensor([0.5, 0.5]))).item()   # either d or dd
        
        real_date = {'day': day, 'month': month, 'year': year}

        # put day, month and year in right format
        if d:
            day = str(day)
        else:  # dd format
            day = str(day).zfill(2)
        # do month
        if m == 0:
            month = str(month)
        elif m == 1:
            month = str(month).zfill(2)
        elif m == 2:
            month = calendar.month_name[month]
        else:
            month = calendar.month_abbr[month]
        # do year
        if yy:
            year = str(year).zfill(2)[-2:]
        else:  # yyyy
            year = str(year).zfill(4)
        
        # sample order of day, month, year
        # m/d/y or d/m/y or y/m/d (never y/d/m)
        order = pyprob.sample(dists.Categorical(torch.tensor([1/3]*3))).item()
        if order == 0:
            date = [month, day, year]
        elif order == 1:
            date = [day, month, year]
        else:
            date = [year, month, day]
            
        # select dividers
        num_div = len(self.possible_dividers)
        divider1 = int(pyprob.sample(dists.Categorical(torch.tensor([1/num_div]*num_div))).item())
        divider2 = int(pyprob.sample(dists.Categorical(torch.tensor([1/num_div]*num_div))).item())
        divider1 = self.possible_dividers[divider1]
        divider2 = self.possible_dividers[divider2]
        
        # sometimes put space before/after dividers
        space1 = bool(pyprob.sample(dists.Categorical(torch.tensor([0.9, 0.1]))).item())
        space2 = bool(pyprob.sample(dists.Categorical(torch.tensor([0.9, 0.1]))).item())
        space3 = bool(pyprob.sample(dists.Categorical(torch.tensor([0.9, 0.1]))).item())
        space4 = bool(pyprob.sample(dists.Categorical(torch.tensor([0.9, 0.1]))).item())

        date =  "".join([date[0],
                         ' ' if space1 else '',
                         divider1,
                         ' ' if space2 else '',
                         date[1],
                         ' ' if space3 else '',
                         divider2,
                         ' ' if space4 else '',
                         date[2]]).upper()

        # pad with spaces so tha number of observations is constant
        padded_date = self.pad(date)
        
        # observe each letter from a categorical distribution
        for i, letter in enumerate(padded_date):
            probs = torch.ones(len(self.all_symbols))*0.001
            probs[self.get_index(letter)] = 1.
            pyprob.observe(dists.Categorical(probs),
                           name=f"letter_{i}")
        
        return date, real_date
    
    def get_observes(self, date_string):
        date_string = self.pad(date_string)
        return {f"letter{i}": torch.tensor(self.get_index(letter))
                for i, letter in enumerate(date_string)}


In [3]:
model = DateParser()

In [4]:
for i in model.prior_distribution(20).values_numpy():
    print(i[0])

Time spent  | Time remain.| Progress             | Trace | Traces/sec
0d:00:00:00 | 0d:00:00:00 | #################### | 20/20 | 111.14       
JUNE:16\50
67 /NOVEMBER-24
04-17  2103
05 1\ 75
24 05\34
JUNE\20.1742
67  AUGUST  17
SEPTEMBER:14 /14
26: 1_05
2064/JUN.14
MAY\04-1933
51/JUN\02
3946-JUNE-21
10 .6/03
09.03_69
JANUARY-24\10
1185:MAY.15
25.JUN\2087
29\01 2165
16.DECEMBER 1944


In [5]:
model.get_observes('16:DECEMBER 1944')

{'letter0': tensor(27),
 'letter1': tensor(32),
 'letter2': tensor(41),
 'letter3': tensor(3),
 'letter4': tensor(4),
 'letter5': tensor(2),
 'letter6': tensor(4),
 'letter7': tensor(12),
 'letter8': tensor(1),
 'letter9': tensor(4),
 'letter10': tensor(17),
 'letter11': tensor(39),
 'letter12': tensor(27),
 'letter13': tensor(35),
 'letter14': tensor(30),
 'letter15': tensor(30),
 'letter16': tensor(39),
 'letter17': tensor(39),
 'letter18': tensor(39),
 'letter19': tensor(39)}

In [6]:
post = model.posterior_distribution(
    observe=model.get_observes('16:DECEMBER 1944'),
    inference_engine=pyprob.InferenceEngine.LIGHTWEIGHT_METROPOLIS_HASTINGS,
    num_traces=10000
)

Time spent  | Time remain.| Progress             | Trace       | Accepted|Smp reuse| Traces/sec
0d:00:01:26 | 0d:00:00:00 | #################### | 10000/10000 |  42.64% |  93.75% | 115.36       


In [46]:
post.sample()

('02/2/1482', {'day': 2, 'month': 2, 'year': 1482})