In [5]:

import random
from datetime import datetime, timedelta
from random import randint, randrange

from IPython.display import display
import ipywidgets as widgets

# Data generation

## Reference Dates
- Includes the weekday to reduce LLM mistakes

Strategy 1:
Generate Question phrases with GPT and add a random reference date, then ask the LLM to generate the answer.


Strategy 2:
Filter twitter data for scheduling phrases using DateBERT, then use instruction evolution to complicate and add specificity to that dataset.
Generate Answer descriptors as before.







In [6]:


def generate_random_datetime_string():
    # List of weekdays
    weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    
    # Generate random date between 2021-2025
    start_date = datetime(2021, 1, 1)
    end_date = datetime(2025, 12, 31)
    time_between_dates = end_date - start_date
    days_between_dates = time_between_dates.days
    random_days = random.randrange(days_between_dates)
    random_date = start_date + timedelta(days=random_days)
    
    # Generate random time
    random_hour = random.randint(0, 23)
    random_minute = random.randint(0, 59)
    random_datetime = random_date.replace(hour=random_hour, minute=random_minute)
    
    # Format the string
    weekday = weekdays[random_datetime.weekday()]
    formatted_string = f"{weekday} {random_datetime.strftime('%Y-%m-%d %H:%M')}"
    
    return formatted_string

# Example usage
with open('random.txt', 'w') as f:
    for _ in range(80000):
        f.write(generate_random_datetime_string() + '\n')

# Format -> "mon 5pm"
# with open()





# Date Checker

In [7]:
def format_ical_datetime(ical_datetime_str):
    """
    Convert iCal datetime string (e.g. '20240405T151500') to formatted date string
    
    Args:
        ical_datetime_str (str): Datetime string in iCal format (YYYYMMDDTHHmmss)
    
    Returns:
        str: Formatted datetime string (YYYY-MM-DD HHmm)
    """
    dt = datetime.strptime(ical_datetime_str, '%Y%m%dT%H%M%S')
    return f"{dt.strftime('%A %Y-%m-%d %H%M')}"

format_ical_datetime('20240405T151500')




'Friday 2024-04-05 1515'

In [8]:
# Create text input widget
ical_input = widgets.Text(
    # value='20240405T151500',
    description='iCal DateTime:',
    style={'description_width': 'initial'}
)

# Create output widget to display result
output = widgets.Output()

# Update function
def on_value_change(change):
    with output:
        output.clear_output()
        try:
            result = format_ical_datetime(change['new'])
            display(widgets.HTML(result))
        except ValueError:
            display("Invalid format. Use: YYYYMMDDTHHmmss")

# Register callback
ical_input.observe(on_value_change, names='value')

# Display widgets
display(ical_input)
display(output)



Text(value='', description='iCal DateTime:', style=DescriptionStyle(description_width='initial'))

Output()

# Data generation strategy

Generate a random description-output pair


Possible strategies:
1. Backdate the reference date to a random date in the past that is within a reasonable range (depends on the phrase) : THIS IS LIKELY TO BE TOO INFLEXIBLE
    - possible flaws include potentially generating nonsense intervals: next year implies a date much later than next week and would need to be accounted for by rule based generations. 
2. Use the reference date and hope that the LLM is good at reasoning about the future. Do some sanity checks on the output ie implied day matches the date provided.
3. Randomise a reference and target date and use the LLM to generate a phrase that will describe the interval between the two dates. I imagine it could be worse at that.



# Basic programmatic patterns


The following patterns will be generated procedurally due to their simplicity, complex patterns can then be evolved from the basic subset using evol-instruct technique.
- mon 12am
- tomorrow
- next tues 1900
- this tues 230pm


In [9]:
import random
import os
from tqdm import tqdm



def ftime(hour, minute, format_str):
    """
    Format hours and minutes to a 12-hour format with am/pm.
    
    Args:
        hour (int): Hour in 24-hour format (0-23)
        minute (int): Minute (0-59)
        format_str (str, optional): Format string using placeholders:
            - {h}: 12-hour hour
            - {m}: minutes (zero-padded)
            - {p}: am/pm
            - {H}: 24-hour hour
            Default: "{h}{p}" for no minutes, "{h}:{m}{p}" for minutes
        
    Returns:
        str: Time formatted according to the specified format
    """
    # Determine am/pm
    period = "am" if hour < 12 else "pm"
    
    # Convert to 12-hour format
    hour_12 = hour if hour <= 12 else hour - 12
    hour_12 = 12 if hour_12 == 0 else hour_12  # Handle midnight (0:00) as 12am

    
    # Format the output using the provided format string
    return format_str.format(
        h=hour_12,
        m=f"{minute:02d}",
        p=period,
        H=hour
    )


## 1. Basic day - time phrase ("monday 10am")

In [10]:
def weekday_time(
    # mon 2pm
    target_weekday,
    target_time: tuple[int],
    language : str,
    refdate : datetime,
    time_fmt : str
):
    """Generate iCalendar format data for a scheduling event.
    
    Args:
        target_weekday (int): The weekday for the event (0=Monday, 6=Sunday)
        target_time (tuple[int]): A tuple of (hour, minute) in 24-hour format
        language (str): Language code (e.g., 'en-US')
        refdate (datetime): Reference date used as the starting point
        
    Returns:
        list[str]: Lines of iCalendar format data for the event
    """
    days = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]

    # Convert reference date to datetime object
    if isinstance(refdate, str):
        ref_date = datetime.datetime.fromisocalendar(refdate)
    else:
        ref_date = refdate
    
    # Format reference date for output
    ref_date_str = ref_date.strftime('%A %Y-%m-%d %H%M')
    
    
    # Get the weekday number of the reference date (0 = Monday, 6 = Sunday)
    ref_weekday = ref_date.weekday()
    
    
    # Calculate days until the next occurrence of the random day
    days_until = (target_weekday - ref_weekday) % 7
    if days_until == 0:
        days_until = 7  # If it's the same day, go to next week
    
    # Calculate the date of the next occurrence
    target_date = ref_date + timedelta(days=days_until)
    
    # Set a random hour for the event (between 9 AM and 9 PM)

    H, M = target_time
    target_date = target_date.replace(hour=H, minute=M, second=0, microsecond=0)
    
    # Format the start and end times for iCalendar
    start_str = target_date.strftime("%Y%m%dT%H%M%S")
    end_str = (target_date + timedelta(hours=1)).strftime("%Y%m%dT%H%M%S")
    
    # Set a simple event title
    event_title = f"Event on {target_date.strftime("%A")}"

    input_phrase = f"{days[target_weekday]} {ftime(*target_time, time_fmt)}"
    

    return [
        "BEGIN:VCALENDAR",
        "VERSION:2.0", 
        "PRODID:-//BERTiCal//EN",
        f"INPUT:{input_phrase}",
        f"LANGUAGE:{language}",
        f"REFDATE:{ref_date_str}",
        "BEGIN:VEVENT",
        f"SUMMARY:{event_title}",
        f"DTSTART:{start_str}",
        f"DTEND:{end_str}",
        "END:VEVENT",
        "END:VCALENDAR"
    ]





    
def random_date(start=datetime(2010,1,1), end=datetime(2098,1,1), min_hour=0, max_hour=24):
    """
    This function will return a random datetime between two datetime 
    objects, with time constrained between min_hour and max_hour (default none).
    """
    delta = end - start
    int_delta = (delta.days * 24 * 60 * 60) + delta.seconds
    random_second = randrange(int_delta)
    random_datetime = start + timedelta(seconds=random_second)
    
    # Constrain the time to be between min_hour and max_hour
    hour = random_datetime.hour
    if hour < min_hour or hour > max_hour:
        # Replace the time with a random hour within the allowed range
        new_hour = randrange(min_hour, max_hour + 1)
        random_datetime = random_datetime.replace(hour=new_hour, minute=randrange(60))
    
    return random_datetime

# Example usage
ref_date = random_date()  # Friday 2024-04-05 15:15
target_time = (10, 00)  # 10:00 AM

language = "en-US"

ical_lines = weekday_time(randint(0,6), target_time, language, ref_date, "{h}:{m}{p}")

# Print the complete iCalendar output
for line in ical_lines:
    print(line)

"""example output
BEGIN:VCALENDAR
VERSION:2.0
PRODID:-//BERTiCal//EN
INPUT:monday 10am
LANGUAGE:en-US
REFDATE:Friday 2024-04-05 1515
BEGIN:VEVENT
SUMMARY:Event on Thursday
BEGIN:VALARM
END:VALARM
DTSTART:20240411T100000
DTEND:20240411T110000
END:VEVENT
END:VCALENDAR
"""



BEGIN:VCALENDAR
VERSION:2.0
PRODID:-//BERTiCal//EN
INPUT:tuesday 10:00am
LANGUAGE:en-US
REFDATE:Wednesday 2042-02-19 1029
BEGIN:VEVENT
SUMMARY:Event on Tuesday
DTSTART:20420225T100000
DTEND:20420225T110000
END:VEVENT
END:VCALENDAR


'example output\nBEGIN:VCALENDAR\nVERSION:2.0\nPRODID:-//BERTiCal//EN\nINPUT:monday 10am\nLANGUAGE:en-US\nREFDATE:Friday 2024-04-05 1515\nBEGIN:VEVENT\nSUMMARY:Event on Thursday\nBEGIN:VALARM\nEND:VALARM\nDTSTART:20240411T100000\nDTEND:20240411T110000\nEND:VEVENT\nEND:VCALENDAR\n'

In [11]:
# Generate 16000 examples of "monday 10am" with different dates
import os
from tqdm import tqdm
from datetime import datetime, timedelta

# Ensure the data directory exists
os.makedirs("./data", exist_ok=True)


num_examples = 16000


def rng(a, b):
    return lambda : randint(a, b)


def gen_weekday_time(refdate, time_fmt, weekday, hour, minute):
    if not isinstance(minute, int):
        minute = minute()
    if not isinstance(refdate, datetime):
        refdate = refdate()
    if not isinstance(hour, int):
        hour = hour()
        

    return weekday_time(
        weekday(),
        (hour,minute),
        language='en-US',
        refdate=refdate,
        time_fmt=time_fmt
    )
    

formats = [
    "{h}{p}",              # "10am"
    "{h}:{m}{p}",          # "10:30am"
    "{H}:{m}",            # "10:30"
    "{h} {p}",            # "10 am"
    "{h}:{m} {p}",        # "10:30 am"
    "{h} o'clock {p}",    # "10 o'clock am"
]


formats = {
    "hour_meridian": lambda : gen_weekday_time(
        refdate = random_date,
        time_fmt= "{h}{p}",
        weekday = rng(0,6),
        hour = rng(0,23),
        minute = 0
    ),
    "hour_minute_meridian": lambda : gen_weekday_time(
        refdate = random_date,
        time_fmt = "{h}:{m}{p}",
        weekday = rng(0,6),
        hour = rng(0,23),
        minute = rng(0, 59)
    ),
    "hour_minute_no_colon_meridian": lambda : gen_weekday_time(
        refdate = random_date,
        time_fmt="{h}{m}{p}",
        weekday=rng(0,6),
        hour = rng(0,23),
        minute = rng(0, 59)
    ),
    "24hour_minute": lambda : gen_weekday_time(
        refdate = random_date,
        time_fmt = "{H}:{m}",
        weekday = rng(0,6),
        hour = rng(0,23),
        minute = rng(0, 59)
    ),
    "24hour_no_colon": lambda : gen_weekday_time(
        refdate = random_date,
        time_fmt = "{H}{m}",
        weekday = rng(0,6),
        hour = rng(0,23),
        minute = rng(0, 59)
    )
}

# Generate one example of each format
print("Examples of each format:")
for key, format_generator in formats.items():
    example = format_generator()
    print(f"Format {key}:\n {'\n\t'.join(example)}")


Examples of each format:
Format hour_meridian:
 BEGIN:VCALENDAR
	VERSION:2.0
	PRODID:-//BERTiCal//EN
	INPUT:tuesday 8pm
	LANGUAGE:en-US
	REFDATE:Thursday 2079-01-05 0032
	BEGIN:VEVENT
	SUMMARY:Event on Tuesday
	DTSTART:20790110T200000
	DTEND:20790110T210000
	END:VEVENT
	END:VCALENDAR
Format hour_minute_meridian:
 BEGIN:VCALENDAR
	VERSION:2.0
	PRODID:-//BERTiCal//EN
	INPUT:sunday 3:28am
	LANGUAGE:en-US
	REFDATE:Tuesday 2046-09-11 1054
	BEGIN:VEVENT
	SUMMARY:Event on Sunday
	DTSTART:20460916T032800
	DTEND:20460916T042800
	END:VEVENT
	END:VCALENDAR
Format hour_minute_no_colon_meridian:
 BEGIN:VCALENDAR
	VERSION:2.0
	PRODID:-//BERTiCal//EN
	INPUT:friday 1104pm
	LANGUAGE:en-US
	REFDATE:Sunday 2011-12-18 0305
	BEGIN:VEVENT
	SUMMARY:Event on Friday
	DTSTART:20111223T230400
	DTEND:20111224T000400
	END:VEVENT
	END:VCALENDAR
Format 24hour_minute:
 BEGIN:VCALENDAR
	VERSION:2.0
	PRODID:-//BERTiCal//EN
	INPUT:saturday 3:44
	LANGUAGE:en-US
	REFDATE:Thursday 2089-05-12 1817
	BEGIN:VEVENT
	SUMMARY:Eve

# Relative from reference:

- "in 2 days"
- "in 2 hrs

In [12]:
from dataclasses import dataclass

from datetime import datetime, timedelta

from dateutil.rrule import rrulestr, DAILY, WEEKLY, MONTHLY

@dataclass
class Pattern:
    phrases: list[str]
    expr: str

    def __add__(self, other: 'Pattern') -> 'Pattern':
        pass

# Base lambda template
schedule_formatter = lambda phrase, params: (
    lambda phrase, params: {
        "input": phrase.format(**params),
        "ical": f"""
            BEGIN:VCALENDAR
            VERSION:2.0
            PRODID:-//BERTiCal//EN
            BEGIN:VEVENT
            DTSTAMP:{datetime.now().strftime('%Y%m%dT%H%M%SZ')}
            {_handle_time(datetime)}
            {_handle_recurrence()}
            {_handle_duration(params)}
            SUMMARY:{params.get('summary', 'Generated Event')}
            END:VEVENT
            END:VCALENDAR
        """
    }
)


relative_exps = [
    Pattern([
        "in {x} days",
        "{x} days from now",
        "{x}d from now",
        "in {x} days time",
        "{x} days hence",
        "{x} days later",
        "after {x} days",
        "{x} days after",
        "in {x} days' time",
        "{x} days ahead",
        "{x} days forward"
    ], lambda phrase, x: """
        BEGIN:VCALENDAR
        VERSION:2.0
        PRODID:-//BERTiCal//EN
        INPUT:{phrase.format(x)}
        LANGUAGE:en-US
        REFDATE:{REF}
        BEGIN:VEVENT
        SUMMARY:Event on Monday
        DTSTART:20740903T120000
        DTEND:{dtstring(REF + x)}
        END:VEVENT
        END:VCALENDAR
    """),
    Pattern([
        "tomorrow",
        "tom",
        "tmr",
        "tmrw",
        "next day"
    ], lambda phrase, x: """
        BEGIN:VCALENDAR
        VERSION:2.0
        PRODID:-//BERTiCal//EN
        INPUT:{phrase}
        LANGUAGE:en-US
        REFDATE:{REF}
        BEGIN:VEVENT
        SUMMARY:Event Tomorrow
        DTSTART:{dtstring(REF + 1)}
        DTEND:{dtstring(REF + 1)}
        END:VEVENT
        END:VCALENDAR
    """),
    Pattern([
        "yesterday",
        "yest",
        "yday",
        "a day ago"
    ], lambda phrase, x: """
        BEGIN:VCALENDAR
        VERSION:2.0
        PRODID:-//BERTiCal//EN
        INPUT:{phrase}
        LANGUAGE:en-US
        REFDATE:{REF}
        BEGIN:VEVENT
        SUMMARY:Event Yesterday
        DTSTART:{dtstring(REF - 1)}
        DTEND:{dtstring(REF - 1)}
        END:VEVENT
        END:VCALENDAR
    """)

]




    


}

TypeError: unhashable type: 'list'

In [None]:
import os
import json
from tqdm import tqdm

# Create the main data directory if it doesn't exist
data_dir = "data"
if not os.path.exists(data_dir):
    os.makedirs(data_dir)

# Number of files per format
num_files = 16
examples_per_file = 16000

print(f"Generating {num_examples} examples for each format, split into {num_files} files with {examples_per_file} examples each")

# For each format, create a subfolder and generate the examples
for format_name, format_generator in tqdm(formats.items(), desc="Formats"):
    # Create a subfolder for this format
    format_dir = os.path.join(data_dir, format_name)
    if not os.path.exists(format_dir):
        os.makedirs(format_dir)
    else: continue
    
    # Generate examples and save them to files
    for file_idx in tqdm(range(num_files), desc=f"Files for {format_name}", leave=False):
        examples = []
        for _ in range(examples_per_file):
            example = format_generator()
            examples.append(example)
        
        # Save to a text file
        file_path = os.path.join(format_dir, f"{format_name}_{file_idx+1}.txt")
        with open(file_path, 'w') as f:
            for example in examples:
                f.write('\n'.join(example))
                f.write('\n\n')  # Add a blank line between examples
        
        # Also save as JSON for easier processing if needed
        json_path = os.path.join(format_dir, f"{format_name}_{file_idx+1}.json")
        with open(json_path, 'w') as f:
            json.dump(examples, f, indent=2)

print(f"Data generation complete. Generated {num_examples} examples for each of the {len(formats)} formats.")


NameError: name 'num_examples' is not defined