In [10]:
import os
import yaml
import requests
import openai
import textwrap

In [9]:
def fmt(text):
    wrapper = textwrap.TextWrapper(width=80, initial_indent="", subsequent_indent="")
    return wrapper.fill(text)

In [35]:
import re
import textwrap

def wrap_markdown_text(text, width=120):
    # Split the text into lines and wrap each line to no more than 80 columns,
    # except for lines inside tables
    lines = text.split("\n")
    wrapped_lines = []
    in_table = False
    for line in lines:
        # Check if the line starts or ends a table
        if re.match(r"^\s*\|", line):
            in_table = True
        elif re.match(r"^\s*$", line):
            in_table = False

        # Wrap the line if it's not inside a table
        if not in_table:
            wrapped_line = textwrap.fill(line, width=width)
        else:
            wrapped_line = line

        wrapped_lines.append(wrapped_line)

    # Join the wrapped lines back into a single string
    wrapped_text = "\n".join(wrapped_lines)

    return wrapped_text

In [41]:
import re
import textwrap

def wrap_markdown_text(text):
    # Split the text into lines and wrap each line to no more than 80 columns,
    # except for lines inside tables, code blocks, and triple-quoted text
    lines = text.split("\n")
    wrapped_lines = []
    in_table = False
    in_code_block = False
    in_triple_quotes = False
    for line in lines:
        # Check if the line starts or ends a table
        if re.match(r"^\s*\|", line):
            in_table = True
        elif re.match(r"^\s*$", line):
            in_table = False

        # Check if the line is inside a code block
        if re.match(r"^```", line):
            in_code_block = not in_code_block

        # Check if the line is inside triple quotes
        if re.match(r'^\s*"""', line):
            in_triple_quotes = not in_triple_quotes

        # Wrap the line if it's not inside a table, code block, or triple-quoted text
        if not in_table and not in_code_block and not in_triple_quotes:
            wrapped_line = textwrap.fill(line, width=80)
        else:
            wrapped_line = line

        wrapped_lines.append(wrapped_line)

    # Join the wrapped lines back into a single string
    wrapped_text = "\n".join(wrapped_lines)

    return wrapped_text


In [3]:

def get_gh_token():
    config_path = os.path.join(os.path.expanduser("~"), ".config", "gh", "hosts.yml")

    with open(config_path, "r") as f:
        config = yaml.safe_load(f)

    if "github.com" in config and "oauth_token" in config["github.com"]:
        return config["github.com"]["oauth_token"]

    raise Exception("No GitHub token found in config file")

token = get_gh_token()

In [43]:
import requests

def get_documentation_issues_and_comments(owner, repo, token):
    # Retrieve all issues for the repository
    issues_url = f"https://api.github.com/repos/{owner}/{repo}/issues?state=all&per_page=100&labels=documentation"
    issues_response = requests.get(issues_url, headers={"Authorization": f"Bearer {token}"})
    issues = issues_response.json()

    # Retrieve all comments for each issue and concatenate them onto the issue string
    for issue in issues:
        issue_number = issue["number"]
        comments_url = f"https://api.github.com/repos/{owner}/{repo}/issues/{issue_number}/comments?per_page=100"
        comments_response = requests.get(comments_url, headers={"Authorization": f"Bearer {token}"})
        comments = comments_response.json()

        # Concatenate issue and comments as text
        text = f"# {issue['title']}\n\n{issue['body']}\n\n"
        for comment in comments:
            text += f"## Comment by {comment['user']['login']} on {comment['created_at']}:\n\n{comment['body']}\n\n"
        
        # Yield the issue text with all comments concatenated
        yield issue_number, text

issues = get_documentation_issues_and_comments("webdataset", "webdataset", token)
print(wrap_markdown_text(next(iter(issues))[1]))

# Clarification on what `to_tuple()` does?

Sorry I'm confused -- It's unclear from the documentation (and my reading of
[the code](https://github.com/webdataset/webdataset/blob/cb1aa32aca3f5fa3f214c38
a2145b14cd28629cc/webdataset/compat.py#L66) ) what the purpose of `.to_tuple()`
is.

It is used throughout the documentation (e.g., ["Getting
Started"](https://webdataset.github.io/webdataset/gettingstarted/),  ["How it
Works"](https://webdataset.github.io/webdataset/howitworks/#how-it-works)) and
[README](https://github.com/webdataset/webdataset#readme) yet never described
what it actually is or does.  It seems to be absolutely crucial since it appears
in every example, but... what is it?   And how are we to learn which arguments
to use with it?

For example, what is the difference between `.to_tuple("png", "json")` and
`.to_tuple("png;jpg;jpeg", "json")`?  Why is "json" the last argument?

Does this mean that it's going to produce JSON text for the Python tuple
`("png", "jpg", "jpeg")`

In [39]:
import openai

openai.api_key = open(os.path.expanduser("~/.ssh/openai-key")).read().strip()

model_id = 'gpt-3.5-turbo'

# model_id = 'gpt-4.0'

initial_prompt = """
Here is a discussion of an issue from the WebDataset issue tracker.

Please summarize this issue in the form of an FAQ entry.

Summarize the question in a single sentence and precede it with "Q: ".

Then leave a blank line and start your answer with "A: ". Provide a single paragraph for an answer. You may include a code example.

Use markdown formatting to make the answer more readable.
"""

initial_context = [
    {'role': 'system', 'content': initial_prompt}
]

def chatgpt(prompt, conversation, role='user', model_id=model_id):
    if conversation is None:
        conversation = initial_context.copy()
    conversation.append({'role': role, 'content': prompt})
    try:
        response = openai.ChatCompletion.create(
            model=model_id,
            messages=conversation
        )
    except openai.error.InvalidRequestError as e:
        msg = f"Error: {str(e)[:80]}..."
        return msg, conversation
    # api_usage = response['usage']
    # print('Total token consumed: {0}'.format(api_usage['total_tokens']))
    # stop means complete
    # print(response['choices'][0].finish_reason)
    # print(response['choices'][0].index)
    rrole, response = response.choices[0].message.role, response.choices[0].message.content
    conversation.append({'role': rrole, 'content': response})
    return response, conversation


In [44]:

issue_number, issue_text = next(iter(issues))
response, _ = chatgpt(issue_text, None)
print(wrap_markdown_text(response))

Q: Why do periods in the base part of the filename cause issues in WebDataset
and how can it be resolved?

A: Periods in the base part of the filename cause issues in WebDataset as the
periods are used to support multiple extensions like ".seg.jpg". It is
recommended to deal with this during dataset creation by avoiding such
conventions. Also, in several places, you can use "glob" patterns like "*.mp3"
to match extensions. Mapping the filenames between the tariterator and the
tarfile_to_samples stages in the pipeline is possible, but not recommended.


In [48]:
import sys
from itertools import islice
for issue_number, issue_text in issues:
    print(issue_number, repr(issue_text[:80]), file=sys.stderr)
    ofile = "faq/issue-%04d.md" % issue_number
    if os.path.exists(ofile):
        continue
    response, _ = chatgpt(issue_text, None)
    with open(ofile, "w") as f:
        f.write(wrap_markdown_text(response))


79 "# Guidance on using Webdataset for small embeddings\n\nI'm trying to use Webdatase"
73 '# Adding option to shuffle shards before splitting to workers, based on current '
71 '# Unexpected Shuffling Behavior\n\nEven with buffer size and initial are set to th'
68 '# Program stop at some iteration\n\nHi @tmbdev Thanks for sharing the excellent li'
66 '# Pytorch Lightning integration\n\nHi\r\n\r\nCurrently, webdataset dataset using the d'
49 '# Path separator not decoded correctly\n\nI tried to open a local tar file in Wind'
36 '# Using webdataset with torch-xla in multiprocessing context\n\nI actually wrote a'


In [51]:
import glob

result = ""
for fname in sorted(glob.glob("faq/issue-*.md")):
    entry = open(fname).read()
    result += entry + "\n\n---\n\n"
    
with open("FAQ.md", "w") as f:
    f.write(result)