# Convert to Markdown with ChatGPT

In [None]:
import re
import pandas as pd
from openai import OpenAI
import streamlit as st
from tqdm.auto import tqdm

# Hook tqdm into pandas so that `progress_apply` (used further down when
# converting the chunks) is available and reports progress.
tqdm.pandas()

# OpenAI API client; the key is looked up in Streamlit's secrets under
# "OPENAI_API_KEY".
client = OpenAI(api_key=st.secrets["OPENAI_API_KEY"])

## Parsing the $\LaTeX$ File
- A Prompt Pattern Catalog to Enhance Prompt Engineering with ChatGPT
- https://arxiv.org/abs/2302.11382

In [None]:
# Load the raw LaTeX source of the paper into memory.
# The encoding is pinned to UTF-8 so the read does not depend on the
# platform's default locale encoding (which differs between systems).
with open("./data/2302.11382v1", "r", encoding="utf-8") as f:
  doc = f.read()
# Preview the first 100 characters (notebook cell output).
doc[:100]

In [None]:
def get_name(s):
  r"""Extract a heading title from the text that follows a sectioning command.

  `s` is expected to be the text found right after a command such as
  ``\section`` (i.e. one piece of ``doc.split("\\section")``), for example
  ``"{Introduction}\nBody..."``.

  The title is the text between the opening brace and the first closing
  brace. Anything in front of the opening brace -- the ``*`` of a starred
  command or an optional ``[short title]`` argument -- is skipped.
  Backslashes are removed and every run of whitespace is collapsed to a
  single space.

  Args:
    s: Text starting immediately after the sectioning command name.

  Returns:
    The cleaned-up title as a single-line string.
  """
  head = s.split('}')[0]
  if '{' in head:
    # Start after the opening brace instead of assuming it is the very
    # first character, so "*{Title" and "[short]{Title" are handled too.
    head = head.split('{', 1)[1]
  else:
    # No opening brace at all: keep the previous behaviour of dropping
    # the first character.
    head = head[1:]
  name = head.replace('\\', '').strip()
  return re.sub(r'\s+', ' ', name)

In [None]:
# Separate the bibliography from the main text. Unpacking into exactly two
# names means the marker must occur exactly once in `doc`; otherwise this
# line raises ValueError.
doc, ref = doc.split("\\begin{thebibliography}")
# `split` drops the separator, so put the marker back in front of the
# bibliography text.
ref = "\\begin{thebibliography}" + ref

In [None]:
# One (title, body) pair per \section. The text in front of the first
# \section is skipped here; a body is everything after the closing brace
# of its title, with surrounding whitespace removed.
sections = [
  (get_name(part), part.partition('}')[2].strip())
  for part in doc.split("\\section")[1:]
]
# Tabular preview (notebook cell output).
pd.DataFrame(sections, columns=['section', 'latex'])

In [None]:
# Break every section body on \subsection. The piece in front of the first
# \subsection is kept untouched under an empty subsection title; every
# later piece is split into its title and the text after the title's brace.
subsections = [
  (sec, get_name(part), part.partition('}')[2].strip()) if idx else (sec, "", part)
  for sec, sec_body in sections
  for idx, part in enumerate(sec_body.split("\\subsection"))
]
# Tabular preview (notebook cell output).
pd.DataFrame(subsections, columns=['section', 'subsection', 'latex'])

In [None]:
# Same treatment one level deeper: break every subsection body on
# \subsubsection, keeping the leading piece under an empty title.
subsubsections = [
  (sec, sub, get_name(part), part.partition('}')[2].strip()) if idx else (sec, sub, "", part)
  for sec, sub, sub_body in subsections
  for idx, part in enumerate(sub_body.split("\\subsubsection"))
]
# Tabular preview (notebook cell output).
pd.DataFrame(subsubsections, columns=['section', 'subsection', 'subsubsection', 'latex'])

In [None]:
# Build the chunks that will be sent to the model. Pieces of 100 characters
# or fewer are dropped; every remaining piece is prefixed with its
# section / subsection / subsubsection headings so it carries its own context.
chunks = [
  (sec, sub, subsub, f"\\section{{{sec}}}\n\\subsection{{{sub}}}\n\\subsubsection{{{subsub}}}\n" + body)
  for sec, sub, subsub, body in subsubsections
  if len(body) > 100
]
# The front matter (everything before the first \section) and the
# bibliography are added as two extra chunks at the end.
chunks += [
  ("Authors and Abstract", '', '', doc.partition("\\section")[0]),
  ("Bibliography", '', '', ref),
]

df_chunks = pd.DataFrame(chunks, columns=['section', 'subsection', 'subsubsection', 'latex'])
df_chunks

In [None]:
# Table of contents: the distinct heading combinations, rendered as a
# Markdown table and printed.
toc_mkdn = (
  df_chunks[['section', 'subsection', 'subsubsection']]
  .drop_duplicates()
  .to_markdown()
)
print(toc_mkdn)

In [None]:
# Running conversation that is passed as `messages` on every request.
# It starts with the system prompt only; latex_to_mkdn() appends each user
# chunk and the model's reply, so this list grows as chunks are converted.
history = [
  {
    "role": "system",
    "content": [
      {
        "type": "text",
        "text": "You convert a part of LaTeX document to a markdown text. Do NOT print anything else. Ignore auxiliary latex tags, but keep the citation code for reference."
      }
    ]
  },
]

def mk_msg(text, role='user'):
  """Build a chat message dict holding a single text content part.

  Args:
    text: Message body; it is formatted into a string.
    role: Role the message is attributed to. Defaults to 'user'.

  Returns:
    A dict with a "role" key and a "content" key, where "content" is a
    one-element list containing a {"type": "text", "text": ...} part.
  """
  return {
    # Honour the caller-supplied role instead of always emitting "user".
    "role": role,
    "content": [
      {
        "type": "text",
        "text": f"{text}"
      }
    ]
  }

def latex_to_mkdn(latex_chunk):
  """Convert one LaTeX chunk to Markdown with the chat completions API.

  The request consists of the module-level `history` followed by the new
  chunk as a user message. After a successful call the user message and the
  model's reply are both appended to `history`.

  NOTE: `history` grows with every successful call, so each request resends
  all earlier chunks and replies.

  Args:
    latex_chunk: LaTeX text to convert.

  Returns:
    The `content` of the first choice's message.

  Raises:
    Whatever `client.chat.completions.create` raises; in that case `history`
    is left unchanged.
  """
  request = mk_msg(latex_chunk)
  response = client.chat.completions.create(
    model="gpt-4o",
    # Send the pending request alongside the history without mutating it yet.
    messages=history + [request],
    temperature=0.3,
    max_tokens=4095,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0
  )
  reply = response.choices[0].message
  # Commit the exchange only once the call has succeeded, so a failed request
  # does not leave an unanswered user message in the shared history.
  history.extend([request, reply])
  return reply.content

In [None]:
# Convert every LaTeX chunk, row by row with a progress bar, and store the
# results in a new 'text' column.
df_chunks['text'] = df_chunks['latex'].progress_apply(latex_to_mkdn)

In [None]:
# Show the table (notebook cell output); once the conversion cell has run
# it includes the 'text' column.
df_chunks

In [None]:
# Persist the chunk table as a CSV file in ./data, next to the LaTeX source.
df_chunks.to_csv("./data/2302.11382v1.csv")