<a href="https://colab.research.google.com/github/tomasonjo/blogs/blob/master/h20_llm/LLM_train_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install openai

In [None]:
import json
import os

import openai
import pandas as pd

# Prefer the OPENAI_API_KEY environment variable so the secret never has to be
# saved inside the notebook; the literal placeholder is only the fallback used
# when that variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "OPENAI_KEY")

In [None]:
# Few-shot examples that get interpolated into the system prompt.
# Inside the string, every line starting with "# " is a natural-language
# question and the lines that follow it are the Cypher query answering it.
examples = """
# Who played in Top Gun?
MATCH (m:Movie {title: "Top Gun"})<-[r:ACTED_IN]-(a)
RETURN {actor: a.name, role: r.role} AS result
# What is the plot of the Copycat movie?
MATCH (m:Movie {title: "Copycat"})
RETURN {plot: m.plot} AS result
# Did Luis Guzmán appear in any other movies?
MATCH (p:Person {name:"Luis Guzmán"})-[r:ACTED_IN]->(movie)
RETURN {movie: movie.title, role: r.role} AS result
# Do you know of any matrix movies?
MATCH (m:Movie)
WHERE toLower(m.title) CONTAINS toLower("matrix")
RETURN {movie:m.title} AS result
# How many reviews does each Matrix movie have?
MATCH (m:Movie)<-[:RATED]-(u:User)
WHERE m.title CONTAINS 'Matrix'
WITH m, count(*) AS reviews
RETURN m.title AS movie, reviews
ORDER BY reviews DESC LIMIT 5;
# Recommend me a similar movie to Crimson Tide
MATCH (m:Movie {title: 'Crimson Tide'})<-[:RATED]-
      (u:User)-[:RATED]->(rec:Movie)
WITH rec, COUNT(*) AS usersWhoAlsoWatched
ORDER BY usersWhoAlsoWatched DESC LIMIT 25
RETURN rec.title AS recommendation, usersWhoAlsoWatched
# Find me a good comedy?
MATCH (m:Movie)-[:IN_GENRE]->(:Genre {name:"Comedy"})
RETURN {movie: m.title} AS result
ORDER BY m.imdbRating DESC LIMIT 1
# When was Copycat released?
MATCH (m:Movie {title:"Copycat"})
RETURN {year: m.year} AS result
"""

In [None]:
# System prompt sent with every generation request. It embeds the few-shot
# `examples` string and shows the one-JSON-object-per-line reply format that
# parse_response() decodes. Doubled braces are f-string escapes for literal
# braces. The format example is written as valid JSON (double-quoted keys,
# single-quoted Cypher literals) so replies imitating it can be parsed.
system = f"""
You are an assistant that has only one task.
You need to generate 100 Cypher query examples based on the movie dataset.
Do not respond with any explanations and do not apologize.
Here are some query examples:
{examples}
Respond with format where each line represents one example:
{{"instruction": "Who played in Top Gun?", "output": "MATCH (m:Movie {{title: 'Top Gun'}})<-[r:ACTED_IN]-(a) RETURN {{actor: a.name, role: r.role}} AS result"}}
Do not return any examples that cannot be inferred from provided queries, so no new node labels or relationship types.
Do not include examples I have provided and do not use Matrix in the examples.
"""

In [None]:
def parse_response(text):
  """Parse a reply that holds one JSON object per line.

  Args:
    text: Raw reply text. May be an empty string or None.

  Returns:
    A list with one dict for every line that decoded to a JSON object, in
    the order the lines appeared. Blank lines, lines that are not valid JSON
    and lines that decode to something other than an object are left out.
  """
  if not text:
    return []

  parsed_list = []
  # Split on "\n" only: str.splitlines() would also break on characters such
  # as U+2028, which may legally appear unescaped inside a JSON string.
  for line in text.split('\n'):
    line = line.strip()
    if not line:
      continue
    try:
      record = json.loads(line)
    except json.JSONDecodeError:
      # Best effort: a line that is not valid JSON is skipped instead of
      # aborting the whole batch. Other exceptions are allowed to propagate.
      continue
    # Scalars and arrays are valid JSON but are not usable as table records.
    if isinstance(record, dict):
      parsed_list.append(record)

  return parsed_list

In [None]:
# A single GPT-4 reply is limited in length (the original note here puts the
# output limit at around 1100 tokens), so the training set is collected over
# 20 requests, with a raised temperature to encourage varied Cypher examples.

training_data = []
for i in range(20):
    print(f"Create {i} batch of examples")
    completions = openai.ChatCompletion.create(
        model="gpt-4",
        temperature=0.6,
        max_tokens=6000,
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": "Generate 10 examples"},
        ],
    )
    response = completions.choices[0].message.content
    training_data.extend(parse_response(response))

# Keep only records that carry both expected text fields. A record without a
# string "instruction" would otherwise become NaN in the frame and break the
# string concatenation below, and stray keys would turn into extra CSV columns.
train_records = [
    {"instruction": el["instruction"], "output": el["output"]}
    for el in training_data
    if isinstance(el, dict)
    and isinstance(el.get("instruction"), str)
    and isinstance(el.get("output"), str)
]

# Naming the columns keeps the frame well-formed even if nothing was parsed;
# exact duplicate rows from repeated requests are dropped.
df = pd.DataFrame(train_records, columns=["instruction", "output"]).drop_duplicates()
df["instruction"] = [
    "Create a Cypher statement to answer the following question: " + el
    for el in df["instruction"]
]
df.to_csv("train.csv", index=False)

In [None]:
# Uncomment to download train.csv to your machine when running in Google Colab.
#from google.colab import files
#files.download('train.csv')

In [None]:
# Validation set: generated with the same prompt and settings as the training
# requests, but with only 4 requests.
val_data = []
for i in range(4):
    print(f"Create {i} batch of examples")
    completions = openai.ChatCompletion.create(
        model="gpt-4",
        temperature=0.6,
        max_tokens=6000,
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": "Generate 10 examples"},
        ],
    )
    response = completions.choices[0].message.content
    val_data.extend(parse_response(response))

# Keep only records that carry both expected text fields. A record without a
# string "instruction" would otherwise become NaN in the frame and break the
# string concatenation below, and stray keys would turn into extra CSV columns.
val_records = [
    {"instruction": el["instruction"], "output": el["output"]}
    for el in val_data
    if isinstance(el, dict)
    and isinstance(el.get("instruction"), str)
    and isinstance(el.get("output"), str)
]

# Naming the columns keeps the frame well-formed even if nothing was parsed;
# exact duplicate rows from repeated requests are dropped.
df = pd.DataFrame(val_records, columns=["instruction", "output"]).drop_duplicates()
df["instruction"] = [
    "Create a Cypher statement to answer the following question: " + el
    for el in df["instruction"]
]

# The prompt is identical to the one used for train.csv, so the same question
# can come back again. Remove validation rows whose instruction is already in
# train.csv so the validation set does not overlap the training set. If
# train.csv is missing or unreadable, nothing is filtered.
try:
    train_instructions = set(pd.read_csv("train.csv")["instruction"])
except (FileNotFoundError, KeyError, pd.errors.EmptyDataError):
    train_instructions = set()
df = df[~df["instruction"].isin(train_instructions)]

df.to_csv("val.csv", index=False)

In [None]:
# Uncomment to download val.csv to your machine when running in Google Colab.
#from google.colab import files
#files.download('val.csv')