In [2]:
import json

with open("schema.json") as schema_file:
    schema = json.load(schema_file)

In [4]:
print(json.dumps(schema, indent=4))

{
    "$schema": "http://json-schema.org/draft-07/schema",
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "instruction": {
                "type": "string"
            },
            "input": {
                "type": "string"
            },
            "output": {
                "type": "string"
            }
        },
        "additionalProperties": false,
        "required": [
            "instruction",
            "input",
            "output"
        ]
    },
    "additionalItems": false
}


In [5]:
with open("alpaca_data_cleaned.json") as dataset:
    data = json.load(dataset)

In [6]:
data[0]

{'instruction': 'Give three tips for staying healthy.',
 'input': '',
 'output': '1. Eat a balanced and nutritious diet: Make sure your meals are inclusive of a variety of fruits and vegetables, lean protein, whole grains, and healthy fats. This helps to provide your body with the essential nutrients to function at its best and can help prevent chronic diseases.\n\n2. Engage in regular physical activity: Exercise is crucial for maintaining strong bones, muscles, and cardiovascular health. Aim for at least 150 minutes of moderate aerobic exercise or 75 minutes of vigorous exercise each week.\n\n3. Get enough sleep: Getting enough quality sleep is crucial for physical and mental well-being. It helps to regulate mood, improve cognitive function, and supports healthy growth and immune function. Aim for 7-9 hours of sleep each night.'}

In [26]:
queries = [item["instruction"] for item in data if not item["input"]]

In [27]:
queries[0]

'Give three tips for staying healthy.'

In [28]:
len(queries)

32603

In [29]:
# random sample of 100
import random

sample_queries = random.sample(queries, 100)

In [30]:
sample_queries

['Describe the applicability of GPT models to analytics.',
 'Answer the question: What is an artificial neural network?',
 'Imagine you are writing a script for a movie. Describe the house of the main character.',
 'Create a step-by-step tutorial on setting up a cloud based development environment.',
 'List five ways to perform data visualization with Python.',
 'Come up with a sentence using the word "meditation".',
 'Imagine you are in a forest and describe what you see.',
 'Fully simplify the following expression: [(2+3+4+5)÷2] + [(2*5+8)÷3].',
 'Compose a thank you message to a teacher.',
 'A shop has six apples and two oranges. What is the ratio of apples to oranges?',
 'Generate a descriptive phrase that explains the colors of a sunset.',
 'Design a 3-step tutorial explaining how to use the app Snapchat.',
 'Describe a situation in which two people might disagree but still work together to solve a problem.',
 'Create an example of a data structure that can store employees’ names 

### Enter OPENAI API KEY BELOW

In [1]:
import openai
# personal
openai.api_key = ""
openai.Model.list()



AuthenticationError: Invalid authorization header

In [32]:
model = "gpt-3.5-turbo-instruct"

In [33]:
prompt = """
Ouput whether the following queries are of type FACTUAL or CREATIVE
only output the word FACTUAL or CREATIVE and nothing else

examples:
Prompt: Generate five adjectives to describe an orange
CREATIVE

Prompt: Generate a comically bad pun
CREATIVE

Prompt: What is the Ideal Gas Law?
FACTUAL

Explain the physiological process of breathing.
FACTUAL

Prompt: """

In [38]:
print(prompt)
print(sample_queries[0])


Ouput whether the following queries are of type FACTUAL or CREATIVE
only output the word FACTUAL or CREATIVE and nothing else

examples:
Prompt: Generate five adjectives to describe an orange
CREATIVE

Prompt: Generate a comically bad pun
CREATIVE

Prompt: What is the Ideal Gas Law?
FACTUAL

Explain the physiological process of breathing.
FACTUAL

Prompt: 
Describe the applicability of GPT models to analytics.


In [74]:
query = sample_queries[2]

In [75]:
chat_prompt= prompt + query

In [76]:
chat_prompt

'\nOuput whether the following queries are of type FACTUAL or CREATIVE\nonly output the word FACTUAL or CREATIVE and nothing else\n\nexamples:\nPrompt: Generate five adjectives to describe an orange\nCREATIVE\n\nPrompt: Generate a comically bad pun\nCREATIVE\n\nPrompt: What is the Ideal Gas Law?\nFACTUAL\n\nExplain the physiological process of breathing.\nFACTUAL\n\nPrompt: Imagine you are writing a script for a movie. Describe the house of the main character.'

In [81]:
result = openai.Completion.create(
  model="gpt-3.5-turbo-instruct",
  prompt=chat_prompt,
  max_tokens=8,
  temperature=0
)
result["choices"][0]["text"].strip()

'CREATIVE'

In [89]:
tagged_queries = []
sample_queries = random.sample(queries, 700)

In [90]:
for query in sample_queries:
    chat_prompt= prompt + query
    result = openai.Completion.create(
      model="gpt-3.5-turbo-instruct",
      prompt=chat_prompt,
      max_tokens=8,
      temperature=0
    )
    tagged_queries.append((query, result["choices"][0]["text"].strip()))

In [91]:
tagged_queries

[('Describe the biggest challenge facing health care in the United States today.',
  'FACTUAL'),
 ('Design a campaign ad for a candidate running for public office.',
  'CREATIVE'),
 ('Explain why people are concerned about climate change.', 'CREATIVE'),
 ('List the steps to creating a 3D model using Blender.', 'FACTUAL'),
 ('Analyze the causes of the U.S civil war.', 'FACTUAL'),
 ('Generate a list of popular cities on the east coast of the US.', 'FACTUAL'),
 ('Generate a list of questions to ask participants before a free web design workshop.',
  'CREATIVE'),
 ('Explain the concept of Big Data and what it means for companies and customers.',
  'FACTUAL'),
 ('Generate a few words for a crossword puzzle on the topic of computer science.',
  'CREATIVE'),
 ('Suggest five books that are suitable for teenagers.', 'CREATIVE'),
 ('Generate a restaurant menu item.', 'CREATIVE'),
 ('Suggest three healthy snacks to pack on a hike.', 'CREATIVE'),
 ('What is the pH level of pineapple juice?', 'FACT

In [92]:
tagged_queries[0][1]

'FACTUAL'

In [93]:
creative_queries = [query[0] for query in tagged_queries if query[1] == "CREATIVE"]

In [94]:
creative_queries

['Design a campaign ad for a candidate running for public office.',
 'Explain why people are concerned about climate change.',
 'Generate a list of questions to ask participants before a free web design workshop.',
 'Generate a few words for a crossword puzzle on the topic of computer science.',
 'Suggest five books that are suitable for teenagers.',
 'Generate a restaurant menu item.',
 'Suggest three healthy snacks to pack on a hike.',
 'Generate a marketing slogan for a newly launched vitamin supplement.',
 'Create a list 5 jobs that require programming skills.',
 'Create five short headlines for a news story about a movie star.',
 'Invent a new work for the color purple.',
 'Suggest four content marketing strategies for a small business.',
 'Generate an opening dialogue for a conversation about rain.',
 'Propose a solution to eliminate cheating in schools.',
 'Write a product description for an iPhone 12.',
 'Identify and explain a new trend in digital marketing.',
 'Create a style

In [95]:
len(creative_queries)

363