In [26]:
import pyarrow as pa
import pyarrow.parquet as pq
import pandas as pd

# Seed prompts for the starter dataset; each one becomes the human turn of a conversation.
human_prompts = [
    "What is 2+2?",
    "Explain why the sky is blue.",
    "What is the capital of France?",
]

# Pair every prompt with the same "Quizzable" gpt reply, giving a list of two-turn conversations.
conversations = [
    [
        {"from": "human", "value": prompt},
        {"from": "gpt", "value": "Quizzable"},
    ]
    for prompt in human_prompts
]

# Arrow schema: a single 'conversations' column whose rows are lists of
# {from, value} string structs (one struct per turn).
turn_type = pa.struct([
    ("from", pa.string()),
    ("value", pa.string()),
])
schema = pa.schema([pa.field("conversations", pa.list_(turn_type))])

# Build a one-column DataFrame, then convert it to an Arrow table with the explicit schema.
df = pd.DataFrame({"conversations": conversations})
table = pa.Table.from_pandas(df, schema=schema)

# Persist the table as a Parquet file and report success.
pq.write_table(table, "conversational_dataset_nested.parquet")
print("Dataset saved as conversational_dataset_nested.parquet!")


Dataset saved as conversational_dataset_nested.parquet!


In [27]:
import os

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Arrow schema shared by the helpers below: a single 'conversations' column
# whose rows are lists of {from, value} string structs (one struct per turn).
turn_type = pa.struct([
    ("from", pa.string()),
    ("value", pa.string()),
])
schema = pa.schema([pa.field("conversations", pa.list_(turn_type))])

# Function to prompt for user input and GPT output
def get_conversation_entry():
    """Prompt for one human/GPT exchange and return it as a two-turn conversation.

    Typing "exit" (any letter case, surrounding whitespace ignored) at either
    prompt cancels the entry. The human prompt is checked before the GPT
    prompt is shown, so exiting never requires answering a second question.

    Returns:
        A list ``[{"from": "human", "value": ...}, {"from": "gpt", "value": ...}]``
        holding the text exactly as typed, or ``None`` when "exit" was entered.
    """
    def _is_exit(text):
        # Tolerate stray spaces and capitalisation around the sentinel word.
        return text.strip().lower() == "exit"

    user_input = input("Human: ")
    if _is_exit(user_input):
        return None  # Bail out before asking for the GPT side.

    gpt_output = input("GPT: ")
    if _is_exit(gpt_output):
        return None

    # Stored values are the raw inputs; only the exit check normalises the text.
    return [
        {"from": "human", "value": user_input},
        {"from": "gpt", "value": gpt_output},
    ]

# Function to append data to the Parquet file
def append_to_parquet(new_conversation, path="conversational_dataset_nested.parquet"):
    """Append one conversation as a new row of the Parquet dataset at ``path``.

    The whole file is read, the new row is concatenated, and the combined
    table is written back to the same path.

    Args:
        new_conversation: The conversation to store, e.g. the list of
            ``{"from": ..., "value": ...}`` dicts built by
            ``get_conversation_entry``.
        path: Parquet file to update. Defaults to the notebook's dataset file.

    Raises:
        Any error raised by ``pq.read_table`` for an existing file (for example
        a corrupt file). Such errors are deliberately not swallowed: treating
        them as "file missing" would overwrite the dataset with only the new row.
    """
    if os.path.exists(path):
        # Read errors propagate so a damaged dataset is never silently replaced.
        existing_table = pq.read_table(path)
    else:
        # No dataset yet: start from an empty table using the module-level schema.
        existing_table = pa.Table.from_pandas(
            pd.DataFrame({"conversations": []}), schema=schema
        )

    # Wrap the conversation in a one-row frame and convert it with the existing
    # table's schema so the two tables can be concatenated.
    new_df = pd.DataFrame({"conversations": [new_conversation]})
    new_table = pa.Table.from_pandas(new_df, schema=existing_table.schema)

    # Append the new row and write the full table back to the same file.
    combined_table = pa.concat_tables([existing_table, new_table])
    pq.write_table(combined_table, path)

# Interactive entry loop: collect a conversation, append it to the Parquet file,
# and continue only while the user keeps answering 'y'.
keep_going = True
while keep_going:
    conversation = get_conversation_entry()

    if conversation is not None:
        # Store the entry, then ask whether another one should follow.
        append_to_parquet(conversation)
        continue_input = input("Do you want to add another conversation? (y/n): ")
        keep_going = continue_input.lower() == 'y'
    else:
        # get_conversation_entry returns None when the loop should stop.
        keep_going = False

    if not keep_going:
        print("Exiting the conversation entry loop.")


Exiting the conversation entry loop.
