<a href="https://colab.research.google.com/github/asokraju/LangChainDatasetForge/blob/main/Datagen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import random

# Expanded Data Pools
#
# Each pool lists the candidate values for one field of a synthetic travel plan.


def _count_label(count, singular, plural):
    """Return "<count> <unit>", using `singular` only when `count` is exactly 1."""
    unit = singular if count == 1 else plural
    return f"{count} {unit}"


# Trip lengths: 1-30 days followed by 1-8 weeks. A count of one is rendered as
# "1 day" / "1 week" so the generated text stays grammatical.
DURATION_POOL = (
    [_count_label(i, "day", "days") for i in range(1, 31)]
    + [_count_label(i, "week", "weeks") for i in range(1, 9)]
)
DESTINATIONS_POOL = ["Paris", "New York", "Tokyo", "London", "Sydney", "Cairo", "Rio", "Cape Town", "Moscow", "Beijing"]
ACTIVITIES_POOL = ["sightseeing", "trekking", "culinary experiences", "museum visits", "beach relaxation", "mountain climbing"]
# Budgets from $1000 to $20000 in $1000 steps.
BUDGET_POOL = [f"${i}000" for i in range(1, 21)]
ACCOMMODATION_POOL = ["hotel", "hostel", "B&B", "luxury resort", "rented apartment"]
TRANSPORTATION_POOL = ["bus", "train", "flight", "rented car", "bicycle"]

# Additional Elements

# Party sizes from 1 to 10 ("1 person", then "2 persons" ... "10 persons").
TRAVELERS_POOL = [_count_label(i, "person", "persons") for i in range(1, 11)]
SEASON_POOL = ["spring", "summer", "fall", "winter"]
MEAL_PREF_POOL = ["local cuisine", "vegetarian meals", "vegan options", "seafood delights", "fast food"]
TRAVEL_TYPE_POOL = ["solo", "couple", "family", "group", "business", "backpacking"]
BOOKING_MODE_POOL = ["travel agency", "online platform", "direct booking", "last minute deals"]
GUIDE_PREF_POOL = ["guided tours", "self-exploration", "audio guide", "local guide", "group tour", "private tour"]
LANG_PREF_POOL = ["English", "French", "Spanish", "local language", "multilingual"]
CULTURAL_INTEREST_POOL = ["historical sites", "modern attractions", "folk performances", "art galleries", "music concerts"]

def generate_travel_plan(rng=None):
    """Build one randomly composed, comma-separated travel-plan string.

    A value is drawn for every field from the module-level ``*_POOL`` lists,
    then a random subset of 6 to 10 fields is kept and rendered as
    ``"Key: value, Key: value, ... ."`` in random field order.

    Args:
        rng: Optional source of randomness exposing ``choice``, ``sample`` and
            ``randint`` (for example ``random.Random(seed)``). When omitted,
            the global ``random`` module is used, as before. Passing a seeded
            generator makes the produced plan reproducible.

    Returns:
        The travel plan as a single string terminated by a period.
    """
    source = random if rng is None else rng

    def pick_one(pool):
        # Single value for a field.
        return source.choice(pool)

    def pick_some(pool):
        # One to three distinct values, joined into one comma-separated field.
        return ", ".join(source.sample(pool, source.randint(1, 3)))

    # Values are drawn in a fixed field order so a seeded generator always
    # yields the same plan.
    elements = {
        "Duration": pick_one(DURATION_POOL),
        "Destinations": pick_some(DESTINATIONS_POOL),
        "Activities": pick_some(ACTIVITIES_POOL),
        "Budget": pick_one(BUDGET_POOL),
        "Accommodation": pick_one(ACCOMMODATION_POOL),
        "Transportation": pick_some(TRANSPORTATION_POOL),
        "Travelers": pick_one(TRAVELERS_POOL),
        "Season": pick_one(SEASON_POOL),
        "Meal Preference": pick_one(MEAL_PREF_POOL),
        "Travel Type": pick_one(TRAVEL_TYPE_POOL),
        "Booking Mode": pick_one(BOOKING_MODE_POOL),
        "Guide Preference": pick_one(GUIDE_PREF_POOL),
        "Language Preference": pick_one(LANG_PREF_POOL),
        "Cultural Interest": pick_one(CULTURAL_INTEREST_POOL),
    }

    # Drop some keys for variability: keep between 6 and 10 of the 14 fields.
    num_elements_to_use = source.randint(6, 10)
    keys_to_use = source.sample(list(elements), num_elements_to_use)

    # Construct the travel plan.
    return ", ".join(f"{key}: {elements[key]}" for key in keys_to_use) + "."

# Build the synthetic corpus: 100k structured travel plans

data_points = 100_000
structured_data = [generate_travel_plan() for _unused in range(data_points)]

# Preview: show the first 5 plans, each followed by a separator line
for entry in structured_data[:5]:
    print(entry, "---------------------", sep="\n")

# Persist every plan to a text file, writing the separator line after each one
with open("structured_data.txt", "w") as f:
    for entry in structured_data:
        print(entry, file=f)
        print("---------------------", file=f)


In [0]:
!pip install langchain
!pip install tqdm
!pip install OpenAI

In [0]:
import os
import openai
from getpass import getpass


def _load_secret(name, required=True):
    """Return the secret held in environment variable `name`.

    If the variable is unset or empty, the user is prompted for it without
    echo and the entered value is stored in `os.environ[name]` for libraries
    that read it from there. A blank answer raises `RuntimeError` when
    `required` is true; otherwise the variable is left unset and "" is
    returned.
    """
    value = os.environ.get(name, "")
    if not value:
        value = getpass(f"Enter {name}: ").strip()
        if value:
            os.environ[name] = value
        elif required:
            raise RuntimeError(f"{name} is required but was not provided")
    return value


# Secrets must never be written into the notebook. Any key that was ever
# committed in plain text should be treated as compromised and rotated.
openai.api_type = "azure"
openai.api_base = "https://kf-llm.openai.azure.com/"  # Your Azure OpenAI resource's endpoint value.
openai.api_key = _load_secret("OPENAI_API_KEY")
os.environ["OPENAI_API_VERSION"] = "2023-03-15-preview"
# The SerpAPI key is only exported to the environment; leave the prompt blank to skip it.
_load_secret("SERPAPI_API_KEY", required=False)


In [0]:
import pandas as pd

In [0]:
import numpy as np

# Load the saved plans back from disk as a list of raw lines
with open("structured_data.txt", "r") as file:
    lines = list(file)

# Re-join the text, cut it at every separator, and keep only non-blank chunks
stripped_chunks = (chunk.strip() for chunk in "".join(lines).split("---------------------"))
entries = [chunk for chunk in stripped_chunks if chunk]

# Wrap the plans in a NumPy array
structured_data_array = np.array(entries)

# Preview the first five plans
print(structured_data_array[:5])

In [0]:
import random

def pick_style():
    """Return the name of one writing style, chosen uniformly at random."""
    return random.choice((
        "Narrative",
        "Persuasive",
        "Expository",
        "Journalistic",
        "Satirical",
        "Stream-of-Consciousness",
        "Epistolary",
        "Conversational",
        "Didactic",
        "Slang or Colloquial",
    ))

chosen_style = pick_style()
# print(f"Randomly selected style: {chosen_style}")


# Pair every structured plan with its own independently drawn writing style
plan_list = [
    dict(user_plan=plan, style=pick_style())
    for plan in structured_data_array
]
plan_list[0]

In [0]:


from langchain import PromptTemplate

# Instruction text sent to the model; {user_plan} and {style} are filled in per request
demo_template = """
I want you to come up with a unstructered text for the following plan: {user_plan}.
Use the writing style: {style}
for example: I'm thinking of a solo trip for about 2 weeks, primarily focusing on sightseeing and museum visits in Paris. I have a budget of around $5000.
"""

prompt = PromptTemplate(
    template=demo_template,
    input_variables=["user_plan", "style"],
)

# Render one filled-in prompt for the first plan to check the wording
prompt.format(user_plan=structured_data_array[0], style=pick_style())




In [0]:
# from langchain.llms import OpenAI
from langchain.chains import LLMChain
from langchain.llms import AzureOpenAI

# llm=OpenAI(temperature=0.5)

# Completion model accessed through the Azure OpenAI deployment named below
llm = AzureOpenAI(
    deployment_name="kf-gpt",
    model_name="gpt-35-turbo",
    temperature=0.3,
)

# Chain that fills `prompt` with a plan/style pair and sends it to `llm`
chain = LLMChain(prompt=prompt, llm=llm)

In [0]:
from tqdm import tqdm

# Run the chain over the first 10 plan/style pairs, collecting the generated text in order
summary = []
summary.extend(
    chain.run(plan_item)
    for plan_item in tqdm(plan_list[:10], desc='Processing topics')
)

In [0]:
# Show each structured plan next to its generated text; zip stops at the shorter sequence
for struct, unstruct in zip(structured_data_array, summary):
    print(struct, unstruct, "-----------------------", sep="\n")

In [0]:

# One record per (structured plan, generated text) pair
topic_list = [
    dict(user_plan=plan_text, user_input=generated_text)
    for plan_text, generated_text in zip(structured_data_array, summary)
]

# Tabulate the records
df = pd.DataFrame(topic_list)

# Write the table to CSV without the index column, then preview it
df.to_csv('dataset.csv', encoding='utf-8', index=False)
df.head()

In [0]:
# Display the first record (a dict with "user_plan" and "user_input" keys) as a spot check
topic_list[0]

# extra

In [0]:
import numpy as np

# Re-read the saved plans from disk as raw lines
with open("structured_data.txt", "r") as file:
    lines = file.readlines()

# Cut the joined text at each separator, trim whitespace, and discard empty pieces
pieces = "".join(lines).split("---------------------")
entries = list(filter(None, map(str.strip, pieces)))

# Wrap the plans in a NumPy array
structured_data_array = np.array(entries)

# Preview the first five plans
print(structured_data_array[:5])
