In [1]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from dotenv import load_dotenv
import os
load_dotenv()

True

In [2]:
from langchain.chains import ConversationChain
from langchain.memory import  ConversationBufferWindowMemory
from langchain_core.prompts.prompt import PromptTemplate

In [10]:
from langchain_core.prompts import ChatPromptTemplate

from langchain_core.output_parsers import StrOutputParser
from langchain_core.output_parsers import JsonOutputParser

In [4]:
# setting up groq api key
os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")

# chat set up
GROQ_LLM = ChatGroq(temperature=0.5, model_name="llama3-8b-8192")

## Basic Chains

- 1. Chat Chain
- 2. Chat Reviewer
- 3. Requirement Extractor
- 4. Problem Search
- 5. Code Generation
- 6. Code and Requirement Reviewer

## Other Functions
- 1. Code Saver
- 2. Class Generator
- 3. Code implementation and data saver.  

### Chat Chain

In [8]:
# Now we can override it and set it to "AI Assistant"

chat_template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a Data Science Consultant. You have over 20 years of experience in the field. 
You are currently working with a client to create a synthetic data for his project. 
The data is a real world based daa exhibits the same characteristics as the real data.

Initially you don't know anything about the project, that's why you want to ask the client some questions to understand the data requirements.
This is the work flow you have been following:
    1. Initiate a besutiful and interactive sessons with the user.
    2. Ask questions about the project the client is working on.
    3. Ask for the purpose of this data in relation to the project.
    4. Ask for possible columns and the data types. You can also suggest based on the project.
    5. After for the intended data size. You can also suggest based on the project. The client can also specify the size.
    6. Collate all the information and present back to the client for review.
    7. Review thw generated code with the client requirements.
    8. After User has verified that all the requirements are met, respond with - ALL REQUIREMENTS RECORDED.

<|eot_id|><|start_header_id|>user<|end_header_id|>
Current conversation:
{history}
Human: {input}
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
AI Assistant:"""

PROMPT = PromptTemplate(input_variables=["history", "input"], template=chat_template)
chat_conversation = ConversationChain(
    prompt=PROMPT,
    llm=GROQ_LLM,
    verbose=True,
    memory=ConversationBufferWindowMemory(k=10, ai_prefix="AI Assistant"),
    output_parser=StrOutputParser(),
)

In [9]:
chat_conversation.predict(input="Hi there!")



[1m> Entering new ConversationChain chain...[0m
Prompt after formatting:
[32;1m[1;3m<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a Data Science Consultant. You have over 20 years of experience in the field. 
You are currently working with a client to create a synthetic data for his project. 
The data is a real world based daa exhibits the same characteristics as the real data.

Initially you don't know anything about the project, that's why you want to ask the client some questions to understand the data requirements.
This is the work flow you have been following:
    1. Initiate a besutiful and interactive sessons with the user.
    2. Ask questions about the project the client is working on.
    3. Ask for the purpose of this data in relation to the project.
    4. Ask for possible columns and the data types. You can also suggest based on the project.
    5. After for the intended data size. You can also suggest based on the project. The client can also s

"Hello! It's great to meet you! I'm your Data Science Consultant, and I'll be helping you with creating synthetic data for your project. I'd love to learn more about your project and understand your requirements.\n\nCan you please tell me a bit about the project you're working on? What's the main goal or objective of your project?"

### Chat Reviewer

In [123]:
## Rewrite Router
def chat_reviewer(chat_history: list) -> str:
    chat_reviewer = PromptTemplate(
        template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
        You are an expert at evaluating the conversation between a USER and an AI. \n
        You are checking to see if the AI has already understood the user requirements and if the user approves of it too. \n

        Ask yourself the following questions: \n\n

        Has the user mentioned all of his requirements?
        Has the AI understood the requirements correctly?
        Has the AI presented the requirements back to the user for verification?
        Has the user approved of the requirements?

        Give a binary choice 'understood' (if the requirements has been met)) or 'not_understood' (if the ai still needs more information) based on the chat conversation.
        Return only the binary choice and no premable or explaination. \
        Be Sure of your decision before you submit. \n
        <|eot_id|><|start_header_id|>user<|end_header_id|>
        CHAT_HISTORY: {chat_history} \n
        <|eot_id|><|start_header_id|>assistant<|end_header_id|>""",
        input_variables=["chat_history"],
    )

    chat_review_router = chat_reviewer | GROQ_LLM | StrOutputParser()
    output = chat_review_router.invoke({"chat_history":chat_history})
    return output


In [124]:
CHAT_HISTORY = [{
    "messages": [
        {"user": "USER", "text": "Hi, how are you today?", "timestamp": "2023-02-02T12:00:00"},
        {"ai": "AI", "text": "Hello! I'm doing well, thank you for asking. How can I assist you?", "timestamp": "2023-02-02T12:00:05"},
        {"user": "USER", "text": "I need to book a flight from New York to Los Angeles on a specific date.", "timestamp": "2023-02-02T12:10:00"},
        {"ai": "AI", "text": "I'd be happy to help you with that! What is the specific date you're looking to travel?", "timestamp": "2023-02-02T12:11:00"},
        {"user": "USER", "text": "February 14th.", "timestamp": "2023-02-02T12:12:00"},
        {"ai": "AI", "text": "Let me check availability. Can you please confirm that you'd like to travel on February 14th from New York to Los Angeles?", "timestamp": "2023-02-02T12:13:00"},
        {"user": "USER", "text": "Yes, that's correct.", "timestamp": "2023-02-02T12:14:00"},
        {"ai": "AI", "text": "I've found a suitable flight for you. Would you like to book it?", "timestamp": "2023-02-02T12:15:00"}
    ]
}]

In [125]:
print(chat_reviewer(CHAT_HISTORY))

understood


### Requirement Extraction

In [33]:
## Requirement Extractor
def requirement_extractor(chat_history: list, chat_review: str) -> dict:
    requirement_extractor_prompt = PromptTemplate(
        template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
        You are the Coding Requirement Extraction Agent take the CHAT_HISTORY of a conversation between the USER and AI, the CHAT_REVIEW  \
                which tells if the ai understands the user requirment \
                If the CHAT_REVIEW is 'understood' then extract the code requirement from the chat_history. \n
                
                Extract a code requirement in this format:
                    1. Problem statement: The problem that the user is trying to solve.
                    2. Column names: The columns that the user wants in the data.
                    3. Data types: The data types for each column.
                    4. Data size: The intended size of the data. This should be the number of rows.
                    5. Data Variability: The possible variability in the data as per the user's requirement. This is optional.

                
                You never make up information that hasn't been provided by the CHAT_HISTORY. \n
                Return the code requirement as a JSON with the keys 'problem_statement', 'column_names', 'data_types', 'data_size', and 'data_variability' if provided. \n
                If the CHAT_REVIEW is 'not_understood' then return 'Nothing-To-Extract' as JSON with a single key 'no_extraction' and no premable or explaination. \n

                
        <|eot_id|><|start_header_id|>user<|end_header_id|>
        CHAT_HISTORY: {chat_history} \n
        CHAT_REVIEW: {chat_review} \n
        <|eot_id|><|start_header_id|>assistant<|end_header_id|>""",
        input_variables=["chat_history","chat_review"],
    )

    requirement_extraction_chain = requirement_extractor_prompt | GROQ_LLM | JsonOutputParser()
    output = requirement_extraction_chain.invoke({"chat_history":chat_history, "chat_review":chat_review})
    return output
requirement_extractor_prompt = PromptTemplate(
    template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
    You are the Coding Requirement Extraction Agent take the CHAT_HISTORY of a conversation between the USER and AI, the CHAT_REVIEW  \
            which tells if the ai understands the user requirment \
            If the CHAT_REVIEW is 'understood' then extract the code requirement from the chat_history. \n
            
            Extract a code requirement in this format:
                1. Problem statement: The problem that the user is trying to solve.
                2. Column names: The columns that the user wants in the data.
                3. Data types: The data types for each column.
                4. Data size: The intended size of the data. This should be the number of rows.
                5. Data Variability: The possible variability in the data as per the user's requirement. This is optional.

            
            You never make up information that hasn't been provided by the CHAT_HISTORY. \n
            Return the code requirement as a JSON with the keys 'problem_statement', 'column_names', 'data_types', 'data_size', and 'data_variability' if provided. \n
            If the CHAT_REVIEW is 'not_understood' then return 'Nothing-To-Extract' as JSON with a single key 'no_extraction' and no premable or explaination. \n

            
    <|eot_id|><|start_header_id|>user<|end_header_id|>
    CHAT_HISTORY: {chat_history} \n
    CHAT_REVIEW: {chat_review} \n
    <|eot_id|><|start_header_id|>assistant<|end_header_id|>""",
    input_variables=["chat_history","chat_review"],
)

requirement_extraction_chain = requirement_extractor_prompt | GROQ_LLM | JsonOutputParser()

In [48]:
CHAT_HISTORY =  {
    "messages": [
        {"user": "USER", "text": "Hi, I'm trying to create a dataset for sales in a city. I'm looking to analyze the sales of fashion products in New York City. I want to track the sales of specific products to understand their market performance.", "timestamp": "2023-02-10T14:25:00"},
        {"ai": "AI", "text": "Sounds like an interesting project! Can you please tell me more about what you're trying to achieve? What specific problem are you trying to solve?", "timestamp": "2023-02-10T14:26:00"},
        {"user": "USER", "text": "The problem I'm trying to solve is to analyze the sales trends of blouses, tops, and dresses in New York City. I want to understand which products are most popular and identify any seasonal patterns.", "timestamp": "2023-02-10T14:27:00"},
        {"ai": "AI", "text": "Got it! So, you're looking to analyze sales data for blouses, tops, and dresses in New York City. Can you please list the columns you'd like to include in the dataset?", "timestamp": "2023-02-10T14:28:00"},
        {"user": "USER", "text": "I'd like to include columns for date, product category (blouses, tops, dresses), product sub-category (e.g. long-sleeve blouses), and sales amount.", "timestamp": "2023-02-10T14:30:00"},
        {"ai": "AI", "text": "That makes sense! What data types do you expect for each column? For example, would the sales amount be a numerical value?", "timestamp": "2023-02-10T14:31:00"},
        {"user": "USER", "text": "Yes, the sales amount would be a numerical value. And the date would be a date type. The product categories would be categorical.", "timestamp": "2023-02-10T14:33:00"},
        {"ai": "AI", "text": "Got it! For the sales amount, I assume you want a numerical value, such as a float or integer. For the date, I assume it would be a date data type. And for the product categories, I assume it would be a categorical value, such as strings or categorical data types like enum or categorical.", "timestamp": "2023-02-10T14:34:00"},
        {"user": "USER", "text": "Yes, that's correct! I'd also like to know how many rows of data you'd like to generate.", "timestamp": "2023-02-10T14:35:00"},
        {"ai": "AI", "text": "I can generate a dataset with a specified number of rows. How many rows would you like the dataset to have?", "timestamp": "2023-02-10T14:36:00"},
        {"user": "USER", "text": "I'd like the dataset to have 10,000 rows.", "timestamp": "2023-02-10T14:37:00"},
        {"ai": "AI", "text": "Okay, I'll generate a dataset with 10,000 rows. Would you like to specify any variability in the data?", "timestamp": "2023-02-10T14:38:00"},
        {"user": "USER", "text": "No, I don't need any variability in the data.", "timestamp": "2023-02-10T14:39:00"},
        {"ai": "AI", "text": "Alright, I've generated the dataset with the specified columns, data types, and number of rows. I've excluded variability to ensure consistent data.", "timestamp": "2023-02-10T14:40:00"}
    ]
}


In [51]:
chat_review_ = chat_review_router.invoke({"chat_history": CHAT_HISTORY})

print(chat_review_)

not_understood


In [52]:
print(requirement_extraction_chain.invoke({"chat_history": CHAT_HISTORY, "chat_review":chat_review_}))

{'no_extraction': 'Nothing-To-Extract'}


### Code Generator

In [100]:
## Draft Code
code_generator_prompt = PromptTemplate(
    template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
    You are the Data Science Coding Agent. You read the CHAT_HISTORY below, the conversation between the USER and AI. \
    , the CHAT_REVIEW that reviews the chat and tells if the AI understands the USER requirements, \
    the Extracted Requirements from REQUIREMENT_EXTRACTOR.

    Here is your task: \n

        1. You write excellent codes for the client based on the Extracted Requirements. \n
        2. Analyze the Extracted Requirements and make sure the code meets the requirements. \n
        3. You can use Pandas, Numpy, or any other library you think is necessary. \n
        4. You are to always ensure that the code is well-documented and easy to understand. \n
        5. Your priority is to write clean, efficient, and effective code that meets the client's requirements. \n
        6. Always check the datatypes compatibility to avoid errors. \n Use the libraries effectively to generate the data. \n
        7. You are to return the code as a string and no premable or explanation. \n

    Other Informations: \n
        You never make up information that hasn't been provided by the CHAT_HISTORY, CHAT_REVIEW, or REQUIREMENT_EXTRACTOR. \n
        The CHAT_REVIEW is either 'understood' or 'not_understood'. If CHAT_REVIEW is 'understood', \
        the Extracted Requirements will be in the format of a JSON with the keys 'problem_statement', 'column_names', 'data_types', 'data_size', and 'data_variability' if provided. \

        If the CHAT_REVIEW is 'not_understood' then the Extracted Requirements will be 'Nothing-To-Extract' or 'True' as JSON with a single key 'no_extraction'. \
        So when we have 'Nothing-To-Extract' then we don't have any requirements to work with. \
        Else you return 'no-code generated' as a string and no premable or explanation. \


    <|eot_id|><|start_header_id|>user<|end_header_id|>
    CHAT_HISTORY: {chat_history} \n\n
    CHAT_REVIEW: {chat_review} \n\n
    REQUIREMENT_EXTRACTOR: {extracted_requirements} \n\n

    <|eot_id|><|start_header_id|>assistant<|end_header_id|>""",
    input_variables=["chat_history","chat_review","extracted_requirements"],
)

code_generator_chain = code_generator_prompt | GROQ_LLM | StrOutputParser()

In [101]:
chat_review_ =  chat_review_router.invoke({"chat_history": CHAT_HISTORY})
extracted_req = requirement_extraction_chain.invoke({"chat_history": CHAT_HISTORY, "chat_review":chat_review_})
print(chat_review_)
print(extracted_req)

understood
{'problem_statement': 'Analyze the sales trends of blouses, tops, and dresses in New York City', 'column_names': ['date', 'product_category', 'product_sub_category', 'sales_amount'], 'data_types': {'date': 'date', 'product_category': 'categorical', 'product_sub_category': 'categorical', 'sales_amount': 'numerical'}, 'data_size': 10000, 'data_variability': 'no variability'}


In [102]:
code_output = code_generator_chain.invoke({"chat_history": CHAT_HISTORY, "chat_review":chat_review_, "extracted_requirements":extracted_req})

In [103]:
print(code_output)

Here is the code based on the extracted requirements:

```
import pandas as pd
import numpy as np
import datetime as dt

# Define the column names and their data types
column_names = ['date', 'product_category', 'product_sub_category', 'sales_amount']
data_types = {'date': 'datetime64[D]', 'product_category': 'category', 'product_sub_category': 'category', 'sales_amount': 'float'}

# Define the data size
data_size = 10000

# Generate the dataset
data = {
    'date': [dt.datetime(2023, 1, 1) + dt.timedelta(days=i) for i in range(data_size)],
    'product_category': np.random.choice(['blouses', 'tops', 'dresses'], size=data_size),
    'product_sub_category': np.random.choice(['long-sleeve blouses', 'short-sleeve blouses', 'tunics', 't-shirts', 'dresses'], size=data_size),
    'sales_amount': np.random.randint(1, 100, size=data_size)
}

# Convert the data to a pandas DataFrame
df = pd.DataFrame(data)

# Print the generated dataset
print(df)
```


### Code Rewriter

In [104]:
## Rewrite code
code_rewriter_prompt = PromptTemplate(
    template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
    You are the Data Science Coding Agent. You read the GENERATED_CODE, and the Extracted Requirements from REQUIREMENT_EXTRACTOR. \n

    Here is your task: \n

      1. Extract the coding part in the GENERATED_CODE. \n
      2. You compare this with the Extracted Requirements, to ensure that the code meets the requirements. \n
      3. You are to check the code for any errors, inconsistencies, or missing requirements. \n
      4. You make a deep search and simulate the code to ensure that it works as expected. For every method used you check to make sure it is correct\n
      5. You are rewriting the code with a 100 percent correctness. \n
      6. You are to return the code as a string and no premable or explanation. \n
      7. You are to ONLY RETURN THE CORRECTED/REWRITTEN CODE. \n 


  Make sure you are 100 percent sure of your corrections before you submit. \n

    <|eot_id|><|start_header_id|>user<|end_header_id|>
    GENERATED_CODE: {generated_code} \n\n
    REQUIREMENT_EXTRACTOR: {extracted_requirements} \n\n

    <|eot_id|><|start_header_id|>assistant<|end_header_id|>""",
    input_variables=["generated_code","extracted_requirements"],
)

code_rewriter_chain = code_rewriter_prompt | GROQ_LLM | StrOutputParser()

In [105]:
rewritten_code = code_rewriter_chain.invoke({"generated_code": code_output, "extracted_requirements":extracted_req})

In [106]:
print(rewritten_code)

```
import pandas as pd
import numpy as np
import datetime as dt

# Define the column names and their data types
column_names = ['date', 'product_category', 'product_sub_category', 'sales_amount']
data_types = {'date': 'datetime64[D]', 'product_category': 'category', 'product_sub_category': 'category', 'sales_amount': 'float64'}

# Define the data size
data_size = 10000

# Generate the dataset
data = {
    'date': [dt.datetime(2023, 1, 1) + dt.timedelta(days=i) for i in range(data_size)],
    'product_category': np.random.choice(['blouses', 'tops', 'dresses'], size=data_size),
    'product_sub_category': np.random.choice(['long-sleeve blouses', 'short-sleeve blouses', 'tunics', 't-shirts', 'dresses'], size=data_size),
    'sales_amount': np.random.randint(1, 100, size=data_size)
}

# Convert the data to a pandas DataFrame
df = pd.DataFrame(data)

# Print the generated dataset
print(df)
```


In [108]:
import pandas as pd
import numpy as np
import datetime as dt

# Define the column names and their data types
column_names = ['date', 'product_category', 'product_sub_category', 'sales_amount']
data_types = {'date': 'datetime64[D]', 'product_category': 'category', 'product_sub_category': 'category', 'sales_amount': 'float64'}

# Define the data size
data_size = 10000

# Generate the dataset
data = {
    'date': [dt.datetime(2023, 1, 1) + dt.timedelta(days=i) for i in range(data_size)],
    'product_category': np.random.choice(['blouses', 'tops', 'dresses'], size=data_size),
    'product_sub_category': np.random.choice(['long-sleeve blouses', 'short-sleeve blouses', 'tunics', 't-shirts', 'dresses'], size=data_size),
    'sales_amount': np.random.randint(1, 100, size=data_size)
}

# Convert the data to a pandas DataFrame
df = pd.DataFrame(data)

# Print the generated dataset
df

Unnamed: 0,date,product_category,product_sub_category,sales_amount
0,2023-01-01,blouses,long-sleeve blouses,96
1,2023-01-02,blouses,dresses,52
2,2023-01-03,tops,short-sleeve blouses,78
3,2023-01-04,blouses,dresses,93
4,2023-01-05,dresses,t-shirts,98
...,...,...,...,...
9995,2050-05-14,blouses,short-sleeve blouses,43
9996,2050-05-15,tops,short-sleeve blouses,48
9997,2050-05-16,blouses,t-shirts,63
9998,2050-05-17,blouses,tunics,22


### Code Extractor

In [109]:
## Rewrite code
code_extractor_prompt = PromptTemplate(
    template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
    You are a Code Extractor Agent. \n
    You take in codes that might have some texts added to it. \n

    Only extract the code and return it as the only output. \n

    Return only the code and nothing added to it.
    <|eot_id|><|start_header_id|>user<|end_header_id|>
    CODE: {generated_code} \n\n

    <|eot_id|><|start_header_id|>assistant<|end_header_id|>""",
    input_variables=["generated_code"],
)

code_extractor_chain = code_extractor_prompt | GROQ_LLM | StrOutputParser()

In [110]:
extracted_code = code_extractor_chain.invoke({"generated_code": rewritten_code})

In [112]:
print(extracted_code)

```
import pandas as pd
import numpy as np
import datetime as dt

column_names = ['date', 'product_category', 'product_sub_category', 'sales_amount']
data_types = {'date': 'datetime64[D]', 'product_category': 'category', 'product_sub_category': 'category', 'sales_amount': 'float64'}

data_size = 10000

data = {
    'date': [dt.datetime(2023, 1, 1) + dt.timedelta(days=i) for i in range(data_size)],
    'product_category': np.random.choice(['blouses', 'tops', 'dresses'], size=data_size),
    'product_sub_category': np.random.choice(['long-sleeve blouses', 'short-sleeve blouses', 'tunics', 't-shirts', 'dresses'], size=data_size),
    'sales_amount': np.random.randint(1, 100, size=data_size)
}

df = pd.DataFrame(data)

print(df)


### Code Modifier

In [113]:
## Rewrite code
code_modifier_prompt = PromptTemplate(
    template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
    You are a Coding Agent. You receive from CODES texts that contain a full code that creates a synthetic data for a client. \n
    Your job is to convert this code into a high level Class that has a crate function whereby when called the data is created.
    You are returning a high level class that has a create function that creates the data.
    This is how you should structure the class:
      import the necessary libraries
       classname:   SyntheticDataGenerator
         method:      create - creates the synthetic data
         method: save_to_csv - saves the data to a csv file
         method: generate_and_save_data - generates and saves the data to a csv file

    Return just the code, no extra text.
    <|eot_id|><|start_header_id|>user<|end_header_id|>
    CODEs: {generated_code} \n\n

    <|eot_id|><|start_header_id|>assistant<|end_header_id|>""",
    input_variables=["generated_code"],
)

code_modifier_chain = code_modifier_prompt | GROQ_LLM | StrOutputParser()

In [114]:
modified_code = code_modifier_chain.invoke({"generated_code": extracted_code})

In [116]:
print(modified_code)

```
import pandas as pd
import numpy as np
import datetime as dt

class SyntheticDataGenerator:
    def __init__(self):
        self.column_names = ['date', 'product_category', 'product_sub_category', 'sales_amount']
        self.data_types = {'date': 'datetime64[D]', 'product_category': 'category', 'product_sub_category': 'category', 'sales_amount': 'float64'}

    def create(self):
        data_size = 10000
        data = {
            'date': [dt.datetime(2023, 1, 1) + dt.timedelta(days=i) for i in range(data_size)],
            'product_category': np.random.choice(['blouses', 'tops', 'dresses'], size=data_size),
            'product_sub_category': np.random.choice(['long-sleeve blouses', 'short-sleeve blouses', 'tunics', 't-shirts', 'dresses'], size=data_size),
            'sales_amount': np.random.randint(1, 100, size=data_size)
        }
        return pd.DataFrame(data)

    def save_to_csv(self, df, filename):
        df.to_csv(filename, index=False)

    def generate_and_save_

In [43]:
import pandas as pd
from faker import Faker
from datetime import date, timedelta

# Create a Faker instance
fake = Faker()

# Set the number of rows
data_size = 10000

# Create a list to store the data
data = []

# Iterate through the number of rows
for i in range(data_size):
    # Generate the date
    date_obj = date.fromtimestamp(int(fake.random_int(min=1643728000, max=1646324800)))
    date_str = date_obj.strftime('%Y-%m-%d')

    # Generate the product category (blouses, tops, dresses)
    product_categories = ['blouses', 'tops', 'dresses']
    product_category = fake.random_element(elements=product_categories)

    # Generate the product sub-category (e.g. long-sleeve blouses)
    product_sub_categories = ['long-sleeve blouses', 'short-sleeve blouses', 'tank tops']
    product_sub_category = fake.random_element(elements=product_sub_categories)

    # Generate the sales amount
    sales_amount = fake.random_int(min=10, max=1000)

    # Append the data to the list
    data.append({
        'date': date_str,
        'product_category': product_category,
        'product_sub_category': product_sub_category,
        'sales_amount': sales_amount
    })

# Convert the list to a Pandas DataFrame
df = pd.DataFrame(data)

In [44]:
df

Unnamed: 0,date,product_category,product_sub_category,sales_amount
0,2022-02-05,dresses,short-sleeve blouses,812
1,2022-02-11,dresses,long-sleeve blouses,133
2,2022-02-11,blouses,short-sleeve blouses,833
3,2022-03-03,dresses,tank tops,737
4,2022-02-17,dresses,tank tops,116
...,...,...,...,...
9995,2022-03-02,dresses,tank tops,304
9996,2022-02-25,tops,tank tops,75
9997,2022-02-18,dresses,tank tops,245
9998,2022-02-22,dresses,long-sleeve blouses,626
