In [1]:
import os
import random
import pickle
from openai import OpenAI
from langchain.schema import Document
import os

# Reading the key with [] makes the cell fail fast (KeyError) when
# OPENAI_API_KEY is not set, before any client object is constructed.
openai_api_key = os.environ['OPENAI_API_KEY']

from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings

# Embedding model handed to Chroma when the vector store is built.
# Constructed without an explicit key (presumably it picks up OPENAI_API_KEY
# from the environment — confirm against the library docs).
embedding = OpenAIEmbeddings()

# Directory where the Chroma vector store is persisted.
# Previously a hard-coded, user-specific absolute path; it can now be set with
# the CHROMA_PERSIST_DIR environment variable and otherwise defaults to a
# "chroma_db" folder under the current working directory, so the notebook runs
# on any machine. Set CHROMA_PERSIST_DIR to reuse an existing store elsewhere.
persist_directory = os.environ.get(
    'CHROMA_PERSIST_DIR',
    os.path.join(os.getcwd(), 'chroma_db'),
)

# Initialize OpenAI client with API key from environment variable
client = OpenAI()
# Word pools for random company names: generate_random_company_name() joins
# one entry from `prefixes` with one entry from `suffixes`.
prefixes = [
    "Eco",
    "Green",
    "Future",
    "Net",
    "Pure",
    "Clean",
    "Bright",
    "Renew",
    "Next",
    "Smart",
]
suffixes = [
    "Tech",
    "Logistics",
    "Solutions",
    "Energy",
    "Agri",
    "Water",
    "Future",
    "Eco",
    "Systems",
    "Dynamics",
]

# Industries paired with a climate-friendly strategy. Each entry is a dict with
# the keys "industry" and "climate_strategy", in that order.
clean_industries = [
    {"industry": sector, "climate_strategy": approach}
    for sector, approach in (
        ("Technology", "Invest heavily in renewable energy and carbon offset projects."),
        ("Logistics", "Transition to an all-electric vehicle fleet and optimize routes for fuel efficiency."),
        ("Agriculture", "Adopt regenerative farming practices and use carbon sequestration methods."),
        ("Manufacturing", "Implement circular economy principles and reduce waste across production lines."),
        ("Energy", "Shift energy production to solar, wind, and hydro sources."),
        ("Construction", "Utilize sustainable materials and adopt energy-efficient building designs."),
        ("Textile", "Use recycled materials and switch to renewable energy sources in production."),
        ("Food & Beverage", "Minimize food waste and adopt sustainable sourcing for ingredients."),
        ("Automobile", "Invest in electric vehicle technology and offset carbon emissions through reforestation projects."),
        ("Water Management", "Optimize water use and reduce greenhouse gas emissions in water treatment processes."),
    )
]

# Industries paired with a climate-unfriendly strategy. Each entry is a dict
# with the keys "industry" and "climate_strategy", in that order.
dirty_industries = [
    {"industry": sector, "climate_strategy": approach}
    for sector, approach in (
        ("Technology", "Continuing to rely on data centers powered by fossil fuels, with minimal plans for energy efficiency."),
        ("Logistics", "Relying on diesel-powered trucks with little plan to electrify the fleet."),
        ("Agriculture", "Excessive use of fertilizers and pesticides, with no significant plans for sustainability."),
        ("Manufacturing", "Focus on cost-cutting rather than sustainability, with little investment in clean technologies."),
        ("Energy", "Heavy reliance on coal and natural gas with no substantial plans to shift to cleaner energy."),
        ("Construction", "Using non-sustainable materials and practices with minimal focus on reducing environmental impact."),
        ("Textile", "Continuing to rely on cheap, non-sustainable fabrics with little concern for environmental impact."),
        ("Food & Beverage", "Ignoring food waste reduction and sourcing ingredients from non-sustainable suppliers."),
        ("Automobile", "No plans to move toward electric vehicles, continuing to manufacture gas-powered cars."),
        ("Water Management", "No effort to reduce water usage or improve energy efficiency in water treatment processes."),
    )
]

# Generate random company name
def generate_random_company_name():
    """Return a name built from one random prefix followed by one random suffix.

    Both parts are drawn independently with ``random.choice`` from the
    module-level ``prefixes`` and ``suffixes`` lists (prefix first), so
    repeated calls can return the same name.
    """
    head = random.choice(prefixes)
    tail = random.choice(suffixes)
    return head + tail

# Generate financial statement based on a given prompt
def generate_financial_statement(prompt, model="gpt-3.5-turbo", temperature=0.5):
    """Send ``prompt`` to the chat-completions API and return the reply text.

    The request uses the module-level ``client`` and always starts with the
    system message "You are a financial expert.".

    Args:
        prompt: Text sent as the user message.
        model: Chat model name passed to the API. Defaults to
            "gpt-3.5-turbo", the value that used to be hard-coded.
        temperature: Sampling temperature passed to the API. Defaults to 0.5,
            the value that used to be hard-coded.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    response = client.chat.completions.create(
        model=model,
        temperature=temperature,
        messages=[
            {"role": "system", "content": "You are a financial expert."},
            {"role": "user", "content": prompt},
        ],
    )
    # Only the first choice is used; the request does not ask for more than one.
    return response.choices[0].message.content

# Generate financial statements for clean or dirty companies
def generate_company_financial_statements(company_name, industry, climate_strategy):
    """Build the statement-generation prompt for one company and submit it.

    Args:
        company_name: Name of the fictional company inserted into the prompt.
        industry: Industry the company operates in.
        climate_strategy: Strategy text appended to the net-zero instruction.

    Returns:
        Whatever ``generate_financial_statement`` returns for the prompt.
    """
    # NOTE(review): the prompt presents every climate_strategy as the way "to
    # achieve net-zero emissions by 2050", including the entries taken from
    # dirty_industries — confirm this framing is intended.
    # The leading newline, the four-space indents and the trailing indent are
    # part of the prompt text and are kept exactly.
    request = (
        "\n"
        f"    Generate synthetic financial statements for a fictional company named {company_name}, which operates in the {industry} industry.\n"
        "    Include an income statement, balance sheet, and cash flow statement with realistic financial figures.\n"
        f"    Additionally, describe the company's specific climate strategy to achieve net-zero emissions by 2050: {climate_strategy}.\n"
        "    "
    )
    return generate_financial_statement(request)

# Function to save the docs list to a file using pickle
def save_docs(docs, filename="company_docs.pkl"):
    """Pickle ``docs`` into ``filename`` and print a confirmation line.

    Args:
        docs: Object to serialize.
        filename: Destination path. It is opened in binary write mode, so an
            existing file at that path is overwritten.
    """
    with open(filename, mode="wb") as sink:
        pickle.dump(docs, sink)
    confirmation = f"Documents saved to {filename}"
    print(confirmation)

# Function to load the docs list from a file using pickle
def load_docs(filename="company_docs.pkl"):
    """Load a pickled object from ``filename``, or return ``[]`` if it is absent.

    Args:
        filename: Path of the pickle file to read.

    Returns:
        The unpickled object when the path exists, otherwise an empty list.
        A status line is printed in both cases.
    """
    if not os.path.exists(filename):
        print("No saved documents found.")
        return []

    # NOTE: unpickling can execute arbitrary code; only load files this
    # notebook wrote itself.
    with open(filename, mode="rb") as source:
        restored = pickle.load(source)
    print(f"Documents loaded from {filename}")
    return restored

# Draw a company name that has not been used yet in this run
def _draw_unused_company_name(used_names, max_attempts=50):
    """Return a generated company name not yet in ``used_names`` and record it.

    The name pool is small (prefixes x suffixes), so independent draws collide
    fairly often; a collision would give two different companies the same
    ``company_name`` metadata. Draws are retried up to ``max_attempts`` times;
    if every attempt collides, the last draw is returned anyway (best effort)
    so the caller never loops forever.
    """
    candidate = generate_random_company_name()
    attempts = 1
    while candidate in used_names and attempts < max_attempts:
        candidate = generate_random_company_name()
        attempts += 1
    used_names.add(candidate)
    return candidate


# Generate one labelled batch of company documents
def _generate_company_docs(count, industries, label, used_names):
    """Generate ``count`` Document objects tagged with ``label``.

    For each company a name is drawn, an entry is picked at random from
    ``industries``, and the generated statements become the page content.
    ``label`` is used both in the progress messages and as the "type" metadata.
    """
    batch = []
    for _ in range(count):
        company_name = _draw_unused_company_name(used_names)
        company_info = random.choice(industries)

        print(f"\nGenerating financial statements for {label} company {company_name}...\n")

        # Generate the financial statements
        financial_statements = generate_company_financial_statements(
            company_name, company_info["industry"], company_info["climate_strategy"]
        )

        # Wrap the text in a Document carrying the company name and its label
        batch.append(
            Document(
                page_content=financial_statements,
                metadata={"company_name": company_name, "type": label},
            )
        )

        print(f"Generated and added financial statements for {company_name} to docs list.")
        print("\n" + "="*50 + "\n")
    return batch


# Main function to create a set of clean and dirty companies and save to docs
def main(n_clean=5, n_dirty=5):
    """Generate clean and dirty company documents, save them, and print them back.

    Args:
        n_clean: Number of companies drawn from ``clean_industries``
            (default 5, the previously hard-coded count).
        n_dirty: Number of companies drawn from ``dirty_industries``
            (default 5, the previously hard-coded count).
    """
    used_names = set()  # Shared across both batches so names stay unique per run
    docs = []  # List to store LangChain Document objects

    docs.extend(_generate_company_docs(n_clean, clean_industries, "clean", used_names))
    docs.extend(_generate_company_docs(n_dirty, dirty_industries, "dirty", used_names))

    # Save docs to a file
    save_docs(docs)

    # Load the docs back from the file and print them as a round-trip check
    loaded_docs = load_docs()
    for doc in loaded_docs:
        print(f"Company: {doc.metadata['company_name']} ({doc.metadata['type']})")
        print(f"Content:\n{doc.page_content}\n{'='*50}\n")

# Run the generation pipeline when this code executes as the main module.
# NOTE(review): the recorded output shows this ran inside the notebook cell, so
# presumably every re-run of the cell repeats the API calls and overwrites the
# saved pickle — confirm that is intended.
if __name__ == "__main__":
    main()


  embedding = OpenAIEmbeddings()



Generating financial statements for clean company PureWater...

Generated and added financial statements for PureWater to docs list.



Generating financial statements for clean company PureLogistics...

Generated and added financial statements for PureLogistics to docs list.



Generating financial statements for clean company NextFuture...

Generated and added financial statements for NextFuture to docs list.



Generating financial statements for clean company EcoTech...

Generated and added financial statements for EcoTech to docs list.



Generating financial statements for clean company SmartSolutions...

Generated and added financial statements for SmartSolutions to docs list.



Generating financial statements for dirty company FutureWater...

Generated and added financial statements for FutureWater to docs list.



Generating financial statements for dirty company CleanSystems...

Generated and added financial statements for CleanSystems to docs list.



Generating financial 

In [2]:
# Reload the documents from load_docs()'s default file (company_docs.pkl).
docs = load_docs()

Documents loaded from company_docs.pkl


In [3]:
# Show the loaded documents; the whole list is rendered as the cell output.
docs

[Document(metadata={'company_name': 'PureWater', 'type': 'clean'}, page_content="**PureWater - Financial Statements**\n\n**Income Statement**\n```\nPureWater - Income Statement\nFor the Year Ended December 31, 20XX\n\nRevenue: $5,000,000\nCost of Goods Sold: $2,000,000\nGross Profit: $3,000,000\n\nOperating Expenses:\n   - Marketing and Sales: $500,000\n   - Research and Development: $300,000\n   - General and Administrative: $700,000\nTotal Operating Expenses: $1,500,000\n\nOperating Income: $1,500,000\n\nInterest Expense: $100,000\nNet Income Before Tax: $1,400,000\n\nIncome Tax Expense: $400,000\n\nNet Income: $1,000,000\n```\n\n**Balance Sheet**\n```\nPureWater - Balance Sheet\nAs of December 31, 20XX\n\nAssets:\n   - Cash: $500,000\n   - Accounts Receivable: $1,000,000\n   - Inventory: $700,000\n   - Property, Plant, and Equipment: $3,000,000\nTotal Assets: $5,200,000\n\nLiabilities:\n   - Accounts Payable: $600,000\n   - Long-Term Debt: $1,500,000\nTotal Liabilities: $2,100,000\n

In [4]:
# Split
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunks of at most 1500 with 150 shared between neighbouring chunks
# (presumably measured in characters, the splitter's default — confirm).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500, chunk_overlap=150)

In [5]:
# Break the loaded documents into chunks with text_splitter.
# NOTE: `docs` is rebound to search results in a later cell (In [9]); re-running
# this cell after that one would split those results instead.
splits = text_splitter.split_documents(docs)

In [6]:
# Show every chunk produced by the splitter as the cell output.
splits

[Document(metadata={'company_name': 'PureWater', 'type': 'clean'}, page_content='**PureWater - Financial Statements**\n\n**Income Statement**\n```\nPureWater - Income Statement\nFor the Year Ended December 31, 20XX\n\nRevenue: $5,000,000\nCost of Goods Sold: $2,000,000\nGross Profit: $3,000,000\n\nOperating Expenses:\n   - Marketing and Sales: $500,000\n   - Research and Development: $300,000\n   - General and Administrative: $700,000\nTotal Operating Expenses: $1,500,000\n\nOperating Income: $1,500,000\n\nInterest Expense: $100,000\nNet Income Before Tax: $1,400,000\n\nIncome Tax Expense: $400,000\n\nNet Income: $1,000,000\n```\n\n**Balance Sheet**\n```\nPureWater - Balance Sheet\nAs of December 31, 20XX\n\nAssets:\n   - Cash: $500,000\n   - Accounts Receivable: $1,000,000\n   - Inventory: $700,000\n   - Property, Plant, and Equipment: $3,000,000\nTotal Assets: $5,200,000\n\nLiabilities:\n   - Accounts Payable: $600,000\n   - Long-Term Debt: $1,500,000\nTotal Liabilities: $2,100,000\n

In [7]:
# Embed the chunks with `embedding` and build a Chroma store that is persisted
# under `persist_directory`.
# NOTE(review): presumably re-running this cell adds the same chunks to an
# already-persisted collection again — confirm, and clear the directory if so.
vectordb = Chroma.from_documents(documents=splits, embedding=embedding, persist_directory=persist_directory)

In [8]:
# Print how many entries the underlying Chroma collection holds.
# NOTE(review): `_collection` is a private attribute (leading underscore), so
# this may break across library versions.
print(vectordb._collection.count())

24


In [9]:
# Ask the vector store for the single best match (k=1) to the question.
# NOTE: this rebinds `docs`, which until now held the documents returned by
# load_docs(); cells that read `docs` behave differently if re-run after this.
question = "Who has the cleanest climate strategy"
docs = vectordb.similarity_search(question,k=1)

In [10]:
# Show the retrieved chunk(s).
# NOTE: in the recorded output the top hit carries the metadata type 'dirty'.
docs

[Document(metadata={'company_name': 'CleanSystems', 'type': 'dirty'}, page_content="**Climate Strategy for Achieving Net-Zero Emissions by 2050**\n\nCleanSystems' climate strategy to achieve net-zero emissions by 2050 involves a multi-faceted approach despite its reliance on diesel-powered trucks. The company plans to implement the following initiatives:\n\n1. **Investment in Carbon Offsetting:** CleanSystems will invest in carbon offset programs to compensate for the emissions produced by its diesel-powered trucks. This includes supporting renewable energy projects, reforestation efforts, and other initiatives to reduce the company's carbon footprint.\n\n2. **Efficiency Improvements:** The company will focus on improving the efficiency of its logistics operations to minimize fuel consumption and emissions. This includes optimizing delivery routes, reducing idle time, and implementing eco-driving practices among its drivers.\n\n3. **Transition to Sustainable Fuels:** While CleanSystems