In [None]:
!pip3 install edgartools

In [None]:
from edgar import *
from edgar.xbrl import *
from bs4 import BeautifulSoup
import requests
import os
import pandas as pd
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings

# Presumably registers the contact identity that the edgar package sends with
# its SEC requests; set_identity comes from one of the star-imports above.
set_identity("yiyi.xiao@barclays.com")
# Headers passed to requests.get() when the filing document is downloaded
# directly in download_filing_primary_document().
request_headers = {
    'User-Agent': 'Barclays yiyi.xiao@barclays.com',
    'Accept-Encoding': 'gzip, deflate',
    'Host': 'www.sec.gov'
}
# Directory where downloaded filing HTML is written and later read back.
# NOTE(review): absolute, user-specific path; the directory must already exist.
output_path = 'C:/Users/YXiao/Documents/Fin_Statements'
# NOTE(review): only displayed later in this notebook; the Chroma cell passes
# its own literal path instead of this variable -- confirm which is intended.
persist_directory = 'C:/Users/YXiao/Downloads/'



def download_filing_primary_document(company_ticker):
    """Download the primary document of a company's latest 10-K filing.

    Looks up the most recent 10-K for ``company_ticker``, fetches the filing's
    primary document over HTTP and, when the server answers with status 200,
    saves the raw bytes as
    ``{output_path}/{company_ticker}_filing_content.html``.

    Args:
        company_ticker: Ticker symbol passed to ``Company``.

    Returns:
        None. The filing is printed, then either the file is written and a
        confirmation is printed, or the non-200 status code is printed and
        nothing is written.
    """
    # Latest 10-K. To pin a filing period instead, pass e.g.
    # filing_date='2023-11-14:2024-11-13' to get_filings() and take .get(0).
    filing = Company(company_ticker).get_filings(form="10-K").latest(1)
    print(filing)

    # Download the primary document to extract text. The timeout keeps a
    # stalled connection from hanging the notebook indefinitely.
    response = requests.get(filing.document.url, headers=request_headers, timeout=30)

    # status_code is an int; the previous comparison against the string '200'
    # was always unequal, so error responses were saved as if they were filings.
    if response.status_code == 200:
        with open(f"{output_path}/{company_ticker}_filing_content.html", "wb") as file:
            file.write(response.content)
        print(f'Downloaded {company_ticker}_filing_content.html')
    else:
        print(response.status_code)


def extract_text_from_filing(company_ticker):
    """Extract the "Item 1. Business" section text from a saved 10-K HTML file.

    Reads ``{output_path}/{company_ticker}_filing_content.html`` as UTF-8,
    gathers the non-empty text of every ``<p>`` and ``<div>`` element, and
    narrows that list to the span between the last short
    "Item 1 ... Business" heading and the last short
    "Item 1A ... Risk Factors" heading seen after it. Bare-integer fragments
    and short "Table of Contents" fragments are then dropped.

    Args:
        company_ticker: Ticker symbol used to build the input file name.

    Returns:
        str: The remaining fragments joined with single spaces. When no start
        heading is found the span begins at the first fragment; when no end
        heading follows the start heading it runs to the last fragment.

    Raises:
        FileNotFoundError: If the filing HTML file does not exist.
    """
    file_path = f"{output_path}/{company_ticker}_filing_content.html"
    with open(file_path, "r", encoding="utf-8") as file:
        content = file.read()

    # Parse the HTML content with BeautifulSoup
    soup = BeautifulSoup(content, "html.parser")

    # Collect the stripped text of every <p>/<div> that has any.
    # NOTE(review): find_all also returns nested elements, so a parent's text
    # presumably repeats its children's text -- confirm whether that is wanted.
    text_elements = []
    for tag in soup.find_all(['p', 'div']):
        text = tag.get_text(strip=True)
        if text:
            text_elements.append(text)

    # Locate the business section. The length limits restrict matches to
    # short, heading-like fragments rather than body text mentioning the items.
    start_idx = None
    end_idx = None
    for i, t in enumerate(text_elements):
        if 'Item 1' in t and 'Business' in t and len(t) < 20:
            start_idx = i
            # An end marker found before this heading no longer applies.
            end_idx = None
        elif 'Item 1A' in t and 'Risk Factors' in t and len(t) < 25:
            end_idx = i
    text_elements = text_elements[start_idx:end_idx]

    def useful_content(s):
        """Return False for bare integers and short "Table of Contents" fragments."""
        try:
            int(s)
        except ValueError:
            # Not a bare integer; only ValueError is expected from int() on a
            # string, so nothing else is swallowed here.
            pass
        else:
            return False
        ls = s.lower()
        if 'table' in ls and 'of' in ls and 'contents' in ls and len(ls) < 20:
            return False
        return True

    text_elements = [t for t in text_elements if useful_content(t)]

    return ' '.join(text_elements)


def main():
    """Build one LangChain ``Document`` per company for the first five CSV rows.

    Loads the Symbol/Name/Sector columns of the company list, renames them to
    Ticker/Company_Name, and for each of the first five rows downloads the
    filing, extracts its text, prints it, and wraps it in a ``Document``
    carrying the company name, ticker and sector as metadata.

    Returns:
        list: The ``Document`` objects, in CSV row order (also printed).
    """
    renamed_columns = {'Symbol': 'Ticker', 'Name': 'Company_Name'}
    sp_companies = pd.read_csv(
        r"C:\git\GoGoAI-ClimateFin\financials.csv",
        usecols=['Symbol', 'Name', 'Sector'],
    ).rename(columns=renamed_columns)

    docs = []  # LangChain Document objects, one per processed company

    for _, company in sp_companies.head(5).iterrows():
        ticker = company.Ticker
        download_filing_primary_document(ticker)
        filing_text = extract_text_from_filing(ticker)
        print(filing_text)
        company_metadata = {
            "company_name": company.Company_Name,
            "ticker": ticker,
            "sector": company.Sector,
        }
        docs.append(Document(page_content=filing_text, metadata=company_metadata))

    print(docs)
    return docs

In [51]:
# Parse the saved MMM filing. NOTE(review): this only reads
# {output_path}/MMM_filing_content.html; nothing above this cell downloads it,
# so the file must already exist on disk from an earlier run.
text_element = extract_text_from_filing('MMM')

In [None]:
text_element

In [None]:
# Download and parse the filings for the first five companies in the CSV;
# this performs HTTP requests and writes HTML files under output_path.
docs = main()

In [63]:
# Keep only the first three documents for the splitting/embedding steps below.
docs_small = docs[0:3]

In [None]:
docs_small

In [7]:
# Split: configure the splitter used to break each document into overlapping
# chunks before they are embedded.
from langchain.text_splitter import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1500,  # upper bound on the size of each chunk
    chunk_overlap = 150  # amount shared between consecutive chunks
)

In [8]:
# Chunk the selected documents with the splitter configured above.
splits = text_splitter.split_documents(docs_small)

In [None]:
splits

In [None]:
len(splits)

In [None]:
# Read the key from the environment rather than keeping a literal in the
# notebook; a missing OPENAI_API_KEY fails here with a clear KeyError.
embedding = OpenAIEmbeddings(api_key=os.environ["OPENAI_API_KEY"])

In [None]:
persist_directory

In [None]:
# Embed the chunks and build the Chroma vector store from them.
# NOTE(review): the persist_directory passed here is a literal that differs
# from the `persist_directory` variable defined in the setup cell and
# displayed just above -- confirm which location is intended.
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embedding,
    persist_directory='C:/Users/YXiao/Documents/'
)

In [None]:
# Chat model used by the retrieval chain; temperature=0 is passed for
# minimally random output.
llm_name = "gpt-4o"
from langchain.chat_models import ChatOpenAI
# The key is read from the environment rather than kept as a literal in the
# notebook; a missing OPENAI_API_KEY fails here with a clear KeyError.
llm = ChatOpenAI(model_name=llm_name, temperature=0,
                api_key=os.environ["OPENAI_API_KEY"])
# Smoke test: one round-trip to confirm the model is reachable.
llm.predict("Hello world!")

In [29]:
# Conversation memory handed to the retrieval chain; the chain is told to look
# for the history under the "chat_history" key.
# NOTE(review): an identical cell further down rebinds `memory` to a new object.
from langchain.memory import ConversationBufferMemory
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True
)

In [30]:
# Build the conversational retrieval chain from the chat model, a retriever
# over vectordb, and the memory object created above.
# NOTE(review): an identical cell further down rebinds `retriever` and `qa`.
from langchain.chains import ConversationalRetrievalChain
retriever=vectordb.as_retriever()
qa = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=retriever,
    memory=memory
)

In [31]:
# Build prompt: answer template with {context} and {question} placeholders.
# NOTE(review): QA_CHAIN_PROMPT is not passed to either
# ConversationalRetrievalChain built in this notebook, so as written it has no
# effect on the answers -- confirm whether it should be wired in. Its
# "three sentences maximum" rule would also conflict with the bullet-point
# summary requested by the question cell below.
from langchain.prompts import PromptTemplate
template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate(input_variables=["context", "question"],template=template,)

In [32]:
# Re-creates `memory` with the same arguments as the earlier memory cell,
# rebinding the name to a new buffer object for the chain built next.
from langchain.memory import ConversationBufferMemory
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True
)

In [33]:
# Rebuilds `retriever` and `qa` with the same arguments as the earlier chain
# cell, now bound to the most recently created `memory` object.
# NOTE(review): QA_CHAIN_PROMPT is not passed here -- confirm whether that is
# intentional.
from langchain.chains import ConversationalRetrievalChain
retriever=vectordb.as_retriever()
qa = ConversationalRetrievalChain.from_llm(
    llm,
    retriever=retriever,
    memory=memory
)

In [None]:
# Ask the chain a single question and print the 'answer' entry of its result.
question = "What's the name of the company with the best climate strategy and provide a summary of its environment/sustainability strategy in keywords and bullet points"
result = qa({"question": question})
print(result['answer'])

In [None]:
Company('TSLA').get_filings(form="10-K").latest(1)

In [4]:
# Latest 10-K filing for ticker TSLA (same query as the display cell above),
# kept in `filing` for the XBRL exploration below.
filing = Company('TSLA').get_filings(form="10-K").latest(1)


In [None]:
# Load the XBRL data attached to the filing and display it.
xbrl_data = filing.xbrl()
xbrl_data

In [None]:
# Financial statements exposed by the XBRL data; the cells below index this
# object by statement name.
statements = xbrl_data.statements
statements

In [None]:
statements['ConsolidatedBalanceSheets']

In [None]:
statements['ConsolidatedBalanceSheets'].get_dataframe()

In [None]:
statements['ConsolidatedStatementsofComprehensiveIncome']

In [None]:
statements['ConsolidatedStatementsofCashFlows']