In [2]:
!pip install yfinance langchain_pinecone openai python-dotenv langchain-community sentence_transformers pinecone-client groq



In [3]:
import os
import json
from langchain_pinecone import PineconeVectorStore
from langchain_community.embeddings import HuggingFaceEmbeddings
from google.colab import userdata
import concurrent.futures
import groq
import asyncio
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer
import yfinance as yf
import requests
import logging
from langchain.schema import Document

In [9]:
PINECONE_API_KEY = userdata.get('PINECONE_API_KEY')
GROQ_API_KEY = userdata.get('GROQ_API_KEY')

pinecone = Pinecone(api_key=PINECONE_API_KEY)
index_name = "stocks"
model_name = "sentence-transformers/all-mpnet-base-v2"
namespace = "stock-descriptions"
index = pinecone.Index(index_name)
embedding_model = SentenceTransformer(model_name)

In [5]:
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [57]:
async def get_stock_info(ticker):
    try:
        stock = await asyncio.to_thread(lambda: yf.Ticker(ticker))
        info = await asyncio.to_thread(lambda: stock.info)
        return {
            "Ticker": info.get("symbol", "N/A"),
            "Name": info.get("longName", "N/A"),
            "Business Summary": info.get("longBusinessSummary", "N/A"),
            "Sector": info.get("sector", "N/A"),
            "Industry": info.get("industry", "N/A"),
            "Market Cap": info.get("marketCap", 0),
            "Volume": info.get("volume", 0),
            "PE Ratio": info.get("trailingPE", "N/A"),
            "Dividend Yield": info.get("dividendYield", "N/A"),
            "Beta": info.get("beta", "N/A"),
        }
    except Exception as e:
        logger.error(f"Failed to fetch info for {ticker}: {e}")
        return None

In [58]:
async def get_huggingface_embeddings(text):
    try:
        embedding = await asyncio.to_thread(lambda: embedding_model.encode(text))
        return embedding
    except Exception as e:
        logger.error(f"Failed to generate embeddings: {e}")
        return None

In [16]:
def get_company_tickers():
    url = "https://raw.githubusercontent.com/team-headstart/Financial-Analysis-and-Automation-with-LLMs/main/company_tickers.json"
    response = requests.get(url)
    if response.status_code == 200:
        return json.loads(response.content.decode('utf-8'))
    else:
        raise Exception("Failed to fetch tickers")

company_tickers = get_company_tickers()

In [15]:
# for idx, stock in company_tickers.items():
#     stock_ticker = stock['ticker']
#     stock_data = get_stock_info(stock_ticker)
#     if not stock_data or stock_data["Business Summary"] == "N/A":
#         print(f"Skipping {stock_ticker}: Insufficient data")
#         continue

#     stock_description = stock_data['Business Summary']
#     # print(f"Processing stock {idx + 1} / {len(company_tickers)}: {stock_ticker}")
#     try:
#         vectorstore.add_texts(
#             texts=[stock_description],
#             metadatas=[stock_data],
#             namespace=namespace
#         )
#     except Exception as e:
#         print(f"Error indexing {stock_ticker}: {e}")

KeyboardInterrupt: 

In [23]:
# async def process_stock(ticker, namespace):
#     try:
#         stock_data = await get_stock_info(ticker)
#         if not stock_data or stock_data["Business Summary"] == "N/A":
#             logger.warning(f"SKIPPED: {ticker} - Insufficient data")
#             return

#         embedding = await get_huggingface_embeddings(stock_data["Business Summary"])
#         if embedding is not None:
#             index.upsert(
#                 vectors=[(ticker, embedding.tolist(), stock_data)],
#                 namespace=namespace,
#             )
#             logger.info(f"PROCESSED: {ticker} - Successfully stored in Pinecone")
#     except Exception as e:
#         logger.error(f"FAILED: {ticker} - {e}")


async def process_stocks_concurrently(tickers, namespace, max_concurrent=100, batch_size=100):
    semaphore = asyncio.Semaphore(max_concurrent)
    vectors = []

    async def semaphore_process(ticker):
        async with semaphore:
            try:
                stock_data = await get_stock_info(ticker)
                if not stock_data or stock_data["Business Summary"] == "N/A":
                    logger.warning(f"SKIPPED: {ticker} - Insufficient data")
                    return None

                embedding = await get_huggingface_embeddings(stock_data["Business Summary"])
                if embedding is not None:
                    unique_id = f"{ticker}_{stock_data['Name'].replace(' ', '_')}"
                    return (unique_id, embedding.tolist(), stock_data)
            except Exception as e:
                logger.error(f"FAILED: {ticker} - {e}")
                return None

    tasks = [semaphore_process(ticker) for ticker in tickers]

    results = await asyncio.gather(*tasks)
    vectors = [result for result in results if result is not None]

    for i in range(0, len(vectors), batch_size):
        batch = vectors[i:i+batch_size]
        try:
            index.upsert(vectors=batch, namespace=namespace)
            logger.info(f"Upserted batch {i//batch_size + 1}")
        except Exception as e:
            logger.error(f"Batch upsert failed: {e}")

    return vectors

In [24]:
company_tickers

{'0': {'cik_str': 1045810, 'ticker': 'NVDA', 'title': 'NVIDIA CORP'},
 '1': {'cik_str': 320193, 'ticker': 'AAPL', 'title': 'Apple Inc.'},
 '2': {'cik_str': 789019, 'ticker': 'MSFT', 'title': 'MICROSOFT CORP'},
 '3': {'cik_str': 1018724, 'ticker': 'AMZN', 'title': 'AMAZON COM INC'},
 '4': {'cik_str': 1652044, 'ticker': 'GOOGL', 'title': 'Alphabet Inc.'},
 '5': {'cik_str': 1326801, 'ticker': 'META', 'title': 'Meta Platforms, Inc.'},
 '6': {'cik_str': 1318605, 'ticker': 'TSLA', 'title': 'Tesla, Inc.'},
 '7': {'cik_str': 1067983,
  'ticker': 'BRK-B',
  'title': 'BERKSHIRE HATHAWAY INC'},
 '8': {'cik_str': 1046179,
  'ticker': 'TSM',
  'title': 'TAIWAN SEMICONDUCTOR MANUFACTURING CO LTD'},
 '9': {'cik_str': 1730168, 'ticker': 'AVGO', 'title': 'Broadcom Inc.'},
 '10': {'cik_str': 59478, 'ticker': 'LLY', 'title': 'ELI LILLY & Co'},
 '11': {'cik_str': 19617, 'ticker': 'JPM', 'title': 'JPMORGAN CHASE & CO'},
 '12': {'cik_str': 104169, 'ticker': 'WMT', 'title': 'Walmart Inc.'},
 '13': {'cik_str'

In [59]:
tickers = [company_tickers[key]["ticker"] for key in company_tickers.keys()]
print(len(tickers))

9998


In [None]:
async def main():
    namespace = "stock-descriptions-optimized"
    # tickers = ["META", "AAPL", "AMZN", "TSM", "AVGO", "BRK-B"]
    tickers = [company_tickers[key]["ticker"] for key in company_tickers.keys()]
    await process_stocks_concurrently(tickers, namespace)

await main()



In [53]:
async def semantic_stock_search(query, namespace, top_k=10):
    try:
        query_embedding = await get_huggingface_embeddings(query)
        if query_embedding is None:
            logger.warning(f"Embedding generation failed for query: {query}")
            return []

        query_embedding = query_embedding.tolist()

        search_results = index.query(
            vector=query_embedding,
            top_k=top_k,
            namespace=namespace,
            include_metadata=True
        )

        if not search_results.get("matches"):
            logger.info(f"No matches found for query: {query}")
            return []

        logger.debug(f"Semantic search results: {search_results}")
        return [result["metadata"] for result in search_results["matches"]]
    except Exception as e:
        logger.error(f"Pinecone query error: {e}")
        return []


In [51]:
async def generate_stock_insights(matched_stocks, query):
    if not matched_stocks:
        logger.info(f"No stocks matched the query: {query}")
        return "No matching stocks were found for your query."

    stock_context = "\n\n".join([
        f"Ticker: {stock['Ticker']}\n"
        f"Name: {stock['Name']}\n"
        f"Sector: {stock['Sector']}\n"
        f"Market Cap: {stock['Market Cap']}"
        for stock in matched_stocks
    ])

    prompt = f"""
    You are a professional investment analyst.
    Given the user query: "{query}"
    Stock Matches:
    {stock_context}
    Provide a concise summary of the matches. Highlight key insights and suggest potential opportunities or risks.
    """

    logger.debug(f"Generated prompt for Groq: {prompt}")

    groq_client = groq.Groq(api_key=GROQ_API_KEY)
    try:
        response = groq_client.chat.completions.create(
            messages=[
                {"role": "system", "content": "You are a professional investment analyst."},
                {"role": "user", "content": prompt}
            ],
            model="mixtral-8x7b-32768",
            max_tokens=500
        )

        logger.debug(f"Groq response: {response}")
        return response.choices[0].message.content
    except Exception as e:
        logger.error(f"Groq API error: {e}")
        return "Unable to generate insights at this time."


In [56]:
async def process_stock_query(query):
    matched_stocks = await semantic_stock_search(query, namespace)
    insights = await generate_stock_insights(matched_stocks, query)
    return {
        "query": query,
        "matched_stocks": matched_stocks,
        "insights": insights
    }

query = "highest market cap"
results = await process_stock_query(query)

print("Query:", results["query"])
print("\nMatched Stocks:")
for stock in results["matched_stocks"]:
    print(f"Ticker: {stock['Ticker']}, Name: {stock['Name']}, "
          f"Sector: {stock['Sector']}, Market Cap: {stock['Market Cap']}")

print("\nInvestment Insights:")
print(results["insights"])

Query: highest market cap

Matched Stocks:
Ticker: HPH, Name: Highest Performances Holdings Inc., Sector: Financial Services, Market Cap: 84965664.0
Ticker: WAI, Name: Top KingWin Ltd, Sector: Financial Services, Market Cap: 67716088.0
Ticker: TOP, Name: TOP Financial Group Limited, Sector: Financial Services, Market Cap: 57404092.0
Ticker: TWG, Name: Top Wealth Group Holding Limited, Sector: Consumer Defensive, Market Cap: 14789599.0
Ticker: HHHEF, Name: 37 Capital Inc., Sector: Basic Materials, Market Cap: 1314388.0
Ticker: DPCS, Name: DP Cap Acquisition Corp I, Sector: Financial Services, Market Cap: 91350000.0
Ticker: RMT, Name: Royce Micro-Cap Trust, Inc., Sector: Financial Services, Market Cap: 518477856.0
Ticker: TOPS, Name: Top Ships Inc., Sector: Energy, Market Cap: 33979440.0
Ticker: PMAX, Name: Powell Max Limited, Sector: Industrials, Market Cap: 45680612.0
Ticker: MAIN, Name: Main Street Capital Corporation, Sector: Financial Services, Market Cap: 4875372544.0

Investment I