In [2]:
import os
from langchain_community.tools.tavily_search import TavilySearchResults
from dotenv import load_dotenv
load_dotenv()


True

In [3]:
TAVILY_API_KEY = os.getenv('TAVILY_API_KEY')

In [4]:
web_search_tool = TavilySearchResults(TAVILY_API_KEY=TAVILY_API_KEY,\
    description="As an AI Search Engine, search all the web and provide correct URLs for information related to the company name provided as input. You are expected to provide correct URLs that contain information about the given company's subsidiaries and associated domains. Focus on reputable sources such as company websites, business directories, news articles, and other relevant online resources. Search the web for accurate and up-to-date information about the company provided in query. Provide reliable URLs containing information on: 1. The company's subsidiaries 2. Associated domains and websites owned by the company. Focus on reputable sources such as: - Official company websites - Business directories (e.g., Bloomberg, Reuters) - Financial news websites - Industry-specific publications - Government databases (e.g., SEC filings for public companies). Prioritize recent and authoritative sources. Exclude unofficial blogs, personal websites, or unreliable sources. Provide a brief summary of what each URL contains, focusing on its relevance to the company's subsidiaries and associated domains.",\
        max_results=10,include_answer=True,include_images=True)

In [5]:
company_name = 'Google'

In [6]:
web_results = web_search_tool.invoke({"query": f"Search for and provide reliable web links containing information about subsidiaries and associated domains of the company: {company_name}. Focus on authoritative sources such as official company websites, business directories, and reputable news outlets. For each link, briefly summarize its relevance to the company's subsidiaries or associated domains."})

In [7]:
web_results

[{'url': 'https://cloud.google.com/resource-manager/docs/managing-multiple-orgs',
  'content': 'Each Google Workspace or Cloud Identity account is also associated with a primary domain, such as example.com . For details on using multiple domains, see Add\xa0...Missing:  subsidiaries | Show results with:subsidiaries'},
 {'url': 'https://icannwiki.org/Google',
  'content': "May 31, 2022 · Based on ICANN's List of New gTLD Applied-For Strings, Google applied for 101 new gTLDs through its wholly-owned subsidiary, Charleston Road\xa0..."},
 {'url': 'http://www.google.com/support/enterprise/static/gapps/docs/admin/en/nftf/multiple_domains/gapps_%20multiple_domains.pdf',
  'content': 'For example, bill@subsidiary.com could have the email alias bill@parent.com — as long as someone else does not already have that email address.'},
 {'url': 'https://en.wikipedia.org/wiki/Alphabet_Inc.',
  'content': 'He clarified that, as a result of the new holding company, Google would be "a bit slimmed down, 

In [8]:
web_result_tavily_url = [result['url'] for result in web_results]

In [9]:
web_result_tavily_url

['https://cloud.google.com/resource-manager/docs/managing-multiple-orgs',
 'https://icannwiki.org/Google',
 'http://www.google.com/support/enterprise/static/gapps/docs/admin/en/nftf/multiple_domains/gapps_%20multiple_domains.pdf',
 'https://en.wikipedia.org/wiki/Alphabet_Inc.',
 'https://en.wikipedia.org/wiki/Google',
 'https://support.google.com/nonprofits/thread/74557231/g-suite-legacy-free-to-g-suite-non-profit-but-with-two-domains?hl=en',
 'https://www.investopedia.com/investing/companies-owned-by-google/',
 'https://medium.com/google-cloud/setting-up-google-cloud-identity-for-multinational-companies-4afcbb18dee1',
 'https://support.google.com/a/answer/12099366?hl=en',
 'https://cloud.google.com/architecture/identity/best-practices-for-planning']

In [10]:
# write in json
import json
with open('web_result_tavily_url_latest.json', 'w') as f:
    json.dump(web_results, f)

In [11]:
# Perplexity web search for subsidiaries and associated domains
import os
PPLX_API_KEY = os.getenv('PERPLEXITY_API_KEY')


In [12]:
from langchain_community.chat_models import ChatPerplexity
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

In [13]:
chat = ChatPerplexity(
    temperature=0, pplx_api_key=PPLX_API_KEY, model="llama-3-sonar-small-32k-online"
)

In [14]:
system = "As an AI Search Engine, search the entire web and provide correct URLs for information related \
    to the company name provided as input. You are expected to provide correct URLs\
    that contain information about the given company's subsidiaries and associated domains.\
    Focus on reputable sources such as company websites, business directories,\
    news articles, and other relevant online resources.\
    Search the web for accurate and up-to-date information about the company provided in query.\
    Provide reliable URLs containing information on:\
    1. The company's subsidiaries\
    2. Associated domains and websites owned by the company.\
    Focus on reputable sources such as: \
    - Official company websites - Business directories (e.g., Bloomberg, Reuters)\
    - Financial news websites\
    - Industry-specific publications\
    - Government databases (e.g., SEC filings for public companies).\
    Prioritize recent and authoritative sources.\
    Exclude unofficial blogs, personal websites, or unreliable sources.\
    Provide a brief summary of what each URL contains,\
    focusing on its relevance to the company's subsidiaries and associated domains."
human = "{input}"
prompt = ChatPromptTemplate.from_messages([("system", system), ("human", human)])

chain = prompt | chat | StrOutputParser()
perplexity_web_response = chain.invoke({"input": f"Search for and provide reliable web links\
                                    containing information about subsidiaries\
                                    and associated domains of the company: {company_name}.\
                                    Focus on authoritative sources such as official company websites,\
                                    business directories, and reputable news outlets.\
                                    For each link, briefly summarize its relevance to the company's\
                                    subsidiaries or associated domains."})


In [15]:
perplexity_web_response

"To find reliable web links containing information about subsidiaries and associated domains of Google, I searched authoritative sources such as official company websites, business directories, and reputable news outlets. Here are the results:\n\n1. **Google's Official Website**\n   - **URL:** https://www.google.com\n   - **Summary:** Google's official website provides information about the company's products and services, including search engines, Google Maps, Google Drive, and other applications. It also includes details about Google's subsidiaries and associated domains.\n\n2. **Google's Subsidiaries**\n   - **URL:** https://www.google.com/about/subsidiaries/\n   - **Summary:** This page lists Google's subsidiaries, including Alphabet Inc., Google Cloud, Google Fiber, Google Ventures, and Google Capital. It provides details about each subsidiary's role and services.\n\n3. **Google's Associated Domains**\n   - **URL:** https://www.google.com/about/our-company/associated-companies/\n 

In [16]:
with open('perplexity_web_response_latest.json', 'w') as f:
    json.dump(perplexity_web_response, f)

In [61]:
#response.content

In [17]:
OPENAI_API_KEY=os.getenv('OPENAI_API_KEY')

In [83]:
#print(OPENAI_API_KEY)

In [18]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(api_key=OPENAI_API_KEY)

In [19]:
openai_model='gpt-4o'

In [20]:
from langchain_core.prompts import ChatPromptTemplate

system = """
You are an AI Content Extractor. Your job is to extract urls with the summary as text.
Return the information in the following list format:
[
    {
        "url": "https://www.test.com",
        "summary": "This is a summary of the content in the website"
    },
    {
        "url": "https://www.test1.com",
        "summary": "This is a summary of the content in the website"
    }
]
"""
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "{system}"),
        ("human", "Response is :{response}"),
    ]
)

llm = ChatOpenAI(model=openai_model, temperature=0)
chain = prompt | llm | StrOutputParser()

openai_perplexity_response = chain.invoke(
    {
        "system": system,
        "response": perplexity_web_response
    }
    
)


In [21]:
openai_perplexity_response

'[\n    {\n        "url": "https://www.google.com",\n        "summary": "Google\'s official website provides information about the company\'s products and services, including search engines, Google Maps, Google Drive, and other applications. It also includes details about Google\'s subsidiaries and associated domains."\n    },\n    {\n        "url": "https://www.google.com/about/subsidiaries/",\n        "summary": "This page lists Google\'s subsidiaries, including Alphabet Inc., Google Cloud, Google Fiber, Google Ventures, and Google Capital. It provides details about each subsidiary\'s role and services."\n    },\n    {\n        "url": "https://www.google.com/about/our-company/associated-companies/",\n        "summary": "This page lists Google\'s associated companies, including YouTube, Android, and other platforms. It provides information about how these domains are related to Google\'s core services and products."\n    },\n    {\n        "url": "https://www.bloomberg.com/profile/com

In [22]:
perplexity_web_result = json.loads(openai_perplexity_response)
final_perplexity_urls = [result['url'] for result in perplexity_web_result]

In [23]:
final_perplexity_urls

['https://www.google.com',
 'https://www.google.com/about/subsidiaries/',
 'https://www.google.com/about/our-company/associated-companies/',
 'https://www.bloomberg.com/profile/company/1253793D',
 'https://www.reuters.com/technology/google']

In [24]:
fial_web_scrapping_urls = list(set(web_result_tavily_url) | set(final_perplexity_urls))

In [25]:
fial_web_scrapping_urls

['https://support.google.com/nonprofits/thread/74557231/g-suite-legacy-free-to-g-suite-non-profit-but-with-two-domains?hl=en',
 'https://www.google.com',
 'https://en.wikipedia.org/wiki/Google',
 'https://medium.com/google-cloud/setting-up-google-cloud-identity-for-multinational-companies-4afcbb18dee1',
 'https://www.reuters.com/technology/google',
 'https://www.google.com/about/our-company/associated-companies/',
 'https://en.wikipedia.org/wiki/Alphabet_Inc.',
 'https://www.bloomberg.com/profile/company/1253793D',
 'https://www.google.com/about/subsidiaries/',
 'https://cloud.google.com/resource-manager/docs/managing-multiple-orgs',
 'https://cloud.google.com/architecture/identity/best-practices-for-planning',
 'https://support.google.com/a/answer/12099366?hl=en',
 'http://www.google.com/support/enterprise/static/gapps/docs/admin/en/nftf/multiple_domains/gapps_%20multiple_domains.pdf',
 'https://icannwiki.org/Google',
 'https://www.investopedia.com/investing/companies-owned-by-google/

In [110]:
# from langchain.retrievers.web_research import WebResearchRetriever
# from langchain_chroma import Chroma
# from langchain_community.utilities import GoogleSearchAPIWrapper
# from langchain_openai import ChatOpenAI, OpenAIEmbeddings

In [111]:
# # Vectorstore
# vectorstore = Chroma(
#     embedding_function=OpenAIEmbeddings(), persist_directory="./chroma_db_oai"
# )

# # LLM
# # llm = ChatOpenAI(temperature=0)

# # Search
# search = GoogleSearchAPIWrapper()

In [33]:
# Web scrapping using Crawl4AI
# from crawl4ai import WebCrawler

# # Create an instance of WebCrawler
# crawler = WebCrawler()
# # Warm up the crawler (load necessary models)
# crawler.warmup()

In [34]:

# consolidated_context_list = []
# for url in fial_web_scrapping_urls:
#     result = crawler.run(url=url)
#     consolidated_context_list.append(result.markdown)





In [35]:
# consolidated_context_list

In [31]:
# result_check=crawler.run(url="https://en.wikipedia.org/wiki/Google")

In [32]:
# result.markdown

In [36]:
from crawl4ai import WebCrawler

# Create an instance of WebCrawler
crawler = WebCrawler()

# Warm up the crawler (load necessary models)
crawler.warmup()

# # Run the crawler on a URL
# result = crawler.run(url="https://openai.com/api/pricing/")

# # Print the extracted content
# print(result.markdown)

[LOG] 🚀 Initializing LocalSeleniumCrawlerStrategy
[LOG] 🌤️  Warming up the WebCrawler
[LOG] 🌞 WebCrawler is ready to crawl


In [39]:
# import os
# from crawl4ai import WebCrawler
# from crawl4ai.extraction_strategy import LLMExtractionStrategy
# from pydantic import BaseModel, Field

# class OpenAIModelInfoExtractor(BaseModel):
#     model_name: str = Field(..., description="Subsidiaries and Associated Domains Extractor")
#     input_data: str = Field(..., description="Extract subsidiaries and associated domains of the company.")
#     output_data: str = Field(..., description="Output as Extracted subsidiaries and associated domains of the company.")

# url = 'https://www.investopedia.com/investing/companies-owned-by-google/'
# crawler = WebCrawler()
# crawler.warmup()

# result = crawler.run(
#         url=url,
#         word_count_threshold=1,
#         extraction_strategy= LLMExtractionStrategy(
#             provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'),
#             schema=OpenAIModelInfoExtractor.schema(),
#             extraction_type="schema",
#             instruction="""Extract the subsidiaries and associated domains of the company .Extracted model JSON format should look like this: 
#             {"subsidiaries":["subsidiaries1","subsidiaries2",...], "associated_domains": ["associated_domain_1","associated_domain_2",....]}."""
#         ),            
#         bypass_cache=True,
#     )

# print(result.extracted_content)

In [40]:
# type(result.extracted_content)

In [41]:
import os
from crawl4ai import WebCrawler
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from pydantic import BaseModel, Field
crawler = WebCrawler()
crawler.warmup()
consolidated_crawl4ai_context_list = []

class OpenAIModelInfoExtractor(BaseModel):
    model_name: str = Field(..., description="Subsidiaries and Associated Domains Extractor")
    input_data: str = Field(..., description="Extract subsidiaries and associated domains of the company.")
    output_data: str = Field(..., description="Output as Extracted subsidiaries and associated domains of the company.")

# run each loop against final_web_scrapping_urls

for url in fial_web_scrapping_urls:
    result = crawler.run(
        url=url,
        word_count_threshold=1,
        extraction_strategy= LLMExtractionStrategy(
            provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'),
            schema=OpenAIModelInfoExtractor.schema(),
            extraction_type="schema",
            instruction="""Extract the subsidiaries and associated domains of the company .Extracted model JSON format should look like this: 
            {"subsidiaries":["subsidiaries1","subsidiaries2",...], "associated_domains": ["associated_domain_1","associated_domain_2",....]}."""
            ),            
            bypass_cache=True,
        )

    consolidated_crawl4ai_context_list.append(result.extracted_content)




[LOG] 🚀 Initializing LocalSeleniumCrawlerStrategy
[LOG] 🌤️  Warming up the WebCrawler
[LOG] 🌞 WebCrawler is ready to crawl
[LOG] 🚀 Crawling done for https://support.google.com/nonprofits/thread/74557231/g-suite-legacy-free-to-g-suite-non-profit-but-with-two-domains?hl=en, success: True, time taken: 4.100951194763184 seconds
[LOG] 🚀 Content extracted for https://support.google.com/nonprofits/thread/74557231/g-suite-legacy-free-to-g-suite-non-profit-but-with-two-domains?hl=en, success: True, time taken: 0.06629085540771484 seconds
[LOG] 🔥 Extracting semantic blocks for https://support.google.com/nonprofits/thread/74557231/g-suite-legacy-free-to-g-suite-non-profit-but-with-two-domains?hl=en, Strategy: LLMExtractionStrategy
[LOG] Call LLM for https://support.google.com/nonprofits/thread/74557231/g-suite-legacy-free-to-g-suite-non-profit-but-with-two-domains?hl=en - block index: 0
[LOG] Extracted 1 blocks from URL: https://support.google.com/nonprofits/thread/74557231/g-suite-legacy-free-to

In [42]:
consolidated_crawl4ai_context_list

['[\n    {\n        "subsidiaries": [\n            "subsidiaries"\n        ],\n        "associated_domains": [\n            ".charity",\n            ".me"\n        ],\n        "error": false\n    }\n]',
 '[\n    {\n        "subsidiaries": [],\n        "associated_domains": [],\n        "error": false\n    }\n]',
 '[\n    {\n        "subsidiaries": [],\n        "associated_domains": [],\n        "error": false\n    },\n    {\n        "subsidiaries": [],\n        "associated_domains": [],\n        "error": false\n    },\n    {\n        "subsidiaries": [],\n        "associated_domains": [],\n        "error": false\n    },\n    {\n        "model_name": "Subsidiaries and Associated Domains Extractor",\n        "input_data": "https://en.wikipedia.org/wiki/Google",\n        "output_data": {\n            "subsidiaries": [],\n            "associated_domains": []\n        },\n        "error": false\n    },\n    {\n        "subsidiaries": [],\n        "associated_domains": [],\n        "error": f

In [43]:
# write in txt file
with open('consolidated_crawl4ai_context_list_latest.txt', 'w') as f:
    for item in consolidated_crawl4ai_context_list:
        f.write("%s\n" % item)

In [44]:
# convert list to string
consolidated_crawl4ai_context_list_str = ' '.join(str(item) for item in consolidated_crawl4ai_context_list if item is not None)

In [45]:
consolidated_crawl4ai_context_list_str

'[\n    {\n        "subsidiaries": [\n            "subsidiaries"\n        ],\n        "associated_domains": [\n            ".charity",\n            ".me"\n        ],\n        "error": false\n    }\n] [\n    {\n        "subsidiaries": [],\n        "associated_domains": [],\n        "error": false\n    }\n] [\n    {\n        "subsidiaries": [],\n        "associated_domains": [],\n        "error": false\n    },\n    {\n        "subsidiaries": [],\n        "associated_domains": [],\n        "error": false\n    },\n    {\n        "subsidiaries": [],\n        "associated_domains": [],\n        "error": false\n    },\n    {\n        "model_name": "Subsidiaries and Associated Domains Extractor",\n        "input_data": "https://en.wikipedia.org/wiki/Google",\n        "output_data": {\n            "subsidiaries": [],\n            "associated_domains": []\n        },\n        "error": false\n    },\n    {\n        "subsidiaries": [],\n        "associated_domains": [],\n        "error": false\n   

In [91]:
# AI Agrent to Extrct Subsidiaries and Associated Domains

from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import JsonOutputParser

In [95]:
prompt_final_response_format="""You are an AI Agent. Your task is to extract the subsidiaries and associated domains of the company.\n
Output format should be given json format:{"subsidiaries":["subsidiaries1","subsidiaries2",...], "associated_domains": ["associated_domain_1","associated_domain_2",....]}.Make sure keys are subsidiaries and associated_domains.\n"""

In [96]:
class OpenAIModel:
    def __init__(self):
        self.model_id = "gpt-4o"
    def gpt_model_call_without_chain(self,prompt,context):
        messages = [
            (
        "system",
        prompt,
        ),
    ("human", context),
    ]
        llm = ChatOpenAI(model=self.model_id, temperature=0,max_tokens=4000)
        response = llm.invoke(messages)
        response_explainbility =response.content
        return response_explainbility

In [97]:
obj_openai = OpenAIModel()
response = obj_openai.gpt_model_call_without_chain(prompt_final_response_format,consolidated_crawl4ai_context_list_str)

In [99]:
if response.startswith("```json"):
    subsidiaries_associated_domain = json.loads(response.replace('```json\n', '').replace('\n```', ''))
else:
    subsidiaries_associated_domain = json.loads(response)

In [102]:
with open('subsidiaries_associated_domain_latest.json', 'w') as f:
    json.dump(subsidiaries_associated_domain, f,indent=4)

In [76]:
# Define your desired data structure.
class SubsidiaryWithDomains(BaseModel):
    output: str = Field(description="""{"subsidiaries": ["subsidiaries1", "subsidiaries2", ...], "associated_domains": ["associated_domain_1", "associated_domain_2", ...]}""")
    

In [77]:
# final_prompt = ChatPromptTemplate.from_messages(
#     [
#         (
#             "system",
#             """You are an AI Agent. Your task is to extract the subsidiaries and associated domains of the company.
#             output format should be like this: {"subsidiaries":["subsidiaries1","subsidiaries2",...], "associated_domains": ["associated_domain_1","associated_domain_2",....]}""",
#         ),
#         ("human", "For company: {company_name} Extract the subsidiaries and associated domains and context from where to extract is {consolidated_crawl4ai_context_list_str}"),
#     ]
# )

In [78]:
parser = JsonOutputParser(pydantic_object=SubsidiaryWithDomains)


final_prompt = PromptTemplate(
    template="You are an AI Agent. Your task is to extract the subsidiaries and associated domains of the company.\n{format_instructions}\n{query}\n",
    input_variables=["query"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)

In [72]:
# ChatPromptTemplate(template_format="Answer")

In [73]:
# final_prompt

In [80]:
llm_final = ChatOpenAI(model=openai_model, temperature=0, max_tokens=2048,max_retries=5)
chain_final = final_prompt | llm_final | parser
# chain_final_response = chain_final.invoke(
#     {
#         "company_name": company_name,
#         "consolidated_crawl4ai_context_list_str": consolidated_crawl4ai_context_list_str
#     }
# )

chain_final_response = chain_final.invoke({"query": f"For company: {company_name} Extract the subsidiaries and associated domains and context from where to extract is {consolidated_crawl4ai_context_list_str}"})
    




In [81]:
chain_final_response

{'output': '{"subsidiaries": ["Waymo", "Sidewalk Labs", "Google DeepMind", "Adscape", "Android", "Charleston Road Registry", "DeepMind", "Endoxon", "FeedBurner", "Fitbit", "ImageAmerica", "Kaltix", "Nest Labs", "reCAPTCHA", "YouTube", "ZipDash", "Alphabet Inc.", "Raxium", "DoubleClick", "Motorola Mobility", "Waze", "Google Workspace", "Google Cloud Platform", "Google for Entrepreneurs", "Google Analytics 360 Suite", "Google Fiber", "Project Fi", "AdMob", "AdSense", "Pyra Labs", "Currents (news app)", "Green Throttle Games", "Owlchemy Labs", "Oyster", "PaperofRecord.com", "Podcasts", "Quick, Draw!", "Santa Tracker", "Songza", "Stadia", "Typhoon Studios", "TV", "Vevo", "Video", "Books", "Games", "Music", "Newsstand", "Pass", "Services", "BandPage", "BrandConnect", "Content ID", "Instant", "Kids", "Music", "Official channel", "Preferred", "Premium", "YouTube Rewind", "RightsFlow", "Shorts", "Studio", "TV", "Allo", "Bump", "Buzz", "Chat", "Contacts", "Currents (social app)", "Dodgeball", "

In [83]:
type(chain_final_response)

dict

In [88]:
# dict to json
chain_final_response_json = json.dumps(chain_final_response, indent=4)

In [90]:
print(chain_final_response_json)

{
    "output": "{\"subsidiaries\": [\"Waymo\", \"Sidewalk Labs\", \"Google DeepMind\", \"Adscape\", \"Android\", \"Charleston Road Registry\", \"DeepMind\", \"Endoxon\", \"FeedBurner\", \"Fitbit\", \"ImageAmerica\", \"Kaltix\", \"Nest Labs\", \"reCAPTCHA\", \"YouTube\", \"ZipDash\", \"Alphabet Inc.\", \"Raxium\", \"DoubleClick\", \"Motorola Mobility\", \"Waze\", \"Google Workspace\", \"Google Cloud Platform\", \"Google for Entrepreneurs\", \"Google Analytics 360 Suite\", \"Google Fiber\", \"Project Fi\", \"AdMob\", \"AdSense\", \"Pyra Labs\", \"Currents (news app)\", \"Green Throttle Games\", \"Owlchemy Labs\", \"Oyster\", \"PaperofRecord.com\", \"Podcasts\", \"Quick, Draw!\", \"Santa Tracker\", \"Songza\", \"Stadia\", \"Typhoon Studios\", \"TV\", \"Vevo\", \"Video\", \"Books\", \"Games\", \"Music\", \"Newsstand\", \"Pass\", \"Services\", \"BandPage\", \"BrandConnect\", \"Content ID\", \"Instant\", \"Kids\", \"Music\", \"Official channel\", \"Preferred\", \"Premium\", \"YouTube Rewind

Note: you may need to restart the kernel to use updated packages.


ERROR: unknown command "unininstall" - maybe you meant "uninstall"

