In [6]:
from playwright.async_api import async_playwright
import asyncio
import nest_asyncio

nest_asyncio.apply()

# Explore jina.ia to get the extracted text: https://jina.ai/reader#demo

async def extract_linkedin_details(url):
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        
        print(f"Navigating to {url}")
        await page.goto(url)
        await page.wait_for_timeout(3000)  # Wait for 3 seconds

        print("Extracting visible text")
        text = await page.evaluate("() => document.body.innerText")

        print(f"Extracted {len(text)} characters of text")
        
        await browser.close()
        return text

# Usage
linkedin_url = "https://www.linkedin.com/company/hyertekinc"
text_content = asyncio.get_event_loop().run_until_complete(extract_linkedin_details(linkedin_url))
print(text_content + "...")  # Print first 200 characters

Navigating to https://www.linkedin.com/company/hyertekinc
Extracting visible text
Extracted 7122 characters of text
Skip to main content
LinkedIn
Articles
People
Learning
Jobs
Games
Join now
Sign in
HyerTek
IT Services and IT Consulting
Rockville, Maryland  1,871 followers
See jobs
Follow
  

View all 11 employees

About us

HyerTek provides expert technology solutions for federal government clients, with a strong emphasis on innovation, integration, and automation that help enhance citizens lives. 



Founded in 2016, HyerTek, specializes in providing innovative and customized technology solutions. Our business expertise is firmly planted in the areas of  data analytics and business intelligence, cloud computing, collaboration, human centered design, and technical training. 

Our team is dedicated to understanding the unique needs and goals of each project, and we approach every task with enthusiasm and care. We are skilled in developing solutions that not only meet the technical requ

In [2]:
import boto3
import os

# Initialize DynamoDB resource
dynamodb = boto3.resource(
    "dynamodb",
    aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY"),
    region_name="us-east-1",
)
table = dynamodb.Table("jobs")

# Scan the table to fetch the first page of jobs
response = table.scan()
all_jobs = response['Items']


print(f"Total jobs fetched: {len(all_jobs)}")

Total jobs fetched: 174


In [8]:
# Pretty print job title, company, and category for each job

filter_categories = [
    # 'Strong Potential', 
    'Ideal Match']
company_names = []
for job in all_jobs:
    category = job.get('category', 'N/A')
    if category in filter_categories:
        company_names.append(job.get('company', 'N/A'))


company_names


['VST Consulting, Inc',
 'RemoteWorker US',
 'ATPCO',
 'Utilidata',
 'GlobalBit',
 'Travelers Companies (Import)',
 'Global Payments (Beamery)',
 'Semrush',
 'Oracle',
 'Flock Safety',
 'Kaiser Permanente',
 'Oracle',
 'Confluent',
 'Laiba Technologies',
 'Mint Mobile',
 'KK Technologies',
 'Russell Tobin',
 'Avesta Computer Services',
 'Agility Robotics',
 'Hamilton Telecommunications',
 'Capital One',
 'RemoteWorker US',
 'MD Anderson Cancer Center',
 'Logistics Management Institute',
 'Payler',
 'eBay',
 'Credit Acceptance',
 'Cox Communications',
 'UST',
 'Companion Protect',
 'WEX Inc',
 'Select Source International',
 'eRay Technologies LLC',
 'Maximus, Inc.',
 'Paymentology',
 'Deel',
 'Pendulum™',
 'pro IT',
 'Talent Hunt Group',
 'Whatnot',
 'Intermountain Healthcare',
 'Credit Acceptance',
 'MOCA Systems, Inc.',
 'CELLA',
 'Trust & Will',
 'Dawar Consulting, Inc.',
 'Biblioso',
 'PublicSquare']

In [11]:
import requests
import urllib.parse

responses = []

# Process the first 5 companies
for company in company_names[:5]:
    company_query = urllib.parse.quote(f"{company} company")
    url = f"https://s.jina.ai/{company_query}"
    print(f"Searching for: {company}, Query: {company_query}")
    response = requests.get(url)
    responses.append(response.text)
    character_count = len(response.text)
    print(f"Company: {company}, Character Count: {character_count}")


Searching for: VST Consulting, Inc, Query: VST%20Consulting%2C%20Inc%20company
Company: VST Consulting, Inc, Character Count: 73615
Searching for: RemoteWorker US, Query: RemoteWorker%20US%20company
Company: RemoteWorker US, Character Count: 50834
Searching for: ATPCO, Query: ATPCO%20company
Company: ATPCO, Character Count: 155374
Searching for: Utilidata, Query: Utilidata%20company
Company: Utilidata, Character Count: 113945
Searching for: GlobalBit, Query: GlobalBit%20company
Company: GlobalBit, Character Count: 60004


In [45]:
import instructor
import litellm
from pydantic import BaseModel, Field
from typing import List, Optional

def track_cost_callback(
    kwargs,
    completion_response,
    start_time,
    end_time,
):
    import litellm

    try:
        response_cost = kwargs.get("response_cost", 0)
        duration = end_time - start_time

        print("-" * 40)
        print("LLM Response Summary:")
        print(f"Model: {kwargs['model']}")
        print(f"Cost: ${response_cost:.6f}")
        print(f"Duration: {duration.total_seconds():.2f} seconds")

        if (
            isinstance(
                completion_response,
                litellm.ModelResponse,
            )
            and "usage" in completion_response
        ):
            usage = completion_response["usage"]
            print("Token Usage:")
            print(f"  Completion: {usage.completion_tokens}")
            print(f"  Prompt: {usage.prompt_tokens}")
            print(f"  Total: {usage.total_tokens}")
        print("-" * 40)
    except:
        print("Error occurred while printing LLM response summary")




def litellm_completion(model, content, response_model):
    import instructor
    import litellm

    litellm.success_callback = [track_cost_callback]

    client = instructor.from_litellm(litellm.completion)

    return client.chat.completions.create(
        model=model,
        temperature=0,
        messages=[{"role": "user", "content": content}],
        response_model=response_model,
    )


In [50]:
def print_company_details(result):
    if result.company_overview:
        print("Company Overview:")
        for item in result.company_overview:
            print(f"  - {item}")

    if result.key_links:
        print("\nKey Links:")
        if result.key_links.official:
            print(f"  - Official Website: {result.key_links.official}")
        if result.key_links.linkedin:
            print(f"  - LinkedIn: {result.key_links.linkedin}")
        if result.key_links.other:
            for link in result.key_links.other:
                print(f"  - {link}")


    if result.business_description:
        print("\nBusiness Description:")
        for item in result.business_description:
            print(f"  - {item}")

    if result.products_services:
        print("\nProducts/Services:")
        for item in result.products_services:
            print(f"  - {item}")

    if result.history:
        print("\nHistory:")
        for item in result.history:
            print(f"  - {item}")

    if result.financials:
        print("\nFinancials:")
        for item in result.financials:
            print(f"  - {item}")

    if result.leadership:
        print("\nLeadership:")
        for item in result.leadership:
            print(f"  - {item}")

    if result.technology:
        print("\nTechnology:")
        for item in result.technology:
            print(f"  - {item}")

    if result.corporate_culture:
        print("\nCorporate Culture:")
        for item in result.corporate_culture:
            print(f"  - {item}")

    if result.key_locations:
        print("\nKey Locations:")
        for item in result.key_locations:
            print(f"  - {item}")



    print("\n" + "-"*40 + "\n")

In [53]:


class KeyLinks(BaseModel):
    official: str = Field(None, description="Official website of the company")
    linkedin: Optional[str] = Field(None, description="LinkedIn profile of the company")
    other: Optional[List[str]] = Field(None, description="Other relevant links")

class CompanyCategoryDetails(BaseModel):
    company_overview: List[str] = Field(None, description="Details about the company overview")
    key_links: KeyLinks = Field(..., description="Details about the company's key links")
    business_description: Optional[List[str]] = Field(None, description="Details about the business description")
    products_services: Optional[List[str]] = Field(None, description="Details about the products or services")
    key_locations: Optional[List[str]] = Field(None, description="Details about the company's key locations")
    history: Optional[List[str]] = Field(None, description="Details about the company's history")
    financials: Optional[List[str]] = Field(None, description="Details about the company's financials")
    leadership: Optional[List[str]] = Field(None, description="Details about the company's leadership")
    technology: Optional[List[str]] = Field(None, description="Details about the company's technology")
    corporate_culture: Optional[List[str]] = Field(None, description="Details about the company's corporate culture")

prompt = """Given a company description, please extract and organize the following information in a structured format. If any information is not available, skip that category.
                
                Company Overview:
                Full name: [Official company name]
                Industry: [Main industry sectors]
                Headquarters: [City, State/Province, Country]
                Founded: [Year]
                Company type: [e.g., Public, Private, Subsidiary]
                Number of employees: [Employee range or exact number if available]
                
                Key Links:
                [Official website]
                [LinkedIn profile]
                [Social media profiles]
                [Other relevant online resources]

                Business Description:
                [Brief overview of main business activities]
                [Key products or services]
                [Target customers or market]
                [Unique selling points or competitive advantages]

                Products/Services:
                [List main products or services offered]

                Key Locations:
                Headquarters: [Location]
                [Other significant office locations]

                History:
                [Key milestones, including founding date]
                [Major acquisitions or mergers]
                [Significant rebranding or restructuring events]

                Financials:
                Total funding raised: [Amount if available]
                Last funding round: [Type and date of most recent funding]
                [Any other relevant financial information]

                Leadership:
                CEO: [Name of current CEO]
                [Other key executive positions and names]

                Technology:
                [Key technologies used or developed by the company]
                [Digital platforms or apps]

                Corporate Culture:
                [Company's self-description of culture]
                [Key values or focus areas]


                Please organize the information clearly and concisely, using bullet points where appropriate. If certain information is not provided in the company description, mention that the detail should not be scraped.
    """



result_gpt_1  = litellm_completion("gpt-4o-mini", prompt + responses[0], CompanyCategoryDetails)
result_claude_1 = litellm_completion("claude-3-5-sonnet-20240620", prompt + responses[0], CompanyCategoryDetails)
print_company_details(result_gpt_1)
print_company_details(result_claude_1)

result_gpt_2  = litellm_completion("gpt-4o-mini", prompt + responses[1], CompanyCategoryDetails)
result_claude_2 = litellm_completion("claude-3-5-sonnet-20240620", prompt + responses[1], CompanyCategoryDetails)
print_company_details(result_gpt_2)
print_company_details(result_claude_2)

result_gpt_3  = litellm_completion("gpt-4o-mini", prompt + responses[2], CompanyCategoryDetails)
result_claude_3 = litellm_completion("claude-3-5-sonnet-20240620", prompt + responses[2], CompanyCategoryDetails)
print_company_details(result_gpt_3)
print_company_details(result_claude_3)


----------------------------------------
LLM Response Summary:
Model: gpt-4o-mini
Cost: $0.003266
Duration: 6.57 seconds
Token Usage:
  Completion: 274
  Prompt: 20678
  Total: 20952
----------------------------------------
----------------------------------------Company Overview:
  - VST Consulting Inc. is a leading information technology (IT) Services Company that specializes in providing complete consulting solutions and staff augmentation to its clients.
  - Winner of Inc.500/5000 fastest growing IT Company in America.

Key Links:
  - Official Website: http://www.vstconsulting.com/
  - Official Website: https://www.linkedin.com/company/vst-consulting-inc
  - http://www.vstconsulting.com/about.html
  - http://www.vstconsulting.com/solutions.html
  - http://www.vstconsulting.com/careers.html

Business Description:
  - VST Consulting Inc. provides on-time, on-budget, and quality service to its clients and consistently meets their expectations.
  - The company caters to IT application 

In [2]:
result_gpt_1

NameError: name 'result_gpt_1' is not defined

In [30]:

from pydantic import BaseModel, Field
from typing import List


class Category(BaseModel):
    category: str = Field(None, description="Arbitrary category")
    details: List[str] = Field(None, description="List of details under the category")

class CommpanyInfo(BaseModel):
    categories: List[Category] = Field(None, description="List of categories and details")




# Function to extract arbitrary company information from text
def extract_arbitrary_company_info(text):
    import instructor
    import litellm
    litellm.success_callback = [track_cost_callback]

    client = instructor.from_litellm(litellm.completion)
    print(f"Extracting arbitrary company information from text: {text[:100]}...")  # Log the start of the extraction
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{
                "role": "user",
                "content": f"""Given a company description, please extract and organize the information into arbitrary key categories. If any information is not available, skip that category.

                Text:
                {text}""",
        }],
        response_model=CommpanyInfo,
    )
    print("Extraction complete.")  # Log the completion of the extraction
    return response

# Process each job description, save the results, and display them with arbitrary categories and details as list items
arbitrary_results = []
for response_text in responses[:1]:
    print(f"Processing response text: {response_text[:100]}...")  # Log the start of processing each response
    company_info = extract_arbitrary_company_info(response_text)
    arbitrary_results.append(company_info)
    print("Processing complete.")  # Log the completion of processing each response

Processing response text: [1] Title: VST Consulting Inc
[1] URL Source: http://www.vstconsulting.com/
[1] Description: <strong...
Extracting arbitrary company information from text: [1] Title: VST Consulting Inc
[1] URL Source: http://www.vstconsulting.com/
[1] Description: <strong...
----------------------------------------Extraction complete.
Processing complete.

LLM Response Summary:
Model: gpt-4o-mini
Cost: $0.003201
Duration: 7.77 seconds
Token Usage:
  Completion: 304
  Prompt: 20121
  Total: 20425
----------------------------------------


In [34]:
for category in arbitrary_results[0].categories:
    print(f"Category: {category.category}")
    for detail in category.details:
        print(f"  - {detail}")

Category: Company Overview
  - VST Consulting Inc. is a leading IT Services Company that specializes in providing complete consulting solutions and staff augmentation to its clients. Established in 2002, it applies advanced technology and processes to solve business challenges.
  - We strive to offer our employees a professional, challenging, dynamic, and rewarding work environment that promotes professional and personal growth.
Category: Website
  - http://www.vstconsulting.com/
Category: Awards
  - Winner of Inc.500/5000 fastest growing IT Company in America for the last five years in a row.
  - Awarded NJ’s Fastest 50.
Category: Mission Statement
  - To be the number one choice of companies seeking technology solutions and to deliver a passionate commitment to clients' success.
Category: Core Values
  - Developing positive relationships with customers and employees.
Category: Industry Focus
  - Software Development
  - IT Consulting
Category: Size
  - Company size: 11-50 employees
C

In [5]:
import boto3
dynamodb = boto3.resource("dynamodb")
table = dynamodb.Table("companies")

key_schema = table.key_schema
key_names = [key['AttributeName'] for key in key_schema]

key_names, key_schema


(['name'], [{'AttributeName': 'name', 'KeyType': 'HASH'}])