In [1]:
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
from transformers import BertTokenizer
import os
from dotenv import load_dotenv
import re
import concurrent.futures
from happytransformer import HappyTextClassification
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
import textwrap

# Load environment variables from a local .env file (python-dotenv) before
# they are read below; the order matters.
load_dotenv()

# Credentials for the search API. os.getenv returns None when a variable is
# unset, so a missing .env entry only shows up later as a failed request.
# API_KEY is passed to bing_search as the subscription key.
API_KEY = os.getenv("API_KEY")
# CUSTOM_CONFIG_ID is passed to bing_search as the custom search configuration.
CUSTOM_CONFIG_ID = os.getenv("CUSTOM_CONFIG_ID")

def bing_search(query, api_key, custom_config_id, num_results=10, excluded_site='', timeout=10):
    """Query the Bing Custom Search endpoint and return the URLs of the web-page hits.

    Args:
        query: Search text.
        api_key: Sent as the ``Ocp-Apim-Subscription-Key`` request header.
        custom_config_id: Sent as the ``customconfig`` query parameter.
        num_results: Sent as the ``count`` query parameter.
        excluded_site: When non-empty, ``-site:<excluded_site>`` is appended
            to the query text.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot hang the notebook.

    Returns:
        A list of URL strings. An empty list is returned (after printing a
        message) when the request fails, the status is not 200, the body is
        not JSON, or there are no web-page results.
    """
    search_query = f"{query} -site:{excluded_site}" if excluded_site else query
    search_url = "https://api.bing.microsoft.com/v7.0/custom/search"
    headers = {'Ocp-Apim-Subscription-Key': api_key}
    # Every query-string value goes through `params` so that requests
    # percent-encodes it; characters such as '&', '#' or '+' in the query
    # would otherwise be read as URL syntax and corrupt the request.
    params = {
        'q': search_query,
        'customconfig': custom_config_id,
        'count': num_results,
    }
    try:
        response = requests.get(search_url, headers=headers, params=params, timeout=timeout)
    except requests.RequestException as e:
        print(f"Failed to retrieve search results: {e}")
        return []
    if response.status_code != 200:
        print(f"Failed to retrieve search results: {response.status_code}, {response.text}")
        return []
    try:
        payload = response.json()
    except ValueError as e:
        # A 200 response with a non-JSON body is treated like any other failure.
        print(f"Failed to parse search results: {e}")
        return []
    results = (payload.get('webPages') or {}).get('value') or []
    if not results:
        print("No results found.")
        return []
    urls = [result['url'] for result in results if 'url' in result]
    print(f"Found URLs: {urls}")
    return urls

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Text classifier shared by the cells below; process_and_classify_content
# calls happy_class.classify_text on each text fragment.
happy_class = HappyTextClassification(model_type="BERT", model_name="Vinoth24/environmental_claims")
# NOTE(review): `tokenizer` is not referenced by any cell visible here —
# confirm it is used elsewhere before keeping this load.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

05/12/2024 16:06:05 - INFO - happytransformer.happy_transformer -   Using device: cpu


In [3]:
def process_and_classify_content(content):
    """Return the fragments of `content` that the classifier labels "LABEL_1".

    Whitespace runs are collapsed to single spaces, the text is split into
    sentences, and each non-empty sentence is wrapped to 80 columns. Every
    wrapped line is classified on its own; lines labelled "LABEL_1" with a
    score above 0.5 are kept and returned joined by newlines (an empty string
    when nothing qualifies).
    """
    normalized = re.sub(r'\s+', ' ', content)
    kept = []
    for raw_sentence in sent_tokenize(normalized):
        trimmed = raw_sentence.strip()
        if not trimmed:
            continue
        # Classification is done per wrapped line, not per whole sentence.
        for fragment in textwrap.fill(trimmed, width=80).split('\n'):
            verdict = happy_class.classify_text(fragment)
            is_match = verdict.label == "LABEL_1" and verdict.score > 0.5
            if is_match:
                kept.append(' '.join(fragment.split()))
    return '\n'.join(kept)

def search_and_scrape(query, company_website, max_threads=10, output_path='scraped_data.txt'):
    """Search for `query`, fetch each hit, and write the classified text to a file.

    Args:
        query: Search text handed to bing_search.
        company_website: Domain passed to bing_search as `excluded_site`.
        max_threads: Worker count for the thread pool that fetches pages.
        output_path: File that receives the results. It is opened in write
            mode, so any existing content is replaced; it is created even
            when no URL yields output.

    Each URL whose fetched content produces non-empty classified text is
    written as a "URL: ... / Content: ..." record, with the text re-wrapped
    to 120 columns. Errors for an individual URL are printed and do not stop
    the remaining URLs from being processed.
    """
    urls = bing_search(query, API_KEY, CUSTOM_CONFIG_ID, excluded_site=company_website)
    # URLs containing any of these substrings are skipped.
    skip_markers = ('.xlsx', 'sitemap', '/download', 'List', 'list')
    urls_to_scrape = [url for url in urls if not any(marker in url for marker in skip_markers)]

    with open(output_path, 'w', encoding='utf-8') as file, \
            ThreadPoolExecutor(max_workers=max_threads) as executor:
        future_to_url = {executor.submit(fetch_content, url): url for url in urls_to_scrape}
        # No pause between results: every fetch is submitted above before this
        # loop starts, so sleeping here would not throttle any request and
        # would only delay the run.
        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                content = future.result()
                if not content:
                    continue
                processed_content = process_and_classify_content(content)
                if not processed_content:
                    continue
                wrapped_content = textwrap.fill(processed_content, width=120)
                file.write(f"URL: {url}\nContent:\n{wrapped_content}\n\n")
            except Exception as e:
                print(f"Error processing {url}: {e}")

def fetch_content(url, timeout=10):
    """Download `url` and return the text extracted from its HTML.

    Args:
        url: Address to fetch.
        timeout: Seconds passed to the HTTP request. Without a timeout a
            server that never responds would block its worker thread
            indefinitely.

    Returns:
        The result of BeautifulSoup's ``get_text()`` for a 200 response, or
        None when the status is anything else or when any exception
        (including a timeout) is raised; exceptions are printed, not
        propagated.
    """
    try:
        with requests.Session() as session:
            response = session.get(
                url,
                headers={'User-Agent': 'Mozilla/5.0'},
                timeout=timeout,
            )
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                return soup.get_text()
    except Exception as e:
        print(f"Error fetching {url}: {e}")
    return None

In [4]:
# Company to research and the domain to exclude from the search results.
# NOTE(review): the recorded run still returned http://mia-peru.com/ — confirm
# whether the excluded domain should be the hyphenated one.
company_name = "Mia Peru"
company_website = "miaperu.com"

# Build the query once so the printed text and the searched text cannot drift.
sustainability_query = f"intext:\"{company_name}\" company sustainability"
print(f"Searching: {sustainability_query}")
search_and_scrape(sustainability_query, company_website)

Searching: intext:"Mia Peru" company sustainability
Found URLs: ['https://www.facebook.com/MIAPERU/', 'https://www.commonobjective.co/mia-peru', 'https://connectamericas.com/company/mia-peru', 'https://intengine.com/directory/profile/90341-mia-peru', 'https://www.zoominfo.com/c/mia-peru-corp/452766959', 'https://www.solunacollective.com/pages/mia', 'https://connectamericas.com/es/company/mia-peru', 'https://www.dnb.com/business-directory/company-profiles.mia_peru_corp_sac.874975a5708368c5ee61eddcad36c2b2.html', 'https://www.instagram.com/mia_peru/p/CoP3ihiM2GH/', 'http://mia-peru.com/']


05/12/2024 16:06:06 - INFO - happytransformer.happy_transformer -   Moving model to cpu
05/12/2024 16:06:06 - INFO - happytransformer.happy_transformer -   Initializing a pipeline
