In [1]:
import concurrent.futures
import os
import re
import textwrap

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from happytransformer import HappyTextClassification
from nltk.tokenize import sent_tokenize
from transformers import BertTokenizer

# Load variables from a .env file (python-dotenv) into the process environment.
load_dotenv()

# Credentials passed to google_search(); each is None when its variable is unset.
API_KEY = os.environ.get("API_KEY")
CSE_ID = os.environ.get("CSE_ID")

def google_search(query, api_key, cse_id, num_results=10, timeout=10):
    """Run a Google Custom Search and return the result URLs.

    Args:
        query: Search string. It is sent as a request parameter, so characters
            such as ``&``, ``#``, ``+`` or quotes are percent-encoded instead
            of corrupting the URL.
        api_key: Google API key.
        cse_id: Custom Search Engine identifier (the ``cx`` parameter).
        num_results: Number of results to request (the ``num`` parameter).
        timeout: Seconds to wait for the API before ``requests`` gives up;
            without a timeout the call could block indefinitely.

    Returns:
        A list of result links. Empty when the API answers with a non-200
        status or the response carries no ``items``.
    """
    response = requests.get(
        "https://www.googleapis.com/customsearch/v1",
        params={"q": query, "key": api_key, "cx": cse_id, "num": num_results},
        timeout=timeout,
    )
    if response.status_code != 200:
        # Report the failure rather than silently looking like "no results".
        print(f"Google search failed with HTTP status {response.status_code}")
        return []

    results = response.json().get('items', [])
    # Skip any item that has no link instead of raising KeyError.
    return [result['link'] for result in results if 'link' in result]

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Hugging Face checkpoint ids. The first is presumably a BERT model fine-tuned
# to detect environmental claims (inferred from its name -- confirm on the hub).
CLAIMS_MODEL_NAME = "Vinoth24/environmental_claims"
BERT_VOCAB_NAME = "bert-base-uncased"

# Sentence classifier used by process_and_classify_content().
happy_class = HappyTextClassification(model_name=CLAIMS_MODEL_NAME, model_type="BERT")
# NOTE(review): this tokenizer is not referenced by any cell shown here.
tokenizer = BertTokenizer.from_pretrained(BERT_VOCAB_NAME)

03/19/2024 15:56:40 - INFO - happytransformer.happy_transformer -   Using device: cpu


In [3]:
def _is_scrapable(url, company_website, excluded_substrings=('.xlsx', 'sitemap', '/download', 'list')):
    """Return True when ``url`` is off the company's own site and not an excluded page type."""
    url_lower = url.lower()
    site = (company_website or '').strip().lower()
    # An empty site string is a substring of every URL; treat it as "no site filter"
    # instead of silently discarding all results.
    if site and site in url_lower:
        return False
    # Case-insensitive on purpose, so 'List', 'LIST' and '.XLSX' are all caught.
    # NOTE(review): plain substring matching also drops URLs such as ".../minimalist".
    return not any(fragment in url_lower for fragment in excluded_substrings)


def search_and_scrape(query, company_website, output_path='scraped_data.txt'):
    """Search Google, scrape third-party result pages and save the classified sentences.

    Args:
        query: Search string forwarded to ``google_search``.
        company_website: Domain fragment (e.g. ``"nike.com"``); result URLs
            containing it are skipped so only third-party pages are scraped.
            An empty value disables this filter.
        output_path: File that receives the results. It is overwritten on
            every call. Defaults to the previously hard-coded
            ``'scraped_data.txt'``.

    Returns:
        None. For each page that yields classified text, a block of the form
        ``URL: <url>\\nContent:\\n<sentences>\\n\\n`` is written to ``output_path``.
    """
    urls = google_search(query, API_KEY, CSE_ID)
    urls_to_scrape = [url for url in urls if _is_scrapable(url, company_website)]

    with open(output_path, 'w', encoding='utf-8') as file:
        # Pages are downloaded concurrently; classification and file writes happen
        # in this thread as each download finishes, so completion order decides
        # the order of entries in the file.
        with concurrent.futures.ThreadPoolExecutor() as executor:
            future_to_url = {executor.submit(fetch_content, url): url for url in urls_to_scrape}
            for future in concurrent.futures.as_completed(future_to_url):
                url = future_to_url[future]
                print(f"Processing: {url}")
                try:
                    content = future.result()
                    if not content:
                        continue
                    processed_content = process_and_classify_content(content)
                    if processed_content:
                        file.write(f"URL: {url}\nContent:\n{processed_content}\n\n")
                except Exception as e:
                    # fetch_content handles its own download errors, so what lands
                    # here is a classification or write failure for this page.
                    # Report it and keep going with the remaining pages.
                    print(f"Error processing {url}: {e}")

def process_and_classify_content(content, max_chunk_chars=512, min_score=0.5, positive_label="LABEL_1"):
    """Return the sentences of ``content`` that the classifier labels as positive.

    The text is whitespace-normalised, split into sentences with
    ``sent_tokenize`` and each sentence is passed to ``happy_class``.
    Sentences longer than ``max_chunk_chars`` are split on word boundaries
    so that no word is cut in half before classification.

    Args:
        content: Raw page text. ``None`` or an empty string yields ``''``.
        max_chunk_chars: Maximum number of characters per classified chunk.
        min_score: A chunk is kept only when its score is strictly greater
            than this value.
        positive_label: Classifier label that marks a chunk as a hit
            (presumably "environmental claim" for this model -- confirm
            against the model card).

    Returns:
        The kept chunks joined by newlines, or ``''`` when nothing qualifies.
    """
    if not content:
        return ''

    # Collapse every run of whitespace (including newlines) to a single space.
    content = re.sub(r'\s+', ' ', content)

    classified_sentences = []
    for sentence in sent_tokenize(content):
        sentence = sentence.strip()
        if not sentence:
            continue

        if len(sentence) > max_chunk_chars:
            # Break at spaces only; a single word longer than the limit is
            # still hard-split by textwrap so the limit always holds.
            chunks = textwrap.wrap(sentence, width=max_chunk_chars, break_on_hyphens=False)
        else:
            chunks = [sentence]

        for chunk in chunks:
            result = happy_class.classify_text(chunk)
            if result.label == positive_label and result.score > min_score:
                # Defensive re-normalisation of inner and outer whitespace.
                classified_sentences.append(' '.join(chunk.split()))
    return '\n'.join(classified_sentences)

def fetch_content(url, timeout=10):
    """Download ``url`` and return the visible text of the page.

    Args:
        url: Page to fetch.
        timeout: Seconds to wait for the server. Without a timeout a stalled
            server would block the calling worker thread indefinitely.

    Returns:
        The page text with ``<script>`` and ``<style>`` contents removed, or
        ``None`` when the response status is not 200 or any error occurs
        (the error is printed, never raised).
    """
    try:
        # A one-off request: a per-call Session would add no connection reuse.
        response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=timeout)
        if response.status_code != 200:
            return None

        soup = BeautifulSoup(response.text, 'html.parser')
        # get_text() would otherwise include JavaScript and CSS source,
        # which would then be fed to the sentence classifier.
        for tag in soup.find_all(['script', 'style']):
            tag.decompose()
        return soup.get_text()
    except Exception as e:
        # Best-effort scraping: report the failure and let the caller move on.
        print(f"Error fetching {url}: {e}")
        return None

In [4]:
company_name = "nike"
# Works best as a bare domain, i.e. without the "https://" and "www." prefixes.
company_website = "nike.com"

# Build the query once so the logged text and the executed search cannot drift apart.
search_query = f"intext:\"{company_name}\" company sustainability"
print(f"Searching: {search_query}")
search_and_scrape(search_query, company_website)

Searching: intext:"nike" company sustainability


03/19/2024 15:56:40 - INFO - happytransformer.happy_transformer -   Moving model to cpu
03/19/2024 15:56:40 - INFO - happytransformer.happy_transformer -   Initializing a pipeline


Processing: https://sustainabilitymag.com/articles/nike-making-strides-towards-net-zero-with-sustainable-foam
Processing: https://goodonyou.eco/how-ethical-is-nike/
Processing: https://www.dazeddigital.com/fashion/article/52679/1/what-you-need-to-know-about-nike-sustainability-goals-2021-microsite
Processing: https://www.weavabel.com/blog/is-nike-sustainable-focusing-on-a-brighter-future
Processing: https://directory.goodonyou.eco/brand/nike
