In [1]:
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
from transformers import BertTokenizer
import os
from dotenv import load_dotenv
import re
import concurrent.futures
from happytransformer import HappyTextClassification
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
import textwrap

# Populate os.environ from a local .env file (python-dotenv) so the
# credentials below never have to be hardcoded in the notebook.
load_dotenv()

# Access environment variables
# Both values are forwarded to google_search() as its api_key / cse_id
# arguments. os.getenv returns None when a variable is unset, so a missing
# .env entry only surfaces later as a failed search request.
API_KEY = os.getenv("API_KEY")
CSE_ID = os.getenv("CSE_ID")

# Function to perform a Google search and return a list of URLs
def google_search(query, api_key, cse_id, num_results=10, excluded_site=''):
    """Run a Google Custom Search query and return the result URLs.

    Args:
        query: Search terms, passed to the API as the ``q`` parameter.
        api_key: Google API key (sent as ``key``).
        cse_id: Custom Search Engine identifier (sent as ``cx``).
        num_results: Number of results requested (sent as ``num``).
            NOTE(review): the Custom Search JSON API documents an upper
            bound of 10 for ``num`` - confirm before raising the default.
        excluded_site: Optional domain; when non-empty, `` -site:<domain>``
            is appended to the query to exclude that site's pages.

    Returns:
        list[str]: The ``link`` of every returned item; an empty list when
        the response JSON has no ``items`` key.

    Raises:
        requests.RequestException: On connection failure or timeout.
        ValueError: If the response body is not valid JSON.
    """
    search_query = f"{query} -site:{excluded_site}" if excluded_site else query
    # Let requests build the query string: characters such as '&', '#', '+'
    # or '%' in the query would otherwise corrupt the URL.
    response = requests.get(
        "https://www.googleapis.com/customsearch/v1",
        params={
            "q": search_query,
            "key": api_key,
            "cx": cse_id,
            "num": num_results,
        },
        timeout=10,  # never hang the notebook on a stalled connection
    )
    results = response.json().get('items', [])
    # Skip any item without a 'link' instead of raising KeyError.
    urls = [result['link'] for result in results if 'link' in result]
    print(urls)
    return urls

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# BERT-based text classifier loaded through Happy Transformer; this is the
# object process_and_classify_content() calls classify_text() on.
happy_class = HappyTextClassification(model_type="BERT", model_name="Vinoth24/environmental_claims")
# NOTE(review): `tokenizer` is not referenced by any cell visible here -
# confirm whether it is still needed before keeping the extra model load.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

05/09/2024 18:24:06 - INFO - happytransformer.happy_transformer -   Using device: cpu


In [3]:
def process_and_classify_content(content, label="LABEL_1", threshold=0.5, width=80, classifier=None):
    """Return the fragments of ``content`` that the classifier accepts.

    The text is whitespace-normalised, split into sentences, and each
    sentence is wrapped into lines of at most ``width`` characters. Every
    wrapped line is classified individually and kept when its predicted
    label equals ``label`` with a score strictly above ``threshold``.

    Args:
        content: Raw text to analyse. Empty or ``None`` yields ``''``.
        label: Label a line must receive to be kept. Presumably the
            positive class of the loaded model - verify against the
            model card.
        threshold: Exclusive lower bound on the classification score.
        width: Maximum length of each classified line.
        classifier: Object exposing ``classify_text(str)`` whose result has
            ``label`` and ``score`` attributes. Defaults to the
            module-level ``happy_class``.

    Returns:
        str: The accepted lines joined by newlines, or ``''`` if none.
    """
    # Nothing to tokenize; also avoids a TypeError from re.sub on None.
    if not content:
        return ''
    if classifier is None:
        classifier = happy_class

    normalized = re.sub(r'\s+', ' ', content)
    accepted = []
    for sentence in sent_tokenize(normalized):
        sentence = sentence.strip()
        if not sentence:
            continue
        # Lines are classified one by one, so long sentences are judged
        # in pieces of at most `width` characters.
        for line in textwrap.wrap(sentence, width=width):
            result = classifier.classify_text(line)
            if result.label == label and result.score > threshold:
                accepted.append(' '.join(line.split()))
    return '\n'.join(accepted)

def search_and_scrape(query, company_website, max_threads=10,
                      output_path='scraped_data.txt', pause_seconds=0.5):
    """Search, download the result pages, and save the accepted text.

    Runs ``google_search`` with ``company_website`` excluded, drops URLs
    that contain one of a few unwanted substrings, fetches the remaining
    pages concurrently, classifies each page's text, and writes the
    accepted text for each page to ``output_path``.

    Args:
        query: Search query passed to ``google_search``.
        company_website: Domain to exclude from the search results.
        max_threads: Worker-thread count for the concurrent page fetches.
        output_path: File to (over)write with the results, UTF-8 encoded.
        pause_seconds: Delay after each completed page is handled. All
            fetches are submitted up front, so this only paces the
            classification/writing loop; pass 0 to disable it.

    Returns:
        None. Results are written to ``output_path``; per-URL failures are
        printed and do not stop the remaining pages.
    """
    # Substring match is case-sensitive, hence both 'List' and 'list'.
    skipped_substrings = ('.xlsx', 'sitemap', '/download', 'List', 'list')

    urls = google_search(query, API_KEY, CSE_ID, excluded_site=company_website)
    urls_to_scrape = [
        url for url in urls
        if not any(substring in url for substring in skipped_substrings)
    ]

    with open(output_path, 'w', encoding='utf-8') as file:
        with ThreadPoolExecutor(max_workers=max_threads) as executor:
            future_to_url = {executor.submit(fetch_content, url): url for url in urls_to_scrape}
            # Pages are handled in completion order, not search-rank order.
            for future in as_completed(future_to_url):
                url = future_to_url[future]
                try:
                    content = future.result()
                    if content:
                        processed_content = process_and_classify_content(content)
                        if processed_content:
                            # fill() re-flows the newline-separated lines
                            # into one paragraph wrapped at 120 columns.
                            wrapped_content = textwrap.fill(processed_content, width=120)
                            file.write(f"URL: {url}\nContent:\n{wrapped_content}\n\n")
                except Exception as e:
                    print(f"Error processing {url}: {e}")
                if pause_seconds > 0:
                    time.sleep(pause_seconds)

def fetch_content(url, timeout=10):
    """Download ``url`` and return the page's visible text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            single unresponsive host cannot block a worker thread forever.

    Returns:
        str | None: Text extracted by BeautifulSoup's ``html.parser`` when
        the response status is 200; ``None`` on any other status or on any
        error (the error is printed, never raised).
    """
    try:
        response = requests.get(
            url,
            headers={'User-Agent': 'Mozilla/5.0'},
            timeout=timeout,
        )
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, 'html.parser')
            return soup.get_text()
    except Exception as e:
        # Best-effort scraping: report the failure and let the caller skip
        # this URL.
        print(f"Error fetching {url}: {e}")
    return None


In [4]:
company_name = "nike"
company_website = "nike.com"  # Works best as a bare domain: omit "https://" and "www."
# Build the query once so the logged text and the executed search cannot
# drift apart.
search_query = f"intext:\"{company_name}\" company sustainability"
print(f"Searching: {search_query}")
search_and_scrape(search_query, company_website)

Searching: intext:"nike" company sustainability
['https://sustainabilitymag.com/articles/nike-making-strides-towards-net-zero-with-sustainable-foam', 'https://www.weavabel.com/blog/is-nike-sustainable-focusing-on-a-brighter-future', 'https://www.dazeddigital.com/fashion/article/52679/1/what-you-need-to-know-about-nike-sustainability-goals-2021-microsite', 'https://goodonyou.eco/how-ethical-is-nike/', 'https://directory.goodonyou.eco/brand/nike', 'https://www.eco-stylist.com/is-nike-sustainable-how-ethical-is-nike-full-sustainability-rating/', 'https://www.businessinsider.com/nike-behind-on-environmental-goals-popularity-leather-jordans-dunks-airforce-2022-7', 'https://fashinza.com/sustainability/learn/what-nike-is-doing-to-become-more-sustainable-in-2023/', 'https://www.fastcompany.com/91072898/nike-was-accused-of-greenwashing-a-judge-disagrees', 'https://www.cnbc.com/2020/02/05/nike-ceo-john-donahoe-shoppers-care-about-sustainabilty.html']


05/09/2024 18:24:07 - INFO - happytransformer.happy_transformer -   Moving model to cpu
05/09/2024 18:24:07 - INFO - happytransformer.happy_transformer -   Initializing a pipeline
