In [None]:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy.crawler import CrawlerProcess
from twisted.internet import reactor  # Import reactor for setting up a manual timeout

class SeriousEatsSpider(CrawlSpider):
    """Crawl seriouseats.com and yield one ``{'url': ...}`` item per distinct page.

    Links whose URL does not match ``/how-to/`` are followed, and every
    response reached through the rule is handed to ``parse_item``.

    The crawl stops on whichever comes first:

    * ``TIMEOUT_SECONDS`` after the spider object was created (close reason
      ``'timeout_reached'``), or
    * more than ``MAX_REQUESTS`` responses handled by ``parse_item`` (close
      reason ``'request_limit_reached'``).
    """

    name = 'aseriousEats'
    allowed_domains = ['seriouseats.com']
    start_urls = ['https://www.seriouseats.com/']
    request_count = 0  # Class-level default; reset per instance in __init__

    TIMEOUT_SECONDS = 600  # Wall-clock budget for one crawl
    MAX_REQUESTS = 80000   # Upper bound on responses handled by parse_item

    rules = (
        # Follow every extracted link except /how-to/ pages and pass each
        # fetched page to parse_item.
        Rule(LinkExtractor(deny=(r'/how-to/',)), follow=True, callback='parse_item'),
    )

    def __init__(self, *args, **kwargs):
        """Initialise per-crawl state and arm the timeout timer."""
        super().__init__(*args, **kwargs)
        self.request_count = 0
        self.seen_urls = set()
        # Keep the handle returned by callLater so the timer can be cancelled
        # when the spider closes for another reason. Without this, the pending
        # call could fire against an already-closed spider whenever the
        # reactor keeps running after the crawl.
        self._timeout_call = reactor.callLater(
            self.TIMEOUT_SECONDS, self.close_spider_due_to_timeout
        )

    def close_spider_due_to_timeout(self):
        """Timer callback: ask the engine to close this spider."""
        self.crawler.engine.close_spider(self, 'timeout_reached')

    def closed(self, reason):
        """Scrapy hook run when the spider closes; cancels a still-pending timer.

        Args:
            reason: close reason string supplied by Scrapy (unused here).
        """
        pending = getattr(self, '_timeout_call', None)
        if pending is not None and pending.active():
            pending.cancel()

    def parse_item(self, response):
        """Yield ``{'url': response.url}`` the first time a URL is seen.

        Also counts handled responses and closes the spider with reason
        ``'request_limit_reached'`` once the count exceeds ``MAX_REQUESTS``.
        """
        self.request_count += 1
        if self.request_count > self.MAX_REQUESTS:
            self.crawler.engine.close_spider(self, 'request_limit_reached')
            return

        url = response.url
        # Different requests can end up at the same final URL, so de-duplicate
        # on the response URL before emitting an item.
        if url not in self.seen_urls:
            self.seen_urls.add(url)
            yield {'url': url}

# Crawl configuration: browser-like user agent, CSV feed export to a local
# file, and DEBUG-level logging.
crawl_settings = dict(
    USER_AGENT='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3',
    FEED_FORMAT='csv',
    FEED_URI='C:/Users/Admin/Desktop/try_me/seriouseats_2.csv',
    LOG_LEVEL='DEBUG',
    # DEPTH_LIMIT=10,
)

# Build the process, register the spider class, then start the crawl.
process = CrawlerProcess(settings=crawl_settings)
process.crawl(SeriousEatsSpider)
process.start()





In [2]:
import pandas as pd

# Load the URL list from the CSV in the working directory
# (presumably the spider's feed export -- confirm the path matches FEED_URI).
csv_path = 'seriouseats_2.csv'
df = pd.read_csv(csv_path)

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd



In [3]:
#div comp structured-ingredients
#https://www.seriouseats.com/bun-cha-hanoi-recipe-8421208

Unnamed: 0,url
0,https://www.seriouseats.com/
1,https://www.seriouseats.com/vegetable-guides-5...
2,https://www.seriouseats.com/pantry-guides-5181287
3,https://www.seriouseats.com/equipment-5117081
4,https://www.seriouseats.com/noodle-guides-5117999


In [84]:
#Done Ingredients

import requests
from bs4 import BeautifulSoup
import re  # Import regular expressions

url = 'https://www.seriouseats.com/spaghetti-with-canned-clam-sauce'

# Defined before the request so the final print cannot raise NameError when
# the page is unavailable (the list used to exist only on HTTP 200).
formatted_ingredients = []

# requests has no default timeout; bound the wait so the cell cannot hang.
response = requests.get(url, timeout=30)
if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    # Each ingredient line is an <li class="structured-ingredients__list-item">
    ingredients_list = soup.find_all('li', class_='structured-ingredients__list-item')

    for item in ingredients_list:
        p_tag = item.find('p')
        if p_tag:  # Skip list items without a <p> child
            # Join the text of nested tags with single spaces
            ingredient_text = p_tag.get_text(" ", strip=True)
            # Normalise the gap between a number and a known unit to one space
            formatted_text = re.sub(r'(\d+)\s*(g|ml|ounces|cup|tablespoons|teaspoon|pounds|bunch)', r'\1 \2', ingredient_text, flags=re.I)
            # Separate any remaining digit that is glued directly to a letter
            formatted_text = re.sub(r'(\d+)([a-zA-Z])', r'\1 \2', formatted_text)
            formatted_ingredients.append(formatted_text)
else:
    print(f'Request failed with status {response.status_code}: {url}')

print(formatted_ingredients)

2024-02-15 21:12:27 [urllib3.connectionpool] DEBUG: Starting new HTTPS connection (1): www.seriouseats.com:443
2024-02-15 21:12:28 [urllib3.connectionpool] DEBUG: https://www.seriouseats.com:443 "GET /spaghetti-with-canned-clam-sauce HTTP/1.1" 200 None


['4 tablespoons (60 g) unsalted butter , cut into 4 pieces and divided', '2 celery ribs (5 oz; 140 g ), peeled (optional) and cut into small dice', '2 medium shallots ( 60 g ), minced', '1/2 cup ( 30 g ) finely chopped fresh parsley leaves , divided', '4 medium garlic cloves ( 15 g ), finely minced', '1 fresh green Thai chile ( 4 g ), thinly sliced (optional, see note)', 'Freshly ground black pepper', 'Two ( 8-ounce ; 237 ml) bottles clam juice', 'Two (6.5-ounce; 184 g ) cans chopped or minced clams , clams and liquid divided', '1 teaspoon (5 ml) soy sauce', '1 pound ( 450 g ) spaghetti', 'Kosher salt', 'Celery leaves (pale yellow-green leaves from the celery heart), for garnish', 'Lemon wedges , for serving']


In [None]:
#DONE Rating

import requests
from bs4 import BeautifulSoup

url = 'https://www.seriouseats.com/basque-cheesecake'

response = requests.get(url)
if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    # Container div for the aggregate star rating widget
    star_rating = soup.find(
        'div',
        class_='comp js-feedback-trigger aggregate-star-rating mntl-block',
    )

    rating_counter = 0

    if star_rating:  # Only compute a rating when the widget is present
        active_ratings = star_rating.find_all('a', class_='active')
        half_ratings = star_rating.find_all('a', class_='half')

        # One star per 'active' anchor, half a star per 'half' anchor
        rating_counter = len(active_ratings) + 0.5 * len(half_ratings)

        # The widget text is the review count wrapped in parentheses
        widget_text = star_rating.get_text(strip=True)
        num_reviews = widget_text.translate(str.maketrans('', '', '()'))
        overall_rating = [rating_counter, int(num_reviews)]



In [None]:
#DONE Title

url = 'https://www.seriouseats.com/basque-cheesecake'
response = requests.get(url)
if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    heading = soup.find('h1')
    # find() returns None when the page has no <h1>; guard so the cell
    # reports the missing title instead of raising AttributeError.
    title = heading.get_text(strip=True) if heading is not None else None
    if title is None:
        print(f'No <h1> found on {url}')
    else:
        print(title)


In [None]:
#DONE COOKING VALUES

url = 'https://www.seriouseats.com/basque-cheesecake'
response = requests.get(url)

recipe_time = []


def _span_text(node, css_class):
    """Return the stripped text of the first <span> with ``css_class`` under ``node``, or None."""
    span = node.find('span', class_=css_class)
    return span.get_text(strip=True) if span else None


if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    # The time and serving/yield details live in two sibling kinds of container
    times_container = soup.find_all('div', class_='project-meta__times-container')
    results_container = soup.find_all('div', class_='project-meta__results-container')

    # Walk the time containers first, then the results containers
    for container in [*times_container, *results_container]:
        for entry in container.find_all('div', class_='loc'):
            label = _span_text(entry, 'meta-text__label')
            value = _span_text(entry, 'meta-text__data')
            # Keep only entries where both parts are present and non-empty
            if not (label and value):
                continue
            recipe_time.append({label: value})

print(recipe_time)
    


    

In [None]:
#NUTRITION
