## Web Scraping

In [1]:
from urllib.parse import urljoin  # resolves relative "next" links against the current page URL

import pandas as pd
import requests   # Python library that lets us interact with websites
from bs4 import BeautifulSoup  # class from bs4 library

# Function to fetch and parse HTML content of the page
def get_page_content(url, timeout=10):
    """Download ``url`` and return its HTML parsed into a BeautifulSoup object.

    Parameters
    ----------
    url : str
        Address of the page to fetch with an HTTP GET request.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed straight to ``requests.get``.
        requests never times out on its own, so without this a stalled
        server would hang the notebook indefinitely.

    Returns
    -------
    BeautifulSoup
        The response body parsed with the built-in ``html.parser``.

    Raises
    ------
    Exception
        If the response status code is anything other than 200.
    requests.exceptions.RequestException
        Propagated from requests on connection failures or when ``timeout`` expires.
    """
    response = requests.get(url, timeout=timeout)   # sends an HTTP GET request to the URL

    # Guard clause: only a plain 200 response is treated as success
    if response.status_code != 200:
        raise Exception(f"Failed to fetch page. Status Code: {response.status_code}")

    # Turn the HTML string into a searchable Python object
    return BeautifulSoup(response.text, 'html.parser')

# Function to extract quote details (Author, Quote Text, Tags)
def get_quotes_from_page(soup):
    """Return one row per (quote, tag) pair found on a parsed page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of a single page; searched for ``<div class="quote">`` blocks.

    Returns
    -------
    list of dict
        Each dict has the keys ``'Author'``, ``'Quote'`` and ``'Tag'``. A quote
        with several tags is repeated once per tag. A quote with no tags is
        kept as a single row whose ``'Tag'`` is ``None``.

    Notes
    -----
    Every quote block is expected to contain a ``<span class="text">`` and a
    ``<small class="author">`` element; if either lookup comes back empty the
    ``.text`` access fails with ``AttributeError``.
    """
    rows = []

    for block in soup.find_all('div', class_='quote'):
        # Strip surrounding whitespace from the scraped text
        text = block.find('span', class_='text').text.strip()
        author = block.find('small', class_='author').text.strip()
        tags = [tag.text.strip() for tag in block.find_all('a', class_='tag')]

        # Instead of joining tags, emit one row per tag. Falling back to [None]
        # keeps untagged quotes in the data instead of silently dropping them.
        for tag in tags or [None]:
            rows.append({
                'Author': author,
                'Quote': text,
                'Tag': tag
            })

    return rows


# Main function to iterate over all pages and collect data
def scrape_quotes(base_url="http://quotes.toscrape.com", start_path="/page/1/"):
    """Follow the "next" links from the first page and collect every quote row.

    Parameters
    ----------
    base_url : str, optional
        Root address of the site to scrape.
    start_path : str, optional
        Path (or URL) of the first page, resolved against ``base_url``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Author``, ``Quote`` and ``Tag`` with the rows returned by
        ``get_quotes_from_page`` for every visited page. The columns are
        present even when no rows were scraped.

    Raises
    ------
    Exception
        Whatever ``get_page_content`` raises when a page cannot be fetched.
    """
    all_quotes = []
    visited = set()   # pages already fetched, so a looping "next" link cannot trap us
    page_url = urljoin(base_url, start_path)

    # while loop because we don't know the number of pages in advance
    while page_url and page_url not in visited:
        visited.add(page_url)
        print(f"Scraping {page_url}...")
        soup = get_page_content(page_url)
        all_quotes.extend(get_quotes_from_page(soup))   # add quotes from every page

        # The website shows a "next" button on every page except the last one
        next_item = soup.find('li', class_='next')
        next_anchor = next_item.a if next_item is not None else None
        href = next_anchor.get('href') if next_anchor is not None else None

        # urljoin copes with both relative and absolute hrefs; a missing link ends the loop
        page_url = urljoin(page_url, href) if href else None

    # Fix the columns so later cells can rely on them even if nothing was scraped
    return pd.DataFrame(all_quotes, columns=['Author', 'Quote', 'Tag'])

# Collect the quotes from every page into a single DataFrame
df = scrape_quotes()

# Preview the first five rows as plain text
print(df.head(n=5))


Scraping http://quotes.toscrape.com/page/1/...
Scraping http://quotes.toscrape.com/page/2/...
Scraping http://quotes.toscrape.com/page/3/...
Scraping http://quotes.toscrape.com/page/4/...
Scraping http://quotes.toscrape.com/page/5/...
Scraping http://quotes.toscrape.com/page/6/...
Scraping http://quotes.toscrape.com/page/7/...
Scraping http://quotes.toscrape.com/page/8/...
Scraping http://quotes.toscrape.com/page/9/...
Scraping http://quotes.toscrape.com/page/10/...
            Author                                              Quote  \
0  Albert Einstein  “The world as we have created it is a process ...   
1  Albert Einstein  “The world as we have created it is a process ...   
2  Albert Einstein  “The world as we have created it is a process ...   
3  Albert Einstein  “The world as we have created it is a process ...   
4     J.K. Rowling  “It is our choices, Harry, that show what we t...   

             Tag  
0         change  
1  deep-thoughts  
2       thinking  
3          wor

In [2]:
# First five scraped rows, shown with the notebook's rich table display
df.head(n=5)

Unnamed: 0,Author,Quote,Tag
0,Albert Einstein,“The world as we have created it is a process ...,change
1,Albert Einstein,“The world as we have created it is a process ...,deep-thoughts
2,Albert Einstein,“The world as we have created it is a process ...,thinking
3,Albert Einstein,“The world as we have created it is a process ...,world
4,J.K. Rowling,"“It is our choices, Harry, that show what we t...",abilities


In [3]:
import regex as re
# Curly quotes were causing errors in SQL, so replace them with straight double quotes.
# The vectorised .str accessor leaves missing values untouched, whereas a per-row re.sub would raise on them.
df['Quote'] = df['Quote'].str.replace(r'[“”]', '"', regex=True)

In [4]:
# Check the first five rows after the curly quotes were replaced
df.head(n=5)

Unnamed: 0,Author,Quote,Tag
0,Albert Einstein,"""The world as we have created it is a process ...",change
1,Albert Einstein,"""The world as we have created it is a process ...",deep-thoughts
2,Albert Einstein,"""The world as we have created it is a process ...",thinking
3,Albert Einstein,"""The world as we have created it is a process ...",world
4,J.K. Rowling,"""It is our choices, Harry, that show what we t...",abilities


In [6]:
# Character count of each quote (the surrounding quotation marks are included)
quote_lengths = df['Quote'].str.len()
df['length_of_quote'] = quote_lengths

# Preview the frame with the new column
df.head(n=5)

Unnamed: 0,Author,Quote,Tag,length_of_quote
0,Albert Einstein,"""The world as we have created it is a process ...",change,115
1,Albert Einstein,"""The world as we have created it is a process ...",deep-thoughts,115
2,Albert Einstein,"""The world as we have created it is a process ...",thinking,115
3,Albert Einstein,"""The world as we have created it is a process ...",world,115
4,J.K. Rowling,"""It is our choices, Harry, that show what we t...",abilities,85
