# **RateGain Code Rangers Hackathon**
- **Problem Statement: Web Scraping**
- **Team: Xampa**

## Importing Libraries

In [2]:
import requests #for making HTTP based requests
from bs4 import BeautifulSoup #for web scrapping
import csv ##for data management
import pandas as pd #for data manipulation and analysis

## Scrape Blog Item Function

In [3]:
def _child_text(parent, name):
    """Return the stripped text of the first ``name`` tag inside ``parent``.

    Returns an empty string when ``parent`` is None or contains no such tag.
    """
    if parent is None:
        return ""
    child = parent.find(name)
    return child.text.strip() if child is not None else ""


def _parse_likes(text):
    """Return the leading integer of ``text`` (e.g. ``'32 likes'`` -> 32).

    Returns 0 when the text is empty or does not start with an integer.
    """
    tokens = text.split()
    try:
        return int(tokens[0]) if tokens else 0
    except ValueError:
        return 0


# Function to scrape data from a list of blog items
def scrape_blog_item(blog_items):
    """Extract image URL, title, date and likes count from each blog item.

    Args:
        blog_items: iterable of parsed elements exposing ``find(name, class_=...)``
            (here: the ``article.blog-item`` tags found on a listing page).

    Returns:
        A list with one dict per item. Every dict always carries the keys
        'Image URL', 'Title', 'Date' and 'Likes Count'; a piece of markup that
        is missing yields '' (or 0 for the likes count) instead of a missing
        key or an exception, so downstream writers can rely on a fixed schema.
    """
    all_blog_data = []

    for item in blog_items:
        # Image URL: data-bg attribute of the link inside div.img
        img_div = item.find('div', class_='img')
        img_link = img_div.find('a') if img_div is not None else None
        try:
            image_url = img_link['data-bg'] if img_link is not None else ""
        except KeyError:
            # The link exists but carries no data-bg attribute
            image_url = ""

        # Likes count: leading number of the span inside a.zilla-likes
        likes_link = item.find('a', class_='zilla-likes')
        likes_span = likes_link.find('span') if likes_link is not None else None
        likes_count = _parse_likes(likes_span.text) if likes_span is not None else 0

        all_blog_data.append({
            'Image URL': image_url,
            'Title': _child_text(item.find('div', class_='content'), 'h6'),
            'Date': _child_text(item.find('div', class_='bd-item'), 'span'),
            'Likes Count': likes_count,
        })

    return all_blog_data

## Defining Constants and Variables

In [None]:
# Listing page of the blog to scrape
base_url = 'https://rategain.com/blog'

# Request headers carrying a desktop Chrome User-Agent string
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/109.0.0.0 Safari/537.36'
    ),
}

# Pagination counter (first page is 1) and the accumulator for scraped records
page_num, all_data = 1, []

## Handling Pagination

In [None]:
# Fetch and scrape data from all pages
# Seconds to wait for the server before giving up on a request
REQUEST_TIMEOUT_SECONDS = 30

# Reset the accumulators so that re-running this cell starts again from the
# first page instead of appending duplicates to a previous run's results.
all_data = []
page_num = 1

while True:
    # The first page lives at the base URL; later pages at <base>/page/<n>/
    current_url = f"{base_url}/page/{page_num}/" if page_num > 1 else base_url
    try:
        # Without a timeout a stalled connection would hang the notebook forever
        response = requests.get(current_url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
    except requests.RequestException as exc:
        print(f'Request for {current_url} failed: {exc}')
        break  # Keep whatever was scraped so far

    if response.status_code != 200:
        print(f'Request for {current_url} failed with status code: {response.status_code}')
        break  # Terminate the loop if the response status code is not 200

    print(f"Scraping {current_url}...")
    soup = BeautifulSoup(response.content, 'html.parser')

    blog_items = soup.find_all('article', class_='blog-item')
    if not blog_items:
        break  # A page without blog items marks the end of the listing

    scraped_data = scrape_blog_item(blog_items)
    all_data.extend(scraped_data)

    page_num += 1

Scraping https://rategain.com/blog...
Scraping https://rategain.com/blog/page/2/...
Scraping https://rategain.com/blog/page/3/...
Scraping https://rategain.com/blog/page/4/...
Scraping https://rategain.com/blog/page/5/...
Scraping https://rategain.com/blog/page/6/...
Scraping https://rategain.com/blog/page/7/...
Scraping https://rategain.com/blog/page/8/...
Scraping https://rategain.com/blog/page/9/...
Scraping https://rategain.com/blog/page/10/...
Scraping https://rategain.com/blog/page/11/...
Scraping https://rategain.com/blog/page/12/...
Scraping https://rategain.com/blog/page/13/...
Scraping https://rategain.com/blog/page/14/...
Scraping https://rategain.com/blog/page/15/...
Scraping https://rategain.com/blog/page/16/...
Scraping https://rategain.com/blog/page/17/...
Scraping https://rategain.com/blog/page/18/...
Scraping https://rategain.com/blog/page/19/...
Scraping https://rategain.com/blog/page/20/...
Scraping https://rategain.com/blog/page/21/...
Scraping https://rategain.com/

## Data Management

### Save to CSV File

In [None]:
# Save scraped data to a CSV file
csv_filename = 'blog_data.csv'
with open(csv_filename, mode='w', newline='', encoding='utf-8') as file:
    fieldnames = ['Title', 'Date', 'Image URL', 'Likes Count']
    # restval fills any column a record lacks with '' instead of raising KeyError;
    # extrasaction='ignore' keeps only the four listed columns.
    writer = csv.DictWriter(file, fieldnames=fieldnames, restval='', extrasaction='ignore')

    writer.writeheader()
    writer.writerows(all_data)


### Save to Excel File

In [None]:
!pip install xlwt



In [None]:
# Write the scraped records to an .xlsx workbook with a fixed column order
excel_filename = 'blog_data.xlsx'
column_order = ['Title', 'Date', 'Image URL', 'Likes Count']
df = pd.DataFrame(all_data, columns=column_order)
df.to_excel(excel_filename, index=False, columns=column_order)

### Display of Data

In [None]:
# Load the CSV file back into a DataFrame
df1 = pd.read_csv(filepath_or_buffer="blog_data.csv")

In [None]:
# Display the DataFrame loaded from the CSV file
df1

Unnamed: 0,Title,Date,Image URL,Likes Count
0,A Complete Guide to Hotel Revenue Management,"November 22, 2023",https://rategaincom.wpenginepowered.com/wp-con...,1
1,The Ultimate Guide to Choosing the Right Hotel...,"November 22, 2023",https://rategaincom.wpenginepowered.com/wp-con...,2
2,Maximize Your Hotel’s Exposure with Google AdS...,"October 20, 2023",https://rategaincom.wpenginepowered.com/wp-con...,32
3,Beyond Reach & Frequency: Hotels' New Era with...,"October 12, 2023",https://rategaincom.wpenginepowered.com/wp-con...,16
4,Managing Overbookings and Cancellations with H...,"October 5, 2023",https://rategaincom.wpenginepowered.com/wp-con...,11
...,...,...,...,...
396,3 Reasons to visit RateGain at ITB Berlin,"February 12, 2015",,1
397,Top 3 Tips to Price Your Hotel Rooms Right in ...,"February 12, 2015",,2
398,Europe's Outlook for 2015: What it means for y...,"February 9, 2015",,5
399,RateGain Announces Merril Yu and Sunish Sadasi...,"February 6, 2015",,5


In [None]:
# Load the Excel workbook back into a DataFrame
df2 = pd.read_excel(io='blog_data.xlsx')

In [None]:
# Display the DataFrame loaded from the Excel workbook
df2

Unnamed: 0,Title,Date,Image URL,Likes Count
0,A Complete Guide to Hotel Revenue Management,"November 22, 2023",https://rategaincom.wpenginepowered.com/wp-con...,1
1,The Ultimate Guide to Choosing the Right Hotel...,"November 22, 2023",https://rategaincom.wpenginepowered.com/wp-con...,2
2,Maximize Your Hotel’s Exposure with Google AdS...,"October 20, 2023",https://rategaincom.wpenginepowered.com/wp-con...,32
3,Beyond Reach & Frequency: Hotels' New Era with...,"October 12, 2023",https://rategaincom.wpenginepowered.com/wp-con...,16
4,Managing Overbookings and Cancellations with H...,"October 5, 2023",https://rategaincom.wpenginepowered.com/wp-con...,11
...,...,...,...,...
396,3 Reasons to visit RateGain at ITB Berlin,"February 12, 2015",,1
397,Top 3 Tips to Price Your Hotel Rooms Right in ...,"February 12, 2015",,2
398,Europe's Outlook for 2015: What it means for y...,"February 9, 2015",,5
399,RateGain Announces Merril Yu and Sunish Sadasi...,"February 6, 2015",,5
