This code automates the collection and classification of information and images from the official website of a hotel. Using DuckDuckGo to find the hotel's main page, it extracts essential data such as address, description, contact details, services, and GPS coordinates. The description is summarized using the T5 Transformer model, and all images on the site are downloaded and classified using a Convolutional Neural Network (CNN) model. The classified images are then organized and stored locally based on their categories.

To meet the requirements, it is necessary to create an input folder and an output folder and to integrate the CNN model, as well as to ensure that the output folder contains one subfolder for each of the specified categories (activities, conference, hotel, pool, restaurant, room, sightseeing, spa).

The purpose of this project is purely educational, to understand the process of web scraping and image classification. It is very likely that the images may not be classified very accurately as the training capacity of the model has been limited in terms of data.

For better accuracy, we encourage you to use the provided model and adjust its performance (CNN structure and training dataset).

Credit: Poață Andrei Cătălin (UNSTPB, Artificial Intelligence master), Ionuț Vișan (UNSTPB, Artificial Intelligence master)

!! We do not encourage the use of web scraping techniques that violate the terms and conditions of websites.

Imports

In [None]:
!pip install pycountry

In [None]:
import warnings
import torch
import os
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import torch.nn.functional as functional
from sklearn.preprocessing import LabelEncoder
import shutil
import re
from transformers import T5Tokenizer, T5ForConditionalGeneration
import pprint
import requests
from PIL import Image
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urlunparse, urljoin
from collections import Counter
import time
import pycountry

Constants

You need to complete:

HOTEL_NAME

CITY

By entering this data, you will be able to search for the hotel's website and receive all the information mentioned below.

In [None]:
# Class index -> category name; used to interpret the class index predicted by the CNN
num_to_label = dict(enumerate([
    'activities',
    'conference',
    'hotel',
    'pool',
    'restaurant',
    'room',
    'sightseeing',
    'spa',
]))
# Reverse lookup: category name -> class index
label_to_num = {label: num for num, label in num_to_label.items()}

# One data folder per category
DATA_PATHS = [f"data/{label}" for label in num_to_label.values()]
# File holding the trained CNN weights
MODEL_PATH = './cnn.pth'
# Folder receiving the downloaded images / folder receiving the classified images
INPUT_PATH = "input"
OUTPUT_PATH = "output"
# Placeholders to be replaced with the hotel to look up
HOTEL_NAME = "xxxx"
CITY = "xxxx"

In [None]:
Returns:
- Homepage
- Address
- Description
- Email
- Latitude
- Longitude
- Phone
- Services
- Summary (using T5 Transformer)
- All the links inside the page
- All the images inside the page
- Images classified using CNN model

In [None]:
def get_hotel_website_duckduckgo(hotel_name, city):
    """Search DuckDuckGo for the hotel's own website and return its root URL.

    Args:
        hotel_name: Name of the hotel. Each whitespace-separated word of the
            name is also used as a keyword that a candidate URL must contain.
        city: City appended to the search query.

    Returns:
        ``scheme://netloc`` of the first result link that is an http(s) link,
        is not hosted on DuckDuckGo or on one of the excluded booking/social
        domains, and contains at least one word of the hotel name.
        ``None`` when the search request fails or no link qualifies.
    """
    # Construct the search query
    query = f"{hotel_name} {city} site oficial"
    # Define the headers for the HTTP request
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    try:
        # Pass the query through ``params`` so that characters such as '&' or
        # '#' in a hotel name are URL-encoded instead of corrupting the URL.
        # The timeout keeps a stalled connection from blocking forever.
        response = requests.get(
            "https://duckduckgo.com/html/",
            params={"q": query},
            headers=headers,
            timeout=30,
        )
    except requests.RequestException as exc:
        print(f"Failed to retrieve search results: {exc}")
        return None

    if response.status_code != 200:
        # Print error message if the search request failed
        print(f"Failed to retrieve search results: {response.status_code}")
        return None

    # Parse the HTML content of the response
    soup = BeautifulSoup(response.text, 'html.parser')
    # Domains of aggregators / social networks that are never the official site
    exclude_domains = [
        "booking.com", "tripadvisor.com", "expedia.com", "hotels.com",
        "agoda.com", "airbnb.com", "guestreservations.com", "travelocity.com",
        "orbitz.com", "priceline.com", "kayak.com", "reservations.com", "facebook.com",
        "reservationstays.com", "instagram.com", "trivago.com"
    ]
    # Lower-cased words of the hotel name, computed once for all links
    name_keywords = hotel_name.lower().split()

    # Iterate over all <a> tags with href attributes
    for a in soup.find_all('a', href=True):
        href = a['href']
        if "http" not in href or "duckduckgo.com" in href or "translate.duckduckgo" in href:
            continue
        # Skip URLs that belong to an excluded domain
        if any(domain in href for domain in exclude_domains):
            continue
        # Keep only URLs that mention at least one word of the hotel name
        href_lower = href.lower()
        if any(keyword in href_lower for keyword in name_keywords):
            # Reduce the URL to its main page (scheme + host only)
            parsed_url = urlparse(href)
            return urlunparse((parsed_url.scheme, parsed_url.netloc, '', '', '', ''))

    # No result link matched
    return None

def get_most_frequent_website(hotel_name, city, attempts=3):
    """Run the website search several times and return the most frequent result.

    Args:
        hotel_name: Name of the hotel to search for.
        city: City of the hotel.
        attempts: Number of searches to perform.

    Returns:
        The website returned most often across the attempts, or ``None`` when
        no attempt produced a website.
    """
    # Start timing the function execution
    start_time = time.time()
    results = []
    # Attempt to retrieve the hotel website multiple times
    for attempt in range(attempts):
        website = get_hotel_website_duckduckgo(hotel_name, city)
        if website:
            results.append(website)
        # Pause between attempts to avoid rate limiting; waiting after the
        # final attempt would only delay the result
        if attempt < attempts - 1:
            time.sleep(2)

    # Pick the website that was returned most often
    most_common_website = Counter(results).most_common(1)[0][0] if results else None

    # End timing the function execution
    execution_time = time.time() - start_time

    # Print the results and execution time
    print(f"Results obtained in {attempts} attempts:")
    for result in results:
        print(result)

    print(f"Most frequent website for {hotel_name} in {city}: {most_common_website}")
    print(f"Total execution time: {execution_time:.2f} seconds")

    return most_common_website

def _decode_protected_email(href):
    """Decode the hex payload of a ``cdn-cgi/l/email-protection#<hex>`` link.

    The first byte of the payload is an XOR key; every following byte XORed
    with that key yields one character of the address. Returns ``None`` when
    the payload is missing or is not valid hexadecimal.
    """
    parts = href.split('#')
    if len(parts) < 2:
        return None
    payload = parts[1]
    try:
        key = int(payload[:2], 16)
        return ''.join(chr(int(payload[i:i + 2], 16) ^ key) for i in range(2, len(payload), 2))
    except ValueError:
        return None


def fetch_hotel_info(url):
    """Download a hotel page and extract its descriptive and contact data.

    The lookups target specific markup (for example ``<p class="text">``,
    ``<span class="element element_address">`` or ``<div class="service_wrap">``),
    so every field that cannot be found on the page is left as ``None``
    (an empty list for the services).

    Args:
        url: Address of the hotel page to download.

    Returns:
        A dict with the keys ``description``, ``latitude``, ``longitude``
        (both as strings), ``address``, ``phone``, ``email`` and ``services``,
        or ``None`` when the page does not answer with HTTP 200.
    """
    # Define the headers for the HTTP request
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    # Send the HTTP GET request to the hotel website
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        # Print error message if the request failed
        print(f"Failed to access the page. Status code: {response.status_code}")
        return None

    response.encoding = 'utf-8'
    # Parse the HTML content of the response
    soup = BeautifulSoup(response.text, 'html.parser')

    # Extract the hotel description, falling back to the meta description
    description = None
    description_p = soup.find('p', class_='text')
    if description_p:
        description = description_p.get_text(separator="\n", strip=True)
    else:
        description_meta = soup.find('meta', attrs={'name': 'description'})
        if description_meta:
            description = description_meta.get('content', '').strip()

    # Extract latitude and longitude from the inline script that declares them
    latitude = longitude = None
    script_tag = soup.find('script', string=re.compile(r'var hotel_gps_coordinates'))
    if script_tag:
        script_content = script_tag.string
        # Allow a leading minus sign: southern / western coordinates are negative
        latitude_match = re.search(r'latitude\s*:\s*(-?[\d.]+)', script_content)
        longitude_match = re.search(r'longitude\s*:\s*(-?[\d.]+)', script_content)
        # A script without one of the two fields must not crash the extraction
        if latitude_match:
            latitude = latitude_match.group(1)
        if longitude_match:
            longitude = longitude_match.group(1)

    # Extract address
    address = None
    address_span = soup.find('span', class_='element element_address')
    if address_span:
        address = address_span.get_text(separator="\n", strip=True)

    # Extract contact information (phone and email)
    phone = email = None
    phone_tag = soup.find('span', class_='phone element')
    if phone_tag:
        phone_link = phone_tag.find_next('a')
        phone_href = phone_link.get('href') if phone_link else None
        if phone_href:
            phone = phone_href.replace('tel:', '')
    email_tag = soup.find('span', class_='email element')
    if email_tag:
        email_link = email_tag.find_next('a')
        email_href = email_link.get('href') if email_link else None
        if email_href:
            if 'cdn-cgi/l/email-protection' in email_href:
                # Obfuscated address: decode the XOR-encoded payload
                email = _decode_protected_email(email_href)
            else:
                email = email_href.replace('mailto:', '')

    # Extract services offered by the hotel
    services = []
    for service_wrapper in soup.find_all('div', class_='service_wrap'):
        service_text = service_wrapper.get_text(separator=" ", strip=True)
        if service_text:
            services.append(service_text)

    # Return the extracted information as a dictionary
    return {
        "description": description,
        "latitude": latitude,
        "longitude": longitude,
        "address": address,
        "phone": phone,
        "email": email,
        "services": services
    }

def summarize_description(description):
    """Summarize a hotel description with the ``t5-small`` model.

    The tokenizer and the model are loaded on every call. The input is
    prefixed with ``summarize: `` and truncated to 512 tokens; the summary is
    generated with a 4-beam search limited to 150 tokens.

    Args:
        description: Text to summarize.

    Returns:
        The decoded summary string.
    """
    checkpoint = "t5-small"
    t5_tokenizer = T5Tokenizer.from_pretrained(checkpoint, legacy=False)
    t5_model = T5ForConditionalGeneration.from_pretrained(checkpoint)

    # Tokenize the prompt expected by T5 for summarization
    prompt = f"summarize: {description}"
    encoded_prompt = t5_tokenizer.encode_plus(
        prompt,
        return_tensors="pt",
        max_length=512,
        truncation=True,
    )

    # Beam-search generation settings
    generation_options = {"num_beams": 4, "max_length": 150, "early_stopping": True}
    summary_ids = t5_model.generate(encoded_prompt['input_ids'], **generation_options)

    # Turn the best sequence back into text
    return t5_tokenizer.decode(
        summary_ids[0],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

def get_hotel_info(hotel_name, city):
    """Locate the hotel's website and collect its information.

    Args:
        hotel_name: Name of the hotel to look up.
        city: City of the hotel.

    Returns:
        The dict produced by ``fetch_hotel_info`` extended with ``homepage``
        (always) and ``summary`` (only when a description was found), or
        ``None`` when the website was not found or could not be fetched.
    """
    # Get the most frequent hotel website
    website = get_most_frequent_website(hotel_name, city)
    if not website:
        # Print error message if the website is not found
        print("Hotel website not found.")
        return None

    # Fetch the hotel information from the website
    info = fetch_hotel_info(website)
    if info is None:
        return None

    # Record the homepage even when the page has no description, because
    # callers read info['homepage'] unconditionally
    info['homepage'] = website
    if info.get("description"):
        # Summarize the hotel description
        info["summary"] = summarize_description(info["description"])
    return info

# Fetch the hotel information
info = get_hotel_info(HOTEL_NAME, CITY)
pprint.pprint(info)

# Stop with an explicit message instead of an opaque TypeError/KeyError when
# the website could not be found or fetched
if not info or not info.get('homepage'):
    raise RuntimeError(
        f"Could not determine the homepage of '{HOTEL_NAME}' in '{CITY}'; "
        "check HOTEL_NAME and CITY."
    )

# Normalize the homepage so that it always ends with a slash
hotel_homepage = info['homepage']
if not hotel_homepage.endswith('/'):
    hotel_homepage = hotel_homepage + '/'

def download_image(url, local_filename):
    """Download one image and store it in the input folder.

    Failures (network error, unsupported URL, non-200 answer) are reported
    with a message and do not raise, so one bad link does not stop a batch
    of downloads.

    Args:
        url: Address of the image.
        local_filename: File name to use inside ``INPUT_PATH``.
    """
    # Send the GET request to the URL
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    try:
        response = requests.get(url, headers=headers)
    except requests.RequestException as exc:
        print(f"Failed to download the image. Error: {exc}")
        return

    # Check if the request was successful
    if response.status_code != 200:
        # Print error message if the download failed
        print(f"Failed to download the image. HTTP status code: {response.status_code}")
        return

    # Make sure the target folder exists before writing into it
    os.makedirs(INPUT_PATH, exist_ok=True)
    # Open a local file in binary write mode and store the response content
    with open(os.path.join(INPUT_PATH, local_filename), 'wb') as f:
        f.write(response.content)
    print(f"Image downloaded and saved as {local_filename}")

def extract_image_links(url):
    """Collect the image URLs referenced by one page.

    Images come from the ``src`` of ``<img>`` tags (SVG excluded) and from any
    string attribute of ``<a>``, ``<div>``, ``<source>`` and ``<link>`` tags
    whose value ends with ``.jpg``, ``.jpeg`` or ``.png``.

    Args:
        url: Address of the page to inspect.

    Returns:
        A list of unique absolute http(s) image URLs (in no particular order).

    Raises:
        Exception: If the page does not answer with HTTP 200.
    """
    # Send a GET request to the specified URL
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    response = requests.get(url, headers=headers)

    # Ensure the request was successful
    if response.status_code != 200:
        raise Exception(f"Failed to load page: {url} Status code: {response.status_code}")

    # Parse the HTML content of the page
    soup = BeautifulSoup(response.content, 'html.parser')

    image_extensions = ('.jpg', '.jpeg', '.png')
    image_links = set()

    def remember(candidate):
        # Inline "data:" images have no address that could be downloaded
        if candidate.startswith('data:'):
            return
        resolved = add_https_prefix(candidate)
        # Paths relative to the current page (e.g. "img/a.jpg") still lack a
        # scheme and host at this point; resolve them against the page URL
        if not resolved.startswith(('http://', 'https://')):
            resolved = urljoin(url, resolved)
        # Keep only links that can actually be requested over HTTP(S)
        if resolved.startswith(('http://', 'https://')):
            image_links.add(resolved)

    # <img> tags: every non-SVG source
    for img in soup.find_all('img'):
        src = img.get('src')
        if src and '.svg' not in src:
            remember(src)

    # <a>, <div>, <source> and <link> tags: any attribute that looks like an image file
    for tag in soup.find_all(['a', 'div', 'source', 'link']):
        for value in tag.attrs.values():
            # Multi-valued attributes (e.g. class) are lists and are ignored
            if isinstance(value, str) and value.endswith(image_extensions):
                remember(value)

    # Return unique image links
    return list(image_links)

def add_https_prefix(url):
    """Turn a scheme-relative or root-relative link into an absolute one.

    ``//host/path`` gets an ``https:`` scheme, ``/path`` is appended to the
    global ``hotel_homepage``, and anything else is returned unchanged.
    """
    # Nothing to do for links that do not start with a slash
    if not url.startswith("/"):
        return url
    # Scheme-relative link: only the scheme is missing
    if url.startswith("//"):
        return "https:" + url
    # Root-relative link: hotel_homepage already ends with '/', so drop the link's own
    return hotel_homepage + url[1:]

def is_valid_url(url):
    """Return True when the URL starts with ``http://`` or ``https://``."""
    accepted_schemes = ('http://', 'https://')
    return any(url.startswith(scheme) for scheme in accepted_schemes)

def is_internal_link(url, base_url):
    """Return True when both URLs have exactly the same network location."""
    candidate_host = urlparse(url).netloc
    reference_host = urlparse(base_url).netloc
    return candidate_host == reference_host

# Compiled lazily on first use and then reused for every URL
_LANGUAGE_CODE_RE = None


def _language_code_regex():
    """Build (once) the regex that recognizes language markers in a URL."""
    global _LANGUAGE_CODE_RE
    if _LANGUAGE_CODE_RE is None:
        # Two-letter codes of every pycountry language that defines one
        codes = sorted({lang.alpha_2 for lang in pycountry.languages if hasattr(lang, 'alpha_2')})
        if codes:
            alternation = "|".join(re.escape(code) for code in codes)
            # "/xx/" as a path segment, or "-xx" not followed by a letter or
            # digit, so that words such as "-and-" are not mistaken for "-an"
            _LANGUAGE_CODE_RE = re.compile(
                rf"/(?:{alternation})/|-(?:{alternation})(?![A-Za-z0-9])"
            )
        else:
            # No codes available: use a pattern that never matches
            _LANGUAGE_CODE_RE = re.compile(r"(?!)")
    return _LANGUAGE_CODE_RE


def contains_language_code(url):
    """Tell whether a URL carries a two-letter language marker.

    A marker is either a ``/xx/`` path segment or a ``-xx`` suffix that is not
    immediately followed by a letter or digit, where ``xx`` is a two-letter
    language code known to pycountry. Matching is case-sensitive.

    Args:
        url: The URL to inspect.

    Returns:
        True when a language marker is present, False otherwise.
    """
    return _language_code_regex().search(url) is not None

def extract_other_pages_url(page_url, home_page_url, links):
    """Crawl the site from ``page_url`` and collect every reachable internal page.

    Pages are visited depth-first, in the order their links appear. A link is
    followed when it is an http(s) URL on the same host as ``home_page_url``,
    carries no language marker, and does not point to an image, an
    email-protection link or a ``#respond`` anchor.

    Args:
        page_url: Page at which the crawl starts.
        home_page_url: URL whose host defines which links count as internal.
        links: List that receives, in visiting order, the URL of every page
            that was loaded successfully. It is modified in place.

    Raises:
        Exception: If the starting page does not answer with HTTP 200. Other
            pages that fail to load are reported and skipped.
    """
    # Define the headers for the HTTP request
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    # Links containing any of these markers are never treated as pages
    skipped_markers = ('email-protection', '#respond', '.jpg', '.jpeg', '.png', '.svg', '.gif')

    visited = set(links)
    failed = set()
    # An explicit stack replaces recursion so that large sites cannot exceed
    # Python's recursion limit
    pending = [page_url]
    start_page_pending = True

    while pending:
        current_url = pending.pop()
        is_start_page, start_page_pending = start_page_pending, False
        if not is_start_page and (current_url in visited or current_url in failed):
            continue

        # Send the HTTP GET request to the page URL
        response = requests.get(current_url, headers=headers)
        if response.status_code != 200:
            if is_start_page:
                raise Exception(f"Failed to load page: {current_url} Status code: {response.status_code}")
            # A single broken link must not abort the whole crawl
            print(f"Skipping page: {current_url} Status code: {response.status_code}")
            failed.add(current_url)
            continue

        # Parse the HTML content of the response
        soup = BeautifulSoup(response.content, 'html.parser')

        # Record the page that was just loaded
        links.append(current_url)
        visited.add(current_url)

        internal_links = []
        # Extract all <a> tags with the href attribute
        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href']
            if any(marker in href for marker in skipped_markers):
                continue
            # Convert relative URLs to absolute URLs
            full_url = urljoin(current_url, href)
            # Drop the "#fragment": it addresses a position inside the same
            # page and would otherwise make that page be crawled again
            full_url = urlunparse(urlparse(full_url)._replace(fragment=''))
            if is_valid_url(full_url) and is_internal_link(full_url, home_page_url) and not contains_language_code(full_url):
                internal_links.append(full_url)

        # Push in reverse so that the first link on the page is visited first
        pending.extend(reversed(internal_links))

def get_all_website_images(home_page):
    """Download every image referenced by the pages of a website.

    The internal pages are discovered first, then the image links of each
    page are collected, de-duplicated and downloaded as
    ``image_<n>.<extension>``.

    Args:
        home_page: URL of the site's homepage, used as the crawl start and as
            the reference for internal links.
    """
    # Extract all internal page URLs from the website
    pages_links = []
    extract_other_pages_url(home_page, home_page, pages_links)

    pprint.pprint(pages_links)

    # Extract image links from each internal page, without duplicates
    all_image_links = set()
    for page_link in pages_links:
        all_image_links.update(extract_image_links(page_link))

    # Download each image and save it locally; sorting makes the numbering
    # reproducible between runs
    for counter, image_link in enumerate(sorted(all_image_links)):
        url = add_https_prefix(image_link)
        # Take the extension from the URL path only, so query strings
        # ("pic.jpg?w=300") or dots in the host name cannot leak into the
        # local file name; fall back to a generic extension when there is none
        extension = os.path.splitext(urlparse(url).path)[1].lstrip('.') or 'jpg'
        download_image(url, f"image_{counter}.{extension}")

# Crawl the hotel website from its homepage and download every image that is found
get_all_website_images(hotel_homepage)

def list_files_in_directory(directory_path):
    """List the regular files located directly inside a directory.

    Args:
        directory_path: Directory to inspect (sub-directories are not entered).

    Returns:
        The file paths, sorted by name and written with forward slashes.
        An empty list when the directory cannot be read.
    """
    try:
        # List all files and directories in the specified path; sorted because
        # os.listdir returns entries in arbitrary order
        entries = sorted(os.listdir(directory_path))
    except OSError as e:
        # Print error message if listing files failed
        print(f"An error occurred: {e}")
        return []

    # Filter the list to only include files
    return [
        os.path.join(directory_path, entry).replace('\\', '/')
        for entry in entries
        if os.path.isfile(os.path.join(directory_path, entry))
    ]

# NOTE(review): this encoder is not referenced anywhere else in the visible code
label_encoder = LabelEncoder()

class ImageDataset(Dataset):
    """Dataset yielding ``(image, label)`` pairs from a table of file paths.

    Rows are read by position: column 0 holds the image path and column 1 the
    label. The optional ``transform`` is applied to every loaded image.
    """

    def __init__(self, dataframe, transform=None):
        self.dataframe, self.transform = dataframe, transform

    def __len__(self):
        # One sample per row of the table
        return len(self.dataframe)

    def __getitem__(self, idx):
        # Open the image file and force three colour channels, silencing
        # UserWarning while doing so
        image_path = self.dataframe.iloc[idx, 0]
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", UserWarning)
            picture = Image.open(image_path).convert('RGB')
        # Read the label stored next to the path
        target = self.dataframe.iloc[idx, 1]
        # Run the preprocessing pipeline when one was supplied
        picture = self.transform(picture) if self.transform else picture
        return picture, torch.tensor(target, dtype=torch.long)

# Preprocessing applied to every image before it is fed to the network
transform = transforms.Compose([
    # Bring every image to 224x224
    transforms.Resize(size=(224, 224)),
    transforms.ToTensor(),
    # Per-channel normalization with mean 0.5 and std 0.5
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

# Use the GPU when one is available, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

class ImageClassifier(nn.Module):
    """Small CNN: two conv + max-pool stages followed by three linear layers.

    The first linear layer expects ``16 * 53 * 53`` features per image and the
    last one produces 8 scores per image.
    """

    def __init__(self):
        super().__init__()
        # Convolutional feature extractor (the pooling layer is shared)
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5)
        # Fully connected classifier head
        self.fc1 = nn.Linear(in_features=16 * 53 * 53, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=84)
        self.fc3 = nn.Linear(in_features=84, out_features=8)

    def forward(self, x):
        # Each convolution is followed by ReLU and max-pooling
        for conv in (self.conv1, self.conv2):
            x = self.pool(functional.relu(conv(x)))
        # Flatten to one feature vector per image
        x = x.view(-1, self.fc1.in_features)
        # Hidden linear layers use ReLU; the output layer returns raw scores
        for dense in (self.fc1, self.fc2):
            x = functional.relu(dense(x))
        return self.fc3(x)

# Build the network on the selected device
model = ImageClassifier().to(device)

# NOTE(review): the loss and the optimizer are created here but are not used
# by the inference code visible in this file
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Load the pre-trained model weights
# SECURITY: weights_only=False lets torch.load unpickle arbitrary objects, so
# only load a checkpoint you trust. If the file holds a plain state_dict,
# weights_only=True should work and is safer - TODO confirm with the checkpoint.
model.load_state_dict(torch.load(MODEL_PATH, map_location=torch.device('cpu'), weights_only=False))

# Switch to inference mode so that layers such as dropout or batch
# normalization, if added to the CNN later, behave deterministically
model.eval()

def build_dataframe_from_input(input_path):
    """Build the table of images to classify.

    Returns a DataFrame with one row per file found in ``input_path``: the
    ``filepath`` column holds the path and ``target`` is set to -1 for every row.
    """
    file_paths = list_files_in_directory(input_path)
    # Same placeholder label for every file
    placeholder_targets = [-1 for _ in file_paths]
    return pd.DataFrame({'filepath': file_paths, 'target': placeholder_targets})

def build_dataloader_from_dataframe(df):
    """Wrap a DataFrame of image paths in a DataLoader.

    The images go through the module-level ``transform`` and are served in
    their original order, 32 per batch.
    """
    return DataLoader(ImageDataset(df, transform), batch_size=32, shuffle=False)

# Table and loader for every image found in the input folder
test_df = build_dataframe_from_input(INPUT_PATH)
test_dl = build_dataloader_from_dataframe(test_df)

# Predicted class index of every image, in the order of test_df
predictions = []
with torch.no_grad():
    for data in test_dl:
        # Move the batch to the selected device
        images, labels = (tensor.to(device) for tensor in data)
        # Forward pass through the model
        outputs = model(images)
        # The index of the highest score is the predicted class
        _, predicted = outputs.data.max(1)
        predictions.extend(predicted.cpu().numpy().tolist())

# Create a copy of the DataFrame and add predictions
result_df = test_df.copy().drop(['target'], axis=1)
result_df['prediction'] = predictions

# Print and copy the files for each prediction category
for num, label in num_to_label.items():
    print(f'Elements of type {label}:')
    current_prediction_files = result_df[result_df['prediction'] == num]['filepath'].tolist()
    pprint.pprint(current_prediction_files)
    # Create the category folder on demand so that copying cannot fail
    # because it is missing
    category_dir = os.path.join(OUTPUT_PATH, label)
    os.makedirs(category_dir, exist_ok=True)
    for file in current_prediction_files:
        shutil.copyfile(file, os.path.join(category_dir, os.path.basename(file)))