# Data extraction

This notebook is the first part of the project and contains the class used for data acquisition, i.e. downloading product reviews and ratings from eBay, an e-commerce website (https://www.ebay.com/).
The input is a list of search words (words that can be entered in the search bar). The product categories were mainly electronics and tools. The class produces a pandas dataframe with four columns: product category, raw review title, raw review content and rating.

In [1]:
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [6]:
class EbayCrawler:
    """
    This class is dedicated for collecting ebay reviews.
    The final result is pandas dataframe which contains
    four columns: category, review title, review content and rating

    The input data consists of search words, having search words
    defined the class checks search results for every given search
    word and collects urls of the products that have reviews.

    Typical usage:
        crawler = EbayCrawler(['apple'], number_of_pages=2)
        crawler.search_word_urls()
        crawler.find_products_urls()
        crawler.create_reviews_df()
        crawler.save_to_csv('reviews.csv', index=False)
    """

    # columns of every dataframe produced by the class
    COLUMNS = ['category', 'review title', 'review content', 'rating']
    # explicit parser, so the result does not depend on which optional
    # parsers happen to be installed
    PARSER = 'html.parser'
    # seconds to wait for a response; without it a request can hang forever
    REQUEST_TIMEOUT = 30

    def __init__(self, search_words, number_of_pages=1):
        """
        the user gives list of search words and the number of pages
        which will be checked

        search_words: list of strings (a single string is treated as
            a one-element list)
        number_of_pages: how many result pages are visited per word

        raises ValueError if number_of_pages < 1 or no search word is given
        """
        # a bare string would otherwise be iterated character by character
        if isinstance(search_words, str):
            search_words = [search_words]
        if number_of_pages < 1:
            raise ValueError('number of pages must be at least 1')
        if len(search_words) < 1:
            raise ValueError('there must be at least one search word')

        self.search_words = list(search_words)
        self.number_of_pages = number_of_pages
        self.search_urls = []
        self.products_urls = []
        self.df = None

    @staticmethod
    def _get_soup(url):
        """
        downloads the given url and returns its parsed html
        """
        response = requests.get(url, timeout=EbayCrawler.REQUEST_TIMEOUT)
        return BeautifulSoup(response.text, EbayCrawler.PARSER)

    @staticmethod
    def _reviews_frame(records):
        """
        builds the result dataframe from a list of row dictionaries
        """
        return pd.DataFrame(records, columns=EbayCrawler.COLUMNS)

    def search_word_urls(self):
        """
        the method creates urls for every search word and every page;
        it does nothing when the urls have already been created
        """
        if not self.search_urls:
            print('creating search links...\n')
            for word in self.search_words:
                # encode spaces, '&', etc. so the word stays one query value
                query = quote_plus(str(word))
                for page in range(1, self.number_of_pages + 1):
                    self.search_urls.append(
                        'https://www.ebay.com/sch/i.html?_from=R40'
                        f'&_nkw={query}&_sacat=0&_pgn={page}'
                    )

    def find_products_urls(self, display=True, num_to_display=10):
        """
        the method visits every search url and collects urls of
        products that have some reviews; the download is skipped when
        products urls have already been collected

        display: if True, the first found links are printed
        num_to_display: maximum number of links to print
        """
        # build the search urls on demand instead of silently doing nothing
        if not self.search_urls:
            self.search_word_urls()

        if not self.products_urls:
            print('downloading products urls...\n')
            for url in self.search_urls:
                try:
                    soup = self._get_soup(url)
                except requests.RequestException as error:
                    # one unreachable page should not stop the whole crawl
                    print(f'search page skipped: {error}')
                    continue
                for block in soup.find_all('div', class_='s-item__reviews'):
                    anchor = block.a
                    if anchor is not None and anchor.get('href'):
                        self.products_urls.append(anchor['href'])

        if display and self.products_urls:
            shown = self.products_urls[:max(num_to_display, 0)]
            print(f'Links for the first {len(shown)} products:')
            for link in shown:
                print(link)

    @property
    def urls_number(self):
        """
        how many products were found
        """
        return len(self.products_urls)

    def create_reviews_df(self, num_to_download=None, clear=0):
        """
        this method downloads reviews for the collected products urls
        and stores them as one dataframe in self.df

        num_to_download: how many links will be considered in
            downloading the reviews (None means all of them)
        clear: 1 discards an already created dataframe and downloads
            the reviews again

        When there is nothing to download self.df is left unchanged.
        """
        if clear == 1:
            self.df = None

        if self.df is not None:
            print('creating dataframe is completed')
            return

        num_of_links = len(self.products_urls)
        if num_to_download is not None:
            num_of_links = max(0, min(num_to_download, num_of_links))

        if num_of_links == 0:
            print('there are no products urls to download')
            return

        # collect the partial frames and concatenate once at the end
        frames = []
        for i, product in enumerate(self.products_urls[:num_of_links], 1):
            print(f'downloading {i} link from {num_of_links}')
            try:
                frames.append(self.one_product_reviews(product))
            except requests.RequestException as error:
                print(f'link {i} skipped: {error}')

        if frames:
            self.df = pd.concat(frames, ignore_index=True)
        else:
            self.df = self._reviews_frame([])
        print('creating dataframe is completed')

    @classmethod
    def one_product_reviews(cls, url):
        """
        This function returns pandas dataframe with reviews and rating
        for a given product url
        """
        soup = cls._get_soup(url)

        # checking if there is a link to the review section
        # (more than 10 reviews)
        try:
            reviews_url = soup.find(
                'div', class_='see--all--reviews'
            ).a['href']
        except (AttributeError, TypeError, KeyError):
            reviews_url = None

        category = 'not defined'
        breadcrumb = soup.find('nav', class_='breadcrumb clearfix')
        if breadcrumb is not None:
            category = breadcrumb.text.split('>')[-1]

        # if there is less than 11 reviews they are all on the product
        # page which is already downloaded
        if reviews_url is None:
            return cls._parse_product_page(soup, category)

        # if there is more than 10 reviews
        soup = cls._get_soup(reviews_url)
        reviews_page_number = len(soup.find_all('a', class_='spf-link')) - 2

        # no usable pagination links: read the single reviews page
        if reviews_page_number < 1:
            return cls.ebay_parser(reviews_url + '?pgn=1', category)

        pages = [
            cls.ebay_parser(f'{reviews_url}?pgn={page}', category)
            for page in range(1, reviews_page_number + 1)
        ]
        return pd.concat(pages)

    @staticmethod
    def _parse_reviews_page(soup, category):
        """
        extracts reviews from a parsed page of the dedicated reviews section
        """
        records = []
        for review, rating_stars in zip(
            soup.find_all('div', class_='ebay-review-section-r'),
            # the first star-rating element is skipped so the remaining
            # ones pair up with the reviews
            soup.find_all('span', class_='star-rating')[1:]
        ):
            # checking if a review has a content and a title
            # if not it is skipped
            if review.p is None or review.h3 is None:
                continue
            records.append({
                'category': category,
                'review title': review.h3.text,
                'review content': review.p.text,
                'rating': len(rating_stars.find_all('i', class_='fullStar'))
            })
        return EbayCrawler._reviews_frame(records)

    @staticmethod
    def _parse_product_page(soup, category):
        """
        extracts reviews from a parsed product page (short review list)
        """
        records = []
        for review, rating_stars in zip(
            soup.find_all('div', class_='review--section--r'),
            soup.find_all('div', class_='review--section--l')
        ):
            # incomplete reviews are skipped, same as in the reviews section
            if (review.h4 is None or review.p is None
                    or rating_stars.span is None):
                continue
            # the rating is the leading digit of the text; stored as int
            # to match the ratings read from the reviews section
            rating_text = rating_stars.span.text.strip()
            rating = int(rating_text[0]) if rating_text[:1].isdecimal() \
                else None
            records.append({
                'category': category,
                'review title': review.h4.text,
                'review content': review.p.text,
                'rating': rating
            })
        return EbayCrawler._reviews_frame(records)

    @staticmethod
    def ebay_parser(url, category=None):
        """
        The function collects all reviews and ratings for a given url if a
        product has more than one page with reviews

        Returns an empty dataframe when the page cannot be downloaded.
        """
        try:
            soup = EbayCrawler._get_soup(url)
        except requests.RequestException as error:
            print(f'connection failed: {error}')
            return EbayCrawler._reviews_frame([])
        return EbayCrawler._parse_reviews_page(soup, category)

    @staticmethod
    def ebay_parser2(url, category=None):
        """
        The function collects all reviews and ratings for a given url
        if a product has only one page with reviews

        Returns an empty dataframe when the page cannot be downloaded.
        """
        try:
            soup = EbayCrawler._get_soup(url)
        except requests.RequestException as error:
            print(f'connection failed: {error}')
            return EbayCrawler._reviews_frame([])
        return EbayCrawler._parse_product_page(soup, category)

    def save_to_csv(self, file_name, **kwargs):
        """
        saving created dataframe to csv file; keyword arguments are
        passed to pandas DataFrame.to_csv

        raises ValueError if the dataframe has not been created yet
        """
        if self.df is None:
            raise ValueError('dataframe needs to be created')
        self.df.to_csv(file_name, **kwargs)
        print('dataframe saved to csv file')

## Example showing how to use the class
Note: the instance below is only a demonstration; it was not used to collect the reviews used in the project.

In [7]:
# create an instance of the class from a list of search words
# and the number of search result pages to check for each word
electronics = EbayCrawler(['apple', 'samsung', 'huawei'], 5)

In [8]:
# create the search urls for every given search word and page
electronics.search_word_urls()

creating search links...



In [9]:
# the first five search links
electronics.search_urls[:5]

['https://www.ebay.com/sch/i.html?_from=R40&_nkw=apple&_sacat=0&_pgn=1',
 'https://www.ebay.com/sch/i.html?_from=R40&_nkw=apple&_sacat=0&_pgn=2',
 'https://www.ebay.com/sch/i.html?_from=R40&_nkw=apple&_sacat=0&_pgn=3',
 'https://www.ebay.com/sch/i.html?_from=R40&_nkw=apple&_sacat=0&_pgn=4',
 'https://www.ebay.com/sch/i.html?_from=R40&_nkw=apple&_sacat=0&_pgn=5']

In [10]:
# download the urls of products that have reviews and print the first five
electronics.find_products_urls(display=True, num_to_display=5)

downloading products urls...

Links for the first 5 products:
https://www.ebay.com/p/4018215500?iid=174105091978&var=472963059163#UserReviews
https://www.ebay.com/p/23024045643?iid=114711167770&var=414867589699#UserReviews
https://www.ebay.com/p/3033813445?iid=113652223677&var=413778701567&rt=nc#UserReviews
https://www.ebay.com/p/9040885904?iid=313512124902#UserReviews
https://www.ebay.com/p/2305260683?iid=293463314986&var=592274477604#UserReviews


In [11]:
# the total number of product urls found
electronics.urls_number

385

In [12]:
# create the dataframe from the first five product links;
# clear=1 discards any previously created dataframe
electronics.create_reviews_df(num_to_download=5, clear=1)

downloading 1 link from 5
downloading 2 link from 5
downloading 3 link from 5
downloading 4 link from 5
downloading 5 link from 5
creating dataframe is completed


In [13]:
# preview the first rows of the collected reviews
electronics.df.head()

Unnamed: 0,category,review title,review content,rating
0,Cell Phones & Smartphones,This product is in a league by itself.,Semi easy to swap SIM card and unknown futuris...,5
1,Cell Phones & Smartphones,Great phone!,"The I phone 8 looked brand new,\nGlad I made t...",5
2,Cell Phones & Smartphones,I Phone 8 plus is Good Phone. It has great fea...,I Phone 8 Plus is a very good phone. It is bet...,5
3,Cell Phones & Smartphones,Wish it didn’t cause me headaches.,"Many positives: feels good in hand, pleasing d...",2
4,Cell Phones & Smartphones,Pleased with my previously used Apple i-Phone 8,I was pleasantly surprised with the rapid ship...,4


In [55]:
# save the dataframe to a csv file; extra keyword arguments
# (here index=False) are passed on to DataFrame.to_csv
electronics.save_to_csv('test.csv', index=False)

dataframe saved to csv file
