In [7]:
"""
Web scraping program with BeautifulSoup package.

Author: Ye N.E.

Description: Scraping data from celebritynetworth.com
    and save outputs into a database.
    Program must include function that takes celeb's name as input
    and returns his/her net worth according to celebritynetworth.com
"""

import requests 
from bs4 import BeautifulSoup as Soup
import sqlite3
import os.path
import apikey


class DB:
    def __init__(self, page_num=1):
        self.file_name = "celeb_networth_{}.db".format(page_num)
        self.table_name = "celebNetworth_{}".format(page_num)
        self.column_1 = "name"
        self.type_1 = "text"
        self.column_2 = "networth"
        self.type_2 = "text"
        
    def create(self):   
        connect = sqlite3.connect(self.file_name)
        connect.execute('''CREATE TABLE {tn} ({col_1} {t_1}, {col_2} {t_2})'''\
                        .format(tn=self.table_name,\
                                col_1=self.column_1, t_1=self.type_1,\
                                col_2=self.column_2, t_2=self.type_2))
    
class GetParse():
    def __init__(self, site):
        self.site = site 
        self.agent = {"User-Agent":"Mozilla/5.0"}
        api_key = apikey.Key()
        self.payload = {'api_key':api_key, 'url':'https://httpbin.org/ip'}
        #uncomment the line below if you don't want to use Scraper API
        #self.page = requests.get(self.site, headers=self.agent)
        #comment out the line below if you dont't want to use Scraper API
        self.page = requests.get(self.site, params=self.payload, timeout=60) 
        self.soup = Soup(self.page.content, "html.parser")
    
    
    def __repr__(self):
        return self.page.status_code

    
class Scraper:
    def __init__(self, site):
        self.site = site
        self.URLs = set()
        self.celebs = {}
        self.found = False
        
    
    def get_url(self, num_page, start_page=1):
        for i in range(start_page, num_page):
            soup = GetParse(self.site+str(i)).soup
            print(soup.text)
            for tag in soup.find_all('a', href=True):
                url = tag['href']
                if '-net-worth/' in url:
                    self.URLs.add(url) 
        
        return (self.URLs)
    
    
    def scrape(self, num_page, start_page=1):
        """
        Scrape data from a given number of pages of the site of interest
        
        :Input:
        num_page (int): the number of pages from the site data will be sraped
        start_page (int): the page where the scraping should start from, default is "1"
        
        :Return:
        self.celebs (dict): a dictionary of celebrity's name as key, 
                                and his/her networth as value
        """
        self.get_url(num_page, start_page)
        for url in self.URLs:
            soup = GetParse(url).soup
            try:
                name = (soup.find_all("h1", {"class": "title"})[0].text).upper()
                net_worth = soup.find_all("div", {"class":"value"})[0].text
                self.celebs[name] = net_worth
            except IndexError as error:
                pass
        
        return self.celebs
    
    
    def get_net_worth(self, name, start_page=1, num_page=2):
        """
        Scrape data from page to page.
        Save data into dababase by their page number.
        Repeat the process until the celebrity's networth of interest is found
        
        ***Use this function if there's no pre-existing database***
        
        :Input:
        name (str): name of the celebrity of interest
        
        :Return:
        find[1] (str): networth of the celebrity of interest
        """
        name = str(name).upper()
        full_name = "{} NET WORTH".format(name)
        
        while not self.found:
            file_name = DB(start_page).file_name
            table_name = DB(start_page).table_name
            
            if os.path.exists(file_name)==False:
                DB(start_page).create()
                connect = sqlite3.connect(file_name)
                self.celebs.clear()
                self.scrape(num_page, start_page)
                for key, value in self.celebs.items():
                    connect.execute('''INSERT OR IGNORE INTO {tn} ({col_1}, {col_2})\
                                    VALUES ("{name}", "{nw}")'''\
                                    .format(tn=table_name,\
                                            col_1=DB().column_1, col_2=DB().column_2,\
                                            name=key, nw=value))
                    if key==full_name:
                        net_worth = self.celebs[key]
                        print('Found at page {}'.format(start_page))
                        self.found=True
                
                connect.commit()
                connect.close()
            
                start_page+=1
                num_page+=1
            
            else:
                self.get_networth_db(name)
            
        return net_worth
    
    
    def get_networth_db(self, name):
        """
        Find the celebrity's networth of interest from pre-existing databases 
        
        ***Use this function if there're pre-existing databases***
        
        :Input:
        name (str): name of the celebrity of interest
        
        :Return:
        find[1] (str): networth of the celebrity of interest
        """
        name = str(name).upper()
        full_name = "{} NET WORTH".format(name)
        start_page = 1
        num_page = 2 
        
        while not self.found:
            file_name = DB(start_page).file_name
            table_name = DB(start_page).table_name
            
            if os.path.exists(file_name)==False:
                self.get_net_worth(name, start_page, num_page)
            else:
                connect = sqlite3.connect(file_name)
                c = connect.cursor()
                c.execute('''SELECT * FROM {tn} WHERE {col_1}="{key}"'''\
                          .format(tn=table_name, col_1=DB().column_1,\
                                  key=full_name))
                find = c.fetchone()
                start_page+=1
                num_page+=1

                if find!=None:
                    self.found=True
                    return find[1]
                
                
site = "https://www.celebritynetworth.com/category/richest-celebrities/page/"
scrape = Scraper(site)

#Just scrape data from from a given number of pages from the site of interest
#scrape.scrape(2)

#Scrape, save, and find networth of the celebrity of interest by their names
scrape.get_net_worth('Natalie Portman')

#Find networth of the celebrity of interest by their names from pre-existing databases
#scrape.get_networth_db('KYLIE JENNER')


Richest Celebrities | Celebrity Net Worth  {"@context":"https://schema.org","@graph":[{"@type":"WebSite","@id":"https://www.celebritynetworth.com/#website","url":"https://www.celebritynetworth.com/","name":"Celebrity Net Worth"},{"@type":"CollectionPage","@id":"https://www.celebritynetworth.com/category/richest-celebrities/#webpage","url":"https://www.celebritynetworth.com/category/richest-celebrities/","inLanguage":"en-US","name":"Richest Celebrities | Celebrity Net Worth","isPartOf":{"@id":"https://www.celebritynetworth.com/#website"}}]}  ;window.google_analytics_uacct = "UA-10583146-2"; (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
			(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
			m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
			})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
			
			ga('create', 'UA-10583146-2', 'auto');
			ga('send', 'pageview');   #brand

Richest Celebrities - Page 2 | Celebrity Net Worth  {"@context":"https://schema.org","@graph":[{"@type":"WebSite","@id":"https://www.celebritynetworth.com/#website","url":"https://www.celebritynetworth.com/","name":"Celebrity Net Worth"},{"@type":"CollectionPage","@id":"https://www.celebritynetworth.com/category/richest-celebrities/page/2/#webpage","url":"https://www.celebritynetworth.com/category/richest-celebrities/page/2/","inLanguage":"en-US","name":"Richest Celebrities | Celebrity Net Worth","isPartOf":{"@id":"https://www.celebritynetworth.com/#website"}}]}  ;window.google_analytics_uacct = "UA-10583146-2"; (function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
			(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
			m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
			})(window,document,'script','//www.google-analytics.com/analytics.js','ga');
			
			ga('create', 'UA-10583146-2', 'auto');
			ga('send'

KeyboardInterrupt: 

In [6]:
import sqlite3

db = sqlite3.connect('celeb_networth_10.db')
c = db.cursor()

for row in c.execute('''SELECT * FROM celebNetworth_10'''):
    print(row)
    

('RUPAUL NET WORTH', '$60 Million')
('JC CAYLEN NET WORTH', '$4 Million')
('JORDAN PEELE NET WORTH', '$50 Million')
('MAYA HAWKE NET WORTH', '$3 Million')
('ANDY MACKAY NET WORTH', '$30 Million')
('LAUREN OLIVER NET WORTH', '$8 Million')
('AXL ROSE NET WORTH', '$200 Million')
('PENN BADGLEY NET WORTH', '$8 Million')
('FHER OLVERA NET WORTH', '$10 Million')
('MYLENE FARMER NET WORTH', '$50 Million')
('SHANE DAWSON NET WORTH', '$12 Million')
('TONY KAYE NET WORTH', '$3 Million')
('NATE MENDEL NET WORTH', '$40 Million')
('TOMMY JAMES NET WORTH', '$1 Million')
('PEWDIEPIE NET WORTH', '$30 Million')
('GARY GLITTER NET WORTH', '$8 Million')
('UPCHURCH NET WORTH', '$4 Million')
('DUDLEY MOORE NET WORTH', '$25 Million')
('GEOFF DOWNES NET WORTH', '$8 Million')
('RIP TAYLOR NET WORTH', '$3 Million')
('MIKE SHINODA NET WORTH', '$45 Million')
('LORI LOUGHLIN NET WORTH', '$20 Million')
('BEBE REXHA NET WORTH', '$5 Million')
('RIVER PHOENIX NET WORTH', '$5 Million')
('DIMEBAG DARRELL NET WORTH', '$

In [159]:
name={'Ye':26, 'Steven':31}



In [160]:
name.clear()


In [162]:
name

{}