# Final Project: Web Service APIs
## Secondary data sources notebook
by Lin Ma and Khanh Tran

In addition to our primary data source, we used Wikipedia as a secondary data source. This notebook contains the functions we used to scrape an additional list of the most popular artists on Spotify (https://en.wikipedia.org/wiki/List_of_most-streamed_artists_on_Spotify).

The following cells show how we scrape a table from Wikipedia.

In [3]:
import requests
from lxml import etree
import io
import pandas as pd


In [7]:
def parseHTML(table_url, timeout=10):
    """ This function takes in an url, parses the data into an element tree
    and returns the root element of that tree.
    Parameters:
        table_url: a given url
        timeout: seconds to wait for the server before giving up (default 10);
            without it a stalled connection would block forever
    Return: root element of etree, or None if the response status is not 200
    Raises: whatever requests raises on network failure or timeout
        (these are not swallowed here)
    """
    # Always pass a timeout: requests waits indefinitely by default.
    resp = requests.get(table_url, timeout=timeout)
    if resp.status_code != 200:
        return None
    # remove_blank_text drops whitespace-only text nodes between elements,
    # which keeps later text() XPath queries free of blank entries.
    parser = etree.HTMLParser(remove_blank_text=True)
    tree = etree.parse(io.BytesIO(resp.content), parser)
    return tree.getroot()

In [5]:
def getColumnName(root):
    """ This function reads the first four header cells of the target table
    and returns their labels.
    Parameter:
        root: a tree element (anything exposing an xpath method)
    Return: a list of column names; a header cell that yields neither one
        nor two text nodes contributes nothing to the list
    """
    template = '//table[@class="wikitable sortable"][4]/tbody/tr[1]/th[{}]/text()'
    headers = []
    for position in (1, 2, 3, 4):
        pieces = root.xpath(template.format(position))
        # A header may be split over two text nodes; join those with a space.
        if len(pieces) in (1, 2):
            headers.append(' '.join(piece.strip() for piece in pieces))
    return headers

In [6]:
def getRank(root):
    """ This function extracts the rank of every data row of the target table.
    Parameter:
        root: a tree element (anything exposing an xpath method)
    Return: a list of ranks as integers
    """
    rank_cells = root.xpath('//table[@class="wikitable sortable"][4]/tbody/tr[position()>1]/td[1]/text()')
    ranks = []
    for cell_text in rank_cells:
        # Dots are removed before conversion (e.g. "1." becomes 1).
        ranks.append(int(cell_text.replace('.', '')))
    return ranks

In [5]:
def getArtist(root):
    """ This function extracts the artist names from the target table.
    Parameter:
        root: a tree element (anything exposing an xpath method)
    Return: a list of artist names, taken from the link text in the second
        cell of every row after the header row
    """
    artist_links = root.xpath('//table[@class="wikitable sortable"][4]/tbody/tr[position()>1]/td[2]/a/text()')
    return list(artist_links)

In [6]:
def getFollower(root):
    """ This function extracts the follower counts from the target table.
    Parameter:
        root: a tree element (anything exposing an xpath method)
    Return: a list of follower counts as floats, read from the third cell
        of every row after the header row
    """
    follower_cells = root.xpath('//table[@class="wikitable sortable"][4]/tbody/tr[position()>1]/td[3]/text()')
    return list(map(float, follower_cells))

In [7]:
def getCountry(root):
    """ This function extracts the countries from the target table.
    Parameter:
        root: a tree element (anything exposing an xpath method)
    Return: a list of countries, taken from the title attribute of the link
        in the fourth cell of every row after the header row
    """
    country_titles = root.xpath('//table[@class="wikitable sortable"][4]/tbody/tr[position()>1]/td[4]/a/@title')
    return list(country_titles)

In [37]:
def createDict(columns,root):
    """ This function pairs each of the first four column names with the list
    extracted from the tree for that column.
    Parameters:
        columns: a list of column names, in the order rank, artist,
            followers, country
        root: a tree element
    Return: a dictionary mapping column name to its list of values
    """
    table = {}
    # The extractors are listed in the same order as the expected columns.
    extractors = (getRank, getArtist, getFollower, getCountry)
    for position, extract in enumerate(extractors):
        table[columns[position]] = extract(root)
    return table

In [46]:
def toDataFrame(DICT,indexCol):
    """ This function builds a pandas dataframe from a dictionary and uses
    one of its columns as the index.
    Parameters:
        DICT: a dictionary of column name -> list of values
        indexCol: name of the column that becomes the dataframe index
    Return: a dataframe indexed by indexCol
    """
    frame = pd.DataFrame(DICT)
    return frame.set_index(indexCol)

In [50]:
def toCSV(df, path='proj3_followers.csv', index=False):
    """ This function takes in a dataframe and exports a csv file.
    Parameters:
        df: a dataframe
        path: destination file (default 'proj3_followers.csv')
        index: whether to write the dataframe index as the first column.
            The default False keeps the original behaviour, which means the
            index is NOT written to the file; pass True to include it.
    Return: None
    """
    df.to_csv(path_or_buf=path, index=index)

In [None]:
def toJSON(DICT, path=".json"):
    """ This function takes in a dictionary and writes it to a JSON file.
    Parameters:
        DICT: a JSON-serialisable dictionary
        path: destination file; defaults to ".json", the name that was
            previously hard-coded
    Return: None
    """
    # Imported locally because the notebook's import cell does not load json.
    import json

    with open(path, "w", encoding="utf-8") as filex:
        json.dump(DICT, filex)

In [51]:
def main(url):
    """ This function takes in an url, retrieves the HTML and extracts the data
    then yields a pandas dataframe, prints it and exports it to csv.
    Parameters:
        url: a given url
    Return: None (nothing is printed or written when the page could not
        be retrieved)
    """
    root = parseHTML(url)
    # parseHTML returns None on a non-200 response; check before any helper
    # touches the tree, otherwise root.xpath raises AttributeError.
    if root is None:
        return
    headers = getColumnName(root)
    data = createDict(headers, root)
    # The first header is used as the dataframe index.
    df = toDataFrame(data, headers[0])
    print(df)
    toCSV(df)

In [52]:
# Run the pipeline on the Wikipedia list of most-streamed Spotify artists:
# main prints the resulting dataframe and exports it to csv.
main("https://en.wikipedia.org/wiki/List_of_most-streamed_artists_on_Spotify")

               Artist              Country  Followers (millions)
Rank                                                            
1          Ed Sheeran       United Kingdom                  36.8
2               Drake               Canada                  29.2
3             Rihanna             Barbados                  28.1
4       Justin Bieber               Canada                  25.5
5              Eminem        United States                  21.8
6       Ariana Grande        United States                  21.0
7          Bruno Mars        United States                  18.1
8        David Guetta               France                  16.6
9             Beyoncé        United States                  16.4
10       Taylor Swift        United States                  16.2
11           Coldplay       United Kingdom                  15.9
12            Shakira             Colombia                  14.3
13           Maroon 5        United States                  14.2
14      Calvin Harris    