In [31]:
import json
from urllib.parse import quote

import bs4
import numpy as np
import pandas as pd
import requests

In [16]:
# Wikipedia article that is downloaded and parsed as HTML in the cells below.
EDIBLE_FRUITS_URL = 'https://en.wikipedia.org/wiki/List_of_culinary_fruits'
# Wikipedia API endpoint; get_wiki_url() appends the query string to it.
WIKIPEDIA_API_URL = 'https://en.wikipedia.org/w/api.php'

In [3]:
# Download the fruit-list article.
# requests waits indefinitely by default; the timeout (seconds) keeps a stalled
# connection from hanging the kernel and raises requests.exceptions.Timeout instead.
response = requests.get(EDIBLE_FRUITS_URL, timeout=30)
if response.status_code == 200:
    print ("Get Successfull")
else:
    print ("Get Failed")

Get Successfull


In [4]:
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns about it), so the resulting tree could
# differ from one machine to another. "html.parser" ships with Python.
resp_soap = bs4.BeautifulSoup(response.text, "html.parser")

In [5]:
# Sanity check: display the parsed page's <title> element.
resp_soap.title

<title>List of culinary fruits - Wikipedia</title>

###### Header of the first table

In [6]:
# First <tr> inside the <tbody> of the first <table> in the document;
# on this page that row holds the <th> column headers (see output).
resp_soap.table.tbody.tr

<tr>
<th><a href="/wiki/Common_name" title="Common name">Common name</a>
</th>
<th><a href="/wiki/Binomial_nomenclature" title="Binomial nomenclature">Species name</a>
</th>
<th><a href="/wiki/Cultivar" title="Cultivar">Cultivar list</a>
</th></tr>

In [7]:
def add_to_dictionary(table_dict, row=None):
    """Merge one table row into ``table_dict``, keyed by the row's first item.

    Args:
        table_dict: Dict mapping ``row[0]`` to ``[row[1], row[2]]``; it is
            updated in place.
        row: Sequence with at least three string items. When omitted, the
            module-level ``row_list`` is used, as the original version did.

    Returns:
        None. A key seen for the first time stores ``[row[1], row[2]]``.
        A repeated key keeps its stored first value and has ``row[2]``
        appended to its second value, separated by ``", "``.
    """
    if row is None:
        # Legacy fallback: plain global lookup, NameError if it is not defined.
        row = row_list

    key = row[0]
    if key in table_dict:
        # The original read from `pos_dict`, a name that is never defined,
        # so every repeated key raised NameError.
        table_dict[key][1] = table_dict[key][1] + ", " + row[2]
    else:
        table_dict[key] = [row[1], row[2]]

In [8]:
def process_row(row, ncolumns=0):
    """Return the stripped text of the Tag children of ``row``.

    Children that are not ``bs4.element.Tag`` instances (mostly the newline
    strings between cells) are skipped. When ``ncolumns`` is positive,
    collection stops as soon as that many cells have been gathered;
    otherwise every Tag child is included.
    """
    texts = []
    for cell in row:
        if not isinstance(cell, bs4.element.Tag):
            continue  # text node between cells, not a cell itself
        texts.append(cell.get_text(strip=True))
        if 0 < ncolumns <= len(texts):
            break
    return texts

In [9]:
def process_table(soap_msg_table, ncolumns=0):
    """Convert the Tag children of ``soap_msg_table`` into a list of rows.

    Each child that is a ``bs4.element.Tag`` is passed to ``process_row``
    together with ``ncolumns``; other children (mostly newline strings)
    are ignored. The per-row results are returned in document order.
    """
    return [
        process_row(node, ncolumns)
        for node in soap_msg_table
        if isinstance(node, bs4.element.Tag)
    ]

In [10]:
# Gather the data rows of every table on the page and combine them once.
# DataFrame.append was removed in pandas 2.0 and re-copied the growing frame
# on every iteration; collecting the pieces and calling pd.concat avoids both.
table_frames = []
header = None
for table in resp_soap.find_all("table"):
    if table.tbody is None:
        continue  # no <tbody> to iterate over
    common_spices = pd.DataFrame(process_table(table.tbody, ncolumns=3))
    if common_spices.empty:
        continue  # no rows, so there is no header row to read either
    header = common_spices.loc[0]  # row 0 is treated as the column titles
    table_frames.append(common_spices[1:])

if table_frames:
    # Row labels restart at 1 for each table, exactly as with the old append loop.
    all_common_spices = pd.concat(table_frames)
    all_common_spices.columns = header  # titles taken from the last table processed
else:
    all_common_spices = pd.DataFrame()

In [11]:
# Preview the first rows of the combined table.
all_common_spices.head()

Unnamed: 0,Common name,Species name,Cultivar list
1,Apple,Malus pumila,Apple cultivars
2,Chinese quince,Pseudocydonia sinensis,
3,Chokeberry,Aronia melanocarpa,
4,Cocky apple,Planchonia careya,
5,Eastern mayhaw,Crataegus aestivalis,


In [12]:
# (rows, columns) of the combined table.
all_common_spices.shape

(422, 3)

In [13]:
# Spot check: show the row(s) whose 'Common name' is exactly 'Saguaro fruit'.
all_common_spices[all_common_spices['Common name'].eq('Saguaro fruit')]

Unnamed: 0,Common name,Species name,Cultivar list
7,Saguaro fruit,Carnegiea gigantea,


###### Get the data using the Wikipedia API

In [43]:
def get_wiki_url(query):
    """Build the ``action=parse`` / ``format=json`` API URL for a page title.

    Args:
        query: Page title to request; it is converted with ``str()``.

    Returns:
        ``WIKIPEDIA_API_URL`` followed by the query string. The title is
        percent-encoded so that spaces, ``&``, ``#`` or non-ASCII characters
        cannot truncate or corrupt the query string.
    """
    page = quote(str(query), safe="")
    return WIKIPEDIA_API_URL + f'?action=parse&format=json&page={page}'

In [44]:
# Fetch the 'Apple' page through the API. This rebinds `response`, which
# earlier held the fruit-list download.
# requests waits indefinitely by default; the timeout (seconds) keeps a stalled
# connection from hanging the kernel and raises requests.exceptions.Timeout instead.
response = requests.get(get_wiki_url('Apple'), timeout=30)
if response.status_code == 200:
    print ("Get Successfull")
else:
    print ("Get Failed")

Get Successfull


In [51]:
# Decode the JSON text of the API response into Python objects.
json_response = json.loads(response.text)

In [56]:
# The page's HTML is stored as a string under parse -> text -> '*' (see output).
json_response['parse']['text']['*']

'<div class="mw-parser-output"><div role="note" class="hatnote navigation-not-searchable">This article is about the fruit and the tree. For the technology company, see <a href="/wiki/Apple_Inc." title="Apple Inc.">Apple Inc.</a> For other uses, see <a href="/wiki/Apple_(disambiguation)" class="mw-disambig" title="Apple (disambiguation)">Apple (disambiguation)</a>, <a href="/wiki/Apple_Blossom_(disambiguation)" class="mw-redirect mw-disambig" title="Apple Blossom (disambiguation)">Apple Blossom (disambiguation)</a>, and <a href="/wiki/Apple_tree_(disambiguation)" class="mw-disambig" title="Apple tree (disambiguation)">Apple tree (disambiguation)</a>.</div>\n<p class="mw-empty-elt">\n</p>\n<div class="shortdescription nomobile noexcerpt noprint searchaux" style="display:none">edible fruit of domesticated deciduous tree</div>\n<p class="mw-empty-elt">\n\n\n</p>\n<table class="infobox biota" style="text-align: left; width: 200px; font-size: 100%">\n\n<tbody><tr>\n<th colspan="2" style="tex

In [58]:
# Parse the HTML returned by the API. Note that this rebinds `resp_soap`,
# which the earlier table-scraping cells use for the fruit-list page.
# The parser is named explicitly: without it BeautifulSoup picks whichever
# parser happens to be installed (and warns), so the tree could vary by machine.
resp_soap = bs4.BeautifulSoup(json_response['parse']['text']['*'], "html.parser")
resp_soap.prettify()

'<div class="mw-parser-output">\n <div class="hatnote navigation-not-searchable" role="note">\n  This article is about the fruit and the tree. For the technology company, see\n  <a href="/wiki/Apple_Inc." title="Apple Inc.">\n   Apple Inc.\n  </a>\n  For other uses, see\n  <a class="mw-disambig" href="/wiki/Apple_(disambiguation)" title="Apple (disambiguation)">\n   Apple (disambiguation)\n  </a>\n  ,\n  <a class="mw-redirect mw-disambig" href="/wiki/Apple_Blossom_(disambiguation)" title="Apple Blossom (disambiguation)">\n   Apple Blossom (disambiguation)\n  </a>\n  , and\n  <a class="mw-disambig" href="/wiki/Apple_tree_(disambiguation)" title="Apple tree (disambiguation)">\n   Apple tree (disambiguation)\n  </a>\n  .\n </div>\n <p class="mw-empty-elt">\n </p>\n <div class="shortdescription nomobile noexcerpt noprint searchaux" style="display:none">\n  edible fruit of domesticated deciduous tree\n </div>\n <p class="mw-empty-elt">\n </p>\n <table class="infobox biota" style="text-align