In [4]:
import requests 
import bs4
import pandas as pd
import numpy as np
import json
import matplotlib 
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

In [21]:
# MediaWiki API endpoint used for every request in this notebook.
WIKIPEDIA_API_URL = 'https://en.wikipedia.org/w/api.php'
# Human-readable article: https://en.wikipedia.org/wiki/List_of_culinary_fruits
# The notebook requests the same article through the API's `parse` action so the
# response is JSON instead of a full HTML page.
EDIBLE_FRUITS_URL = WIKIPEDIA_API_URL + '?action=parse&format=json&page=List_of_culinary_fruits'

In [22]:
# Fetch the fruit list. A timeout is set because requests otherwise waits
# forever on an unresponsive server, which would hang the whole notebook run.
response = requests.get(EDIBLE_FRUITS_URL, timeout=30)
if response.status_code == 200:
    print ("Get Successfull")
else:
    print ("Get Failed")

Get Successfull


In [23]:
# The API response is JSON; the rendered article HTML sits under parse -> text -> '*'.
jresponse = json.loads(response.text)
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so results can differ between machines.
# NOTE(review): html.parser does not insert missing <tbody> tags; later cells
# use `table.tbody`, so this assumes the API markup already contains them.
resp_soap = bs4.BeautifulSoup(jresponse['parse']['text']['*'], 'html.parser')

###### Header of the first table

In [28]:
# First <tr> inside the first table's <tbody>: the column-header row.
resp_soap.find("table").find("tbody").find("tr")

<tr>
<th><a href="/wiki/Common_name" title="Common name">Common name</a>
</th>
<th><a href="/wiki/Binomial_nomenclature" title="Binomial nomenclature">Species name</a>
</th>
<th><a href="/wiki/Cultivar" title="Cultivar">Cultivar list</a>
</th></tr>

In [29]:
def add_to_dictionary(table_dict, row=None):
    """Add one row to ``table_dict``, merging rows that share a key.

    ``row[0]`` is the key. A new key is stored as ``[row[1], row[2]]``; for a
    key that already exists, ``row[2]`` is appended to the stored second value,
    separated by ``", "``. The dictionary is modified in place and nothing is
    returned.

    Parameters
    ----------
    table_dict : dict
        Mapping of key -> ``[first_value, second_value]``, updated in place.
    row : sequence, optional
        The row to add; needs at least three items. When omitted, the function
        falls back to a module-level ``row_list`` variable, which is how the
        original version obtained its input.
    """
    if row is None:
        # Legacy call style: read the row from the global `row_list`.
        row = row_list

    key = row[0]
    if key in table_dict:
        # The original read from an undefined `pos_dict` here; the value being
        # extended is the one already stored in `table_dict`.
        table_dict[key][1] = table_dict[key][1] + ", " + row[2]
    else:
        table_dict[key] = [row[1], row[2]]

In [30]:
def process_row(row,ncolumns=0):
    """Extract the cell values of one table row.

    Only tag children of ``row`` are read; string children (mostly newline
    characters between cells) are skipped. Each cell contributes its stripped
    text. The first cell additionally contributes the ``href`` of its first
    link, or ``''`` when it has no link, so the result holds one more entry
    than the number of cells read.

    Parameters
    ----------
    row : iterable
        The children of a table row.
    ncolumns : int, optional
        Maximum number of cells to read; ``0`` (the default) reads them all.

    Returns
    -------
    list
        ``[text_1, href_1, text_2, text_3, ...]``
    """
    cells = []
    seen = 0
    for child in row:
        if not isinstance(child, bs4.element.Tag):
            continue

        seen += 1
        cells.append(child.get_text(strip=True))

        if seen == 1:
            # Keep the link target of the first cell next to its text.
            anchor = child.find('a')
            cells.append(anchor.get_attribute_list('href')[0] if anchor else '')

        if 0 < ncolumns <= seen:
            break

    return cells

In [31]:
def process_table(soap_msg_table,ncolumns=0):
    """Convert every row of a parsed table section into a list of cell values.

    Parameters
    ----------
    soap_msg_table : iterable
        The children of a table section; string children are ignored.
    ncolumns : int, optional
        Passed through to ``process_row`` for every row.

    Returns
    -------
    list of list
        One ``process_row`` result per tag child, in document order.
    """
    return [
        process_row(child, ncolumns)
        for child in soap_msg_table
        if isinstance(child, bs4.element.Tag)  # skip NavigableString children (mostly newlines)
    ]

In [32]:
# Build one frame per table and concatenate them in a single step.
# `DataFrame.append` was removed in pandas 2.0, and growing a frame inside a
# loop copies all accumulated rows on every iteration.
table_frames = []
for table in resp_soap.find_all("table"):
    common_spices = pd.DataFrame(process_table(table.tbody,ncolumns=3))
    # Row 0 of every table is its header row; keep only the data rows.
    table_frames.append(common_spices[1:])

# Each table keeps its own row labels here, exactly as the appended frames did.
all_edible_fruits = pd.concat(table_frames)

# process_row emits the link of the first cell right after its text, hence 'Page'.
all_edible_fruits.columns = ['Common name','Page','Species name','Cultivar list']

In [33]:
# Preview the first rows of the combined fruit table.
all_edible_fruits.head()

Unnamed: 0,Common name,Page,Species name,Cultivar list
1,Apple,/wiki/Apple,Malus pumila,Apple cultivars
2,Chinese quince,/wiki/Chinese_quince,Pseudocydonia sinensis,
3,Chokeberry,/wiki/Chokeberry,Aronia melanocarpa,
4,Cocky apple,/wiki/Cocky_apple,Planchonia careya,
5,Eastern mayhaw,/wiki/Crataegus_aestivalis,Crataegus aestivalis,


In [34]:
# (rows, columns) of the combined fruit table.
all_edible_fruits.shape

(422, 4)

In [35]:
# `.loc` selects by index label, not by position. The rows of each source table
# kept their own labels when the tables were combined, so the label found at
# position 1 is shared by rows from several tables and this returns many rows.
all_edible_fruits.loc[all_edible_fruits.index[1]].head()

Unnamed: 0,Common name,Page,Species name,Cultivar list
2,Chinese quince,/wiki/Chinese_quince,Pseudocydonia sinensis,
2,Acerola,/wiki/Acerola,Malpighia emarginata,
2,Amanatsu,/wiki/Amanatsu,Citrus × natsudaidai,
2,Amazon grape,/wiki/Amazon_grape,Pourouma cecropiifolia,
2,Canary melon,/wiki/Canary_melon,Cucumis melo var. inodorus'Canary Melon',


In [36]:
# Replace the repeated per-table row labels with one continuous 0..n-1 index,
# discarding the old labels instead of keeping them as a column.
all_edible_fruits = all_edible_fruits.reset_index(drop=True)

In [37]:
# Shape of the table once rows whose first cell had no link are left out
# (the frame itself is not modified).
rows_without_page = all_edible_fruits.index[all_edible_fruits['Page'].eq('')]
all_edible_fruits.drop(index=rows_without_page).shape

(419, 4)

In [38]:
# Spot-check a single entry by its common name.
is_saguaro = all_edible_fruits['Common name'] == 'Saguaro fruit'
all_edible_fruits[is_saguaro]

Unnamed: 0,Common name,Page,Species name,Cultivar list
410,Saguaro fruit,/wiki/Carnegiea_gigantea,Carnegiea gigantea,


###### Get the data using the Wikipedia API

In [40]:
def get_wiki_url(end_point):
    """Build the API ``parse`` URL for a wiki page link.

    Parameters
    ----------
    end_point : str
        A page link such as ``/wiki/Apple``. Everything after the first
        ``/wiki/`` is used as the page title, so titles that contain ``/`` are
        kept whole. Without a ``/wiki/`` part the last ``/``-separated segment
        is used. A trailing ``#fragment`` is dropped.

    Returns
    -------
    str
        ``WIKIPEDIA_API_URL`` followed by the ``parse`` query for that title.
    """
    # A fragment points inside a page; it is not part of the page title.
    page_path = end_point.split('#', 1)[0]

    _, wiki_marker, title = page_path.partition('/wiki/')
    if not wiki_marker:
        # Not a /wiki/ link: fall back to the last path segment.
        title = page_path.split('/')[-1]

    return WIKIPEDIA_API_URL + f'?action=parse&format=json&page={title}'

In [41]:
def get_child_pages(end_points):
    """Download the given wiki pages and return their text as one string.

    Each page's URL is printed before it is requested. A request that does not
    return status 200 is reported with "Get Failed" and skipped, as is a
    response without a ``'parse'`` key (presumably an API error payload).

    Parameters
    ----------
    end_points : iterable of str
        Page links accepted by ``get_wiki_url``.

    Returns
    -------
    str
        The text of every successfully parsed page, concatenated in request
        order without a separator.
    """
    page_texts = []
    for ep in end_points:
        url = get_wiki_url(ep)
        print(url)
        # Bound the wait so one unresponsive request cannot stall the whole loop.
        response = requests.get(url, timeout=30)
        if response.status_code != 200:
            print ("Get Failed") # Only report if any failures
            # The body of a failed request is not guaranteed to be JSON.
            continue

        jresponse = json.loads(response.text)

        if 'parse' in jresponse:
            page_html = jresponse['parse']['text']['*']
            # Explicit parser: the default depends on what is installed and warns.
            page_texts.append(bs4.BeautifulSoup(page_html, 'html.parser').get_text())

    # Join once instead of growing a string inside the loop.
    return "".join(page_texts)

In [61]:
# Fetch only a slice of the table (positions 5 through 49) and merge the page texts.
pages_to_fetch = all_edible_fruits['Page'].iloc[5:50]
massive_text = get_child_pages(pages_to_fetch)

https://en.wikipedia.org/w/api.php?action=parse&format=json&page=Crataegus
https://en.wikipedia.org/w/api.php?action=parse&format=json&page=Genipa_americana
https://en.wikipedia.org/w/api.php?action=parse&format=json&page=Loquat
https://en.wikipedia.org/w/api.php?action=parse&format=json&page=Flacourtia_inermis
https://en.wikipedia.org/w/api.php?action=parse&format=json&page=Medlar
https://en.wikipedia.org/w/api.php?action=parse&format=json&page=Malus_niedzwetzkyana
https://en.wikipedia.org/w/api.php?action=parse&format=json&page=Pear
https://en.wikipedia.org/w/api.php?action=parse&format=json&page=Quince
https://en.wikipedia.org/w/api.php?action=parse&format=json&page=Ramontchi
https://en.wikipedia.org/w/api.php?action=parse&format=json&page=Rose_hip
https://en.wikipedia.org/w/api.php?action=parse&format=json&page=Rowan
https://en.wikipedia.org/w/api.php?action=parse&format=json&page=Sapodilla
https://en.wikipedia.org/w/api.php?action=parse&format=json&page=Serviceberry
https://en.wik

In [62]:
# Extra tokens to leave out of the cloud, on top of wordcloud's built-in STOPWORDS.
stopwords = set(STOPWORDS) | {
    'output', 'cs1', 'background', 'parser', 'mw', 'wikimedia',
    'png', 'repeat', 'org', 'url', 'upload', 'edit',
}

# Configure the word cloud: white canvas, at most 2000 words.
response_wc = WordCloud(
    stopwords=stopwords,
    max_words=2000,
    background_color='white',
)

# Build the cloud from the downloaded page text.
response_wc.generate(massive_text)

<wordcloud.wordcloud.WordCloud at 0x7f58006e6978>

```
# 14 x 18 inch canvas for the rendered word cloud.
fig = plt.figure(figsize=(14, 18))
plt.imshow(response_wc, interpolation='bilinear')
plt.axis('off')  # hide the axes around the image
plt.show()
```