In [None]:
!pip install textblob
!pip install nltk
!pip install wikipedia

In [None]:
from textblob import TextBlob
from nltk.tokenize import sent_tokenize
import nltk
# sent_tokenize needs the Punkt sentence models. Older NLTK releases load the
# 'punkt' resource while newer ones load 'punkt_tab', so fetch both; a resource
# that is already present is simply reported as up to date.
for punkt_resource in ('punkt', 'punkt_tab'):
    nltk.download(punkt_resource)

In [None]:
def rank_sentences(text_dict, top=5):
    '''
    Pick the most positively-worded sentences out of a scraped article.

    text_dict(dict): section name -> text mapping; pass in what the scrape function returns
    top(int): how many sentences to return at most. Default is 5.

    How it works: the text of every section is joined (space-separated) into one string,
    which is then split into sentences. Each sentence gets a TextBlob polarity score, the
    sentences are ordered from highest to lowest score, and the first `top` of them are
    returned as (cleaned sentence, score) pairs.
    '''

    combined = ' '.join(text_dict.values())

    scored = []
    for sentence in sent_tokenize(combined):
        scored.append((sentence, TextBlob(sentence).polarity))

    # Stable sort: sentences with equal polarity keep their original order.
    scored.sort(key=lambda pair: pair[1], reverse=True)

    return [(clean_text(sentence), polarity) for sentence, polarity in scored[:top]]

def clean_text(text):
    '''Remove the literal abbreviation "e.g." and any ===title=== markers from text.

    text (str): the text to clean
    RETURNS: str with every "e.g." removed, every run of "=" + letters/digits/whitespace
             + run of "=" removed, and leading/trailing whitespace stripped
    '''
    # Plain substring replacement: used as a regex, 'e.g.' treats each dot as a
    # wildcard and would also delete ordinary words such as "eggs".
    text = text.replace('e.g.', '')
    return re.sub(r'(=)+([a-zA-Z0-9\s]*)(=)+', '', text).strip()

In [None]:
import wikipedia
import re
def scrape(name, filtered_sections = False):
    '''Fetch the top Wikipedia search hit for name and split its text by section.

    name (str): name of plant; used as the Wikipedia search query
    filtered_sections: False (or None) if you want info from all sections, or list of
        strings (section titles) if you only want some info

    RETURNS: dict mapping section title -> section text. The text before the first
        heading is stored under 'Summary'. Only "== Title ==" headings whose title
        consists of letters, digits and whitespace start a new entry; any other
        heading stays inside the text of the section before it. When
        filtered_sections is a list, only the requested titles that exist are
        returned and a message is printed for each one that does not.

    RAISES: IndexError if the search returns no results.
    '''
    search_hits = wikipedia.search(name)
    if not search_hits:
        # Same exception type the bare [0] lookup would raise, but with a message
        # that says what actually went wrong.
        raise IndexError('no Wikipedia search results for ' + repr(name))
    top_wiki = search_hits[0]

    # The title comes straight from the search results, so do not let the
    # library "auto-suggest" a different title for it.
    all_content = wikipedia.page(top_wiki, auto_suggest=False).content

    # One capture group => re.split returns [intro, title1, text1, title2, text2, ...]
    section_split = re.split(r'\n\n\n== ([a-zA-Z0-9\s]*) ==\n', all_content)

    content_dict = {'Summary': section_split[0]}
    content_dict.update(zip(section_split[1::2], section_split[2::2]))

    if filtered_sections is False or filtered_sections is None:
        return content_dict

    filtered = {}
    for section in filtered_sections:
        if section in content_dict:
            filtered[section] = content_dict[section]
        else:
            print('the section: '+ section + ' does not exist for '+ name)

    return filtered

In [None]:
# Example run: scrape the top Wikipedia hit for this query (no section filter),
# then display the default top-5 (sentence, polarity score) pairs.
text_dict = scrape('Golden shower tree')
rank_sentences(text_dict)

In [None]:
# Scratch check of the ===title=== pattern on a sample sentence that still
# carries its section heading.
text = (
    '=== Culinary uses ===\n'
    'Besides making an excellent forage crop for livestock, '
    'its leaves and flowers are a valuable survival food: '
    'they are high in proteins, and are widespread and abundant.'
)

# Match object for the first heading marker found in the sample.
regex = re.search(pattern=r'(=)+([a-zA-Z0-9\s]*)(=)+', string=text)

# Same pattern, this time removing the heading and trimming the leftover newline.
new_text = re.sub(pattern=r'(=)+([a-zA-Z0-9\s]*)(=)+', repl='', string=text).strip()
new_text