In [10]:
import requests
from bs4 import BeautifulSoup
import re

# REGEX compiled expressions

# Hashtag such as '#SDGAction31': '#', one or more ASCII letters, then one or
# more digits. The letter class is [a-zA-Z]; the earlier [a-zA-z] range also
# admitted the characters between 'Z' and 'a' ('[', '\', ']', '^', '_', '`').
tweet_re = re.compile(r'#[a-zA-Z]+[0-9]+')

# Goal reference such as 'Goal 14'.
goals_re = re.compile(r'Goal [0-9]+')

# Opening tag of the sub-headline <div>; get_description() replaces it with ' : '.
des_separator_re = re.compile(r'<div id="subHeadline">')
# Any single HTML tag.
htmltag_re = re.compile(r'<[^>]*>')
# A run of one or more line-break characters.
newline_re = re.compile(r'[\n\r]+')

#*****************************************

# FUNCTIONS

# Title
def get_title():
    """Return the stripped text of the element whose id is 'headline'.

    Reads the module-level `soup` of the page currently being processed.
    """
    headline = soup.find(id='headline')
    title_text = headline.get_text()
    return title_text.strip()

# Goals
def get_goals():
    """Return the goals of the current page as one comma-joined string.

    Reads the module-level `r`, `soup`, `home_right_raw` and `goals_re`.

    * When `r.history` is non-empty and the page title contains
      'the ocean conference' (case-insensitive): 'Goal 14' followed by every
      `goals_re` match found in `home_right_raw` from the text 'Other SDGs'
      onwards.
    * Otherwise: the text of every <strong> inside the element with id
      'targets'.
    """
    has_history = len(r.history)
    if not (has_history and 'the ocean conference' in soup.title.text.lower()):
        targets = soup.find(id='targets')
        return ','.join(strong.get_text() for strong in targets.findAll('strong'))

    # Only the part of the markup after the 'Other SDGs' marker is scanned.
    marker_pos = home_right_raw.index('Other SDGs')
    extra_goals = goals_re.findall(home_right_raw[marker_pos:])
    return ','.join(['Goal 14'] + extra_goals)

# Partners
def get_partners():
    """Return the partner names of the current page joined with ' || '.

    Reads the module-level `home_right_raw_lst`, `r` and `soup`. The names are
    the entries after the line 'Partners' up to (not including) the line
    'Ocean Basins' when `r.history` is non-empty and the page title contains
    'the ocean conference', or up to the line 'Countries' otherwise. Each
    entry is stripped and entries that end up empty are dropped.
    """
    start = home_right_raw_lst.index('Partners') + 1

    if len(r.history) and 'the ocean conference' in soup.title.text.lower():
        end_marker = 'Ocean Basins'
    else:
        end_marker = 'Countries'
    stop = home_right_raw_lst.index(end_marker)

    names = []
    for entry in home_right_raw_lst[start:stop]:
        cleaned = entry.strip()
        if cleaned:
            names.append(cleaned)

    return ' || '.join(names)

# Description
def get_description():
    """Return the description text of the current page.

    Reads the module-level `soup` and looks at the <div class="wrap"> inside
    the element with id 'intro'. If its markup contains a
    '<div id="subHeadline">' tag, that tag is replaced with ' : ', all other
    tags are removed, runs of line breaks collapse to a single newline and the
    result is stripped. Otherwise the element's plain text is returned,
    stripped.
    """
    wrap = soup.find(id='intro').find('div', attrs={'class': 'wrap'})
    markup = str(wrap)

    if '<div id="subHeadline">' not in markup:
        return wrap.get_text().strip()

    # The separator must be substituted before tags are stripped, otherwise
    # the sub-headline tag would already be gone.
    text = des_separator_re.sub(' : ', markup)
    text = htmltag_re.sub('', text)
    text = newline_re.sub('\n', text)
    return text.strip()
    
def get_resources():
    """Return the resources of the current page joined with ' || '.

    Reads the module-level `soup`. Every direct child <div> of the element
    with id 'resources' contributes one entry: its text, stripped, with each
    run of newlines replaced by ' : '. Children whose text is empty or
    whitespace-only are skipped, so they cannot produce blank entries in the
    joined result.
    """
    resources_raw = soup.find(id='resources')
    resources_lst = []

    for resource in resources_raw.findAll('div', recursive=False):
        # Strip first: testing the raw text against '' would let
        # whitespace-only blocks through as empty entries.
        text = resource.get_text().strip()
        if text:
            resources_lst.append(re.sub(r'\n+', ' : ', text))

    return ' || '.join(resources_lst)


def get_timeframe():
    """Return the one entry of `home_right_raw_lst` that contains 'Time-frame'.

    Reads the module-level `home_right_raw_lst`. If no entry matches, or more
    than one does, the placeholder 'Time-frame: ' is returned instead.
    """
    matches = [entry for entry in home_right_raw_lst if re.search('Time-frame', entry)]
    if len(matches) == 1:
        return matches[0]
    return 'Time-frame: '

def get_countries():
    """Return the countries of the current page as one comma-joined string.

    Reads the module-level `home_right_raw_lst` and joins the entries that lie
    after the line 'Countries' and before the line 'Contact information'.
    Returns '' when either of those lines is absent.
    """
    try:
        countries_index = home_right_raw_lst.index('Countries') + 1
        next_index = home_right_raw_lst.index('Contact information')
    except ValueError:
        # list.index raises ValueError for a missing marker line; only that
        # case means "no countries". Anything else is a real error and
        # should surface instead of being silently turned into ''.
        return ''

    return ",".join(home_right_raw_lst[countries_index:next_index])
    
def get_hashtag():
    """Return the first entry of `home_right_raw_lst` that `tweet_re` matches.

    Reads the module-level `home_right_raw_lst` and `tweet_re`; the pattern is
    applied with `match`, i.e. anchored at the start of each entry. Returns ''
    when no entry matches.
    """
    # next() with a default expresses "first match or ''" directly, so no
    # exception handler is needed and unrelated errors are not hidden.
    return next((entry for entry in home_right_raw_lst if tweet_re.match(entry)), '')

In [12]:
#*****************************************

# SETTING UP VARIABLES
from random import sample

# A page URL is base_url followed by the project id.
base_url = 'https://sustainabledevelopment.un.org/partnership/?p='

# Column extractors, in the order their values are written to each row.
functions = [get_title, get_goals, get_partners, get_description,
             get_resources, get_timeframe, get_countries, get_hashtag]

# Whitespace-separated project ids; the context manager closes the file
# instead of leaving the handle open for the lifetime of the kernel.
with open('good_ids.txt') as ids_file:
    ids = ids_file.read().split()

# Random subset of ids to scrape. sample() raises ValueError when asked for
# more items than the population holds, so the size is capped at len(ids).
SAMPLE_SIZE = 50
sub_ids = sample(ids, min(SAMPLE_SIZE, len(ids)))

In [25]:
# Seconds to wait for the server before giving up on a page; without a
# timeout requests.get() can block indefinitely.
REQUEST_TIMEOUT = 30

# The context manager guarantees data.csv is flushed and closed even when a
# page raises part-way through the loop.
with open('data.csv', 'w') as data_file:

    data_file.write('Project_idx\tTitle\tGoals\tPartners\tDescription\tResources\tTime_frame\tCountries\tHashtag\n')

    for ide in sub_ids:

        # Left-pad the id with zeros to at least five characters.
        project_idx = ide.rjust(5, '0')

        url = base_url + ide

        # NOTE: the extractor functions read `r`, `soup`, `home_right_raw`
        # and `home_right_raw_lst` as module-level globals, so these exact
        # names must be rebound here for every page.
        r = requests.get(url, timeout=REQUEST_TIMEOUT)
        raw_data = r.text

        soup = BeautifulSoup(raw_data, 'html.parser')

        home_right = soup.find('div', attrs={'class':'homeRight'})

        # Markup of the block as one string (scanned by get_goals).
        home_right_raw = str(home_right)

        # Text of the block as a list of its non-empty lines.
        home_right_raw_lst = home_right.getText().split('\n')
        home_right_raw_lst = list(filter(None, home_right_raw_lst))

        # Every value is written through repr(), which escapes embedded tabs
        # and newlines so they cannot break the tab-separated layout.
        row = [repr(f()) for f in functions]
        row = '\t'.join(row)
        row = repr(project_idx) + "\t" + row + '\n'
        data_file.write(row)

In [29]:
import pandas as pd

In [30]:
# Load the scraped rows back from the tab-delimited file written above.
daata = pd.read_csv('data.csv', delimiter='\t')

In [31]:
# Preview only the first rows instead of dumping the whole frame.
daata.head(10)

Unnamed: 0,Title,Goals,Partners,Description,Resources,Time_frame,Countries,Hashtag
0,u'Saving 1 Megawatt-hour of Power every day','',u'Tarumitra',': Description/achievement of initiative\nAs o...,"u'Financing (in USD) : 100,000 USD || Staff / ...",u'Time-frame: - 2015-03-30','',u'#SDGAction31'
1,u'Without Water There Is No Life - Educational...,'',"u""-\tThe International Health Awareness Networ...",': Description/achievement of initiative\nThe ...,'',u'Time-frame: - 2013-06-01','',u'#SDGAction32'
2,"u""Promoting sustainability through lthe UK's l...",'',u'Sustainability Links',': Description/achievement of initiative\nLink...,'',u'Time-frame: - 2012-12-31','',u'#SDGAction33'
3,u'Promotion of Green Productivity for Sustaina...,'',"u'Asian Productivity Organization, Tokyo, Japan'",': Description/achievement of initiative\nSMEs...,u'Staff / Technical expertise : Training & cap...,u'Time-frame: - 2020-04-09','',u'#SDGAction36'
4,"u""A Youth Delegation's Commitment to Mission L...",'',u'Students on Ice Alumni Delegation',': Description/achievement of initiative\nFulf...,"u'Other, please specify : Volunteer efforts by...",u'Time-frame: - 2012-06-25','',u'#SDGAction60'
5,u'Preparedness for Environmental Emergencies','',u'Green Cross International with UNEP Awarenes...,': Description/achievement of initiative\nRedu...,u'Staff / Technical expertise : 2-3 staff || F...,u'Time-frame: - 2015-12-31','',u'#SDGAction62'
6,u'International Ecocity Framework and Standard...,'',"u'Ecocity Builders, ICLEI, Livable Cities Netw...",': Description/achievement of initiative\nCiti...,u'Staff / Technical expertise : Ecocity Builde...,u'Time-frame: - 2017-12-29','',u'#SDGAction66'
7,u'Permanent Plan of Awareness of the Climate E...,'',"u'Jean Carlos Dimenez Fuentes, Carolina Guillo...",': Description/achievement of initiative\nBy p...,"u'Financing (in USD) : 14,000 USD'",u'Time-frame: - 2016-05-17','',u'#SDGAction69'
8,u'Vision 2050: A new agenda for business in Br...,'',u'BCSD-Brazil',': Description/achievement of initiative\nBraz...,u'Staff / Technical expertise : More than 450 ...,u'Time-frame: - 2050-01-01','',u'#SDGAction70'
9,u'The SustainAGRO Initiative','',u'SustainAGRO Initiative (including the Instit...,': Description/achievement of initiative\nSust...,"u'Staff / Technical expertise : Scientists, jo...",u'Time-frame: - 2012-07-31','',u'#SDGAction73'


In [None]:
1