# Web scraping practices

https://s3.amazonaws.com/mimirplatform.production/files/2a5e3ca3-aac7-4a7e-9cc4-d2821c6f97b7/190103_Scraping_Notebook_v1.html

## Materials

In [None]:
import os
import requests
import re

In [None]:
import warnings
warnings.filterwarnings("ignore") # ignore warnings

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
from bs4 import BeautifulSoup
from datetime import datetime
from wordcloud import WordCloud

## Scraping

In [None]:
# Sample contact card used as regex input: one physical line per list entry,
# joined with newlines (name, separator rule, address, phones, e-mail).
string1 = '\n'.join([
    'Edgar X. AMPLE',
    '---------------------------------',
    '12th Corner Street, London (UK)',
    '+44 (0) 80 400 103 / +44 590 34 56 78',
    'example@email.com',
])
print(string1)

In [None]:
# Name: capitalised first name (optionally hyphenated), a middle initial
# followed by a dot, then an all-caps surname.
re_name = re.compile(r'[A-Z][a-z]+(?:-[A-Z][a-z]+)? [A-Z]\. [A-Z]+')
# Phone: the lookarounds on the first and last lines keep a match from
# starting or ending in the middle of a longer run of digit groups.
re_phone = re.compile(
    r'(?:^|(?<=[^0-9]\s))'
    r'(?:\+[0-9]{2} (?:\(0\) )?|0)[0-9]{2,3}(?: /)?(?: [0-9]{2,3}){2,3}'
    r'(?:$|(?=\s[^0-9]))'
)
# E-mail: lowercase local part with at most one dot, and a 2-3 letter TLD.
re_email = re.compile(r'\b[a-z]+(?:\.[a-z]+)?@[a-z]+\.[a-z]{2,3}\b')

re_dict = {'name': re_name, 'phone': re_phone, 'e-mail': re_email}

# One row per match: which pattern hit, the matched text, and its span in string1.
result_df = pd.DataFrame(
    [
        [label, hit.group(), hit.start(), hit.end()]
        for label, pattern in re_dict.items()
        for hit in pattern.finditer(string1)
    ],
    columns=['Type', 'Match', 'Start', 'End'],
)
result_df

## Regular Expressions

## Case Study

# Exercises

Q1

In [2]:
# start your code with the following lines
import re

# Sample codes that the Q1 patterns are run against.
test_str = [
    'abc ',
    'cba',
    'a:xa|bc_',
    'ca_b__b',
    '/ccaba',
    'ab_x_ba-cab',
    '|a-b-c|',
    'a/b-ac+a',
    'c.a.a.cb',
    '_bbac/',
    'ab_ccab_abc',
    'abcbcbabc',
    'b///b//a',
]

In [None]:
# write your code here

# Pattern 1: first and last characters must be in a-c, with at least one
# character of any kind between them.
re_p1 = r'^[a-c].+[a-c]$'

# Patterns 2-4 all start out as empty strings.
re_p2 = re_p3 = re_p4 = ''

In [3]:
# Keep the codes that start with a, b, or c and end with a, b, or c
# (the pattern requires at least one character between the two ends).
re_p1 = list(filter(re.compile('^[a-c].+[a-c]$').match, test_str))

In [4]:
re_p1

['cba',
 'ca_b__b',
 'ab_x_ba-cab',
 'a/b-ac+a',
 'c.a.a.cb',
 'ab_ccab_abc',
 'abcbcbabc',
 'b///b//a']

In [None]:
# match all codes that contain the letters a, b, and c either in alphabetical order or in reverse alphabetical order, with possibly any character(s) between the letters

In [None]:
# Keep the codes that contain the letters a, b and c in alphabetical order
# (a ... b ... c) or in reverse alphabetical order (c ... b ... a), with any
# characters (or none) between the letters. The previous pattern required a
# literal '<' as the second character, so it could never match a test string.
re_p2 = [s for s in test_str if re.search(r'a.*b.*c|c.*b.*a', s)]

In [None]:
# Q1 pattern 3: currently an empty string. TODO: write the pattern.
re_p3 = ''

In [None]:
# Q1 pattern 4: currently an empty string. TODO: write the pattern.
re_p4 = ''

Q2

In [5]:
# start your code with the following lines
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Column names for the scraped table (presumably the columns of the
# `partners_df` frame the exercise expects; confirm when it is built).
df_cols = ['country', 'city', 'school', 'description',
           'entry_date', 'url_partner_page', 'url_fact_sheet']
# write your code here
url = "https://www.qtem.org/en/academic-partners"
# Bound the wait so an unresponsive server cannot hang the notebook, and fail
# loudly on an HTTP error instead of parsing an error page as if it were data.
r = requests.get(url, timeout=30)
r.raise_for_status()

In [6]:
# Body of the response `r`, decoded to a str; this is the page's HTML source.
html_content = r.text

In [7]:
html_content

'\r\n\r\n\r\n<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\r\n<html xmlns="http://www.w3.org/1999/xhtml" lang="nl">\r\n\t<head>\r\n\t\t\r\n\r\n\t\t\t<title>Academic Partners - Qtem</title>\r\n\t\t\t<meta name="viewport" content="initial-scale=1.0, width=device-width" />\r\n\r\n\t\t\t<meta name="description" content="QTEM, Quantitative Techniques for Economics and Management, is an international network bringing together outstanding students, Academic Partners and International Corporations.">\r\n\t\t\t<meta name="keywords" content="QTEM,Quantitative,Techniques,Economics,Management,Masters,Network">\r\n\r\n\t\t\t<meta property="og:image" content="https://qtem.org/img/QTEM_logo@2x.png"/>\r\n\t\t\t<meta property="og:title" content="Qtem" />\r\n\t\t\t<meta property="og:description" content="QTEM, Quantitative Techniques for Economics and Management, is an international network bringing together outstanding students

In [None]:
# you can end your code with the following lines
# Sanity check: prints True when len(qtem_ap_tags) equals partners_df.shape[0]
# (presumably one table row per scraped partner tag; confirm).
# NOTE(review): qtem_ap_tags and partners_df are not defined in the visible
# cells; they must be created before this cell runs.
print(len(qtem_ap_tags) == partners_df.shape[0])
partners_df.head()