## Web scrapping using python

#### References
1. [Practical Introduction to Web Scraping in Python](https://realpython.com/python-web-scraping-practical-introduction/)
2. [Web Scraping using Python](https://www.datacamp.com/community/tutorials/web-scraping-using-python)

In [39]:
# $ python3 -m venv venv
# $ . ./venv/bin/activate

In [42]:
#Better
!pip install requests BeautifulSoup4 fire



In [45]:
from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
import pandas as pd
import os, sys

import fire

In [48]:
#%%writefile ../pyscrap_url.py

def simple_get(url):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of response is some kind of HTML/XML, return the
    text content, otherwise return None.
    """
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content  #.encode(BeautifulSoup.original_encoding)
            else:
                return None

    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.
    """
    content_type = resp.headers['Content-Type'].lower()
    return (resp.status_code == 200 
            and content_type is not None 
            and content_type.find('html') > -1)


def log_error(e):
    """
    It is always a good idea to log errors. 
    This function just prints them, but you can
    make it do anything.
    """
    print(e)
    
def get_elements(url, tag='',search={}, fname=None):
    """
    Downloads a page specified by the url parameter
    and returns a list of strings, one per tag element
    """
    
    if isinstance(url,str):
        response = simple_get(url)
    else:
        #if already it is a loaded html page
        response = url

    if response is not None:
        html = BeautifulSoup(response, 'html.parser')
        
        res = []
        if tag:    
            for li in html.select(tag):
                for name in li.text.split('\n'):
                    if len(name) > 0:
                        res.append(name.strip())
                       
                
        if search:
            soup = html            
            
            
            r = ''
            if 'find' in search.keys():
                print('findaing',search['find'])
                soup = soup.find(**search['find'])
                r = soup

                
            if 'find_all' in search.keys():
                print('findaing all of',search['find_all'])
                r = soup.find_all(**search['find_all'])
   
            if r:
                for x in list(r):
                    if len(x) > 0:
                        res.extend(x)
            
        return res

    # Raise an exception if we failed to get any data from the url
    raise Exception('Error retrieving contents at {}'.format(url))    
    
def get_element(url, tag='',search={}, fname=None):
    """
    Downloads a page specified by the url parameter
    and returns a list of strings, one per tag element
    """
    if isinstance(url,str):
        response = simple_get(url)
    else:
        #if already it is a loaded html page
        response = url
    if response is not None:
        html = BeautifulSoup(response, 'html.parser')
        res = []
        if tag:    
            for li in html.select(tag):
                for name in li.text.split('\n'):
                    if len(name) > 0:
                        res.append(name.strip())
        if search:
            soup = html            
            r = ''
            if 'find' in search.keys():
                print('findaing',search['find'])
                soup = soup.find(**search['find'])
                r = soup
            if 'find_all' in search.keys():
                print('findaing all of',search['find_all'])
                r = soup.find_all(**search['find_all'])
            if r:
                for x in list(r):
                    if len(x) > 0:
                        res.extend(x)
        return res
    # Raise an exception if we failed to get any data from the url
    raise Exception('Error retrieving contents at {}'.format(url))    
    
    
if get_ipython().__class__.__name__ == '__main__':
    fire(get_tag_elements)

### Web scrape 100 most influential Twitter users in Africa using Python or Bash to obtain the 100 African twitter influencers. 

In [51]:
res = get_elements('https://africafreak.com/100-most-influential-twitter-users-in-africa',tag='h2')
res

['100. Jeffrey Gettleman (@gettleman)',
 '99. Africa24 Media (@a24media)',
 '98. Scapegoat (@andiMakinana)',
 '97. Africa Check (@AfricaCheck)',
 '96. James Copnall (@JamesCopnall)',
 '95. Online Africa (@oafrica)',
 '94. Patrick Ngowi (@PatrickNgowi)',
 '93. DOS African Affairs (@StateAfrica)',
 '92. MoadowAJE (@Moadow)',
 '91. Brendan Boyle (@BrendanSAfrica)',
 '90. City of Tshwane (@CityTshwane)',
 '89. VISI Magazine (@VISI_Mag)',
 '88. andBeyond (@andBeyondSafari)',
 '87. This Is Africa (@ThisIsAfricaTIA)',
 '86. Sarah Carter (@sarzss)',
 '85. The EIU Africa team (@TheEIU_Africa)',
 '84. Investing In Africa (@InvestInAfrica)',
 '83. Barry Malone (@malonebarry)',
 '82. ARTsouthAFRICA (@artsouthafrica)',
 '81. Kahn Morbee (@KahnMorbee)',
 '80. Jamal Osman (@JamalMOsman)',
 '79. iamsuede™ (@iamsuede)',
 '78. Mike Stopforth (@mikestopforth)',
 '77. Equal Education (@equal_education)',
 '76. Tristan McConnell (@t_mcconnell)',
 '75. Kate Forbes (@forbeesta)',
 '74. Vanessa Raphaely (@hur

###  List the Twitter handles of the 10 most influential Twitter users in Africa in order of their popularity (most influential to least influential)

In [54]:
list_str = str(res)
list_str_split = list_str.split("11")
new_list = list_str_split[1]
new_list
new_list2 = new_list.split(",")
new_list2
Influentials = []
for x in range(1, 11):
    Influentials.append(new_list2[x])
Influentials

[" '10. Computicket (@Computicket)'",
 " '9. loyiso gola (@loyisogola)'",
 " '8. 5FM (@5FM)'",
 " '7. mailandguardian (@mailandguardian)'",
 " '6. Helen Zille (@helenzille)'",
 " '5. Julius Sello Malema (@Julius_S_Malema)'",
 " '4. News24 (@News24)'",
 " '3. Jacob G. Zuma (@SAPresident)'",
 " '2. Gareth Cliff (@GarethCliff)'",
 " '1. Trevor Noah (@Trevornoah)'"]

In [57]:
Influentials.reverse()
Influentials

[" '10. Computicket (@Computicket)'",
 " '9. loyiso gola (@loyisogola)'",
 " '8. 5FM (@5FM)'",
 " '7. mailandguardian (@mailandguardian)'",
 " '6. Helen Zille (@helenzille)'",
 " '5. Julius Sello Malema (@Julius_S_Malema)'",
 " '4. News24 (@News24)'",
 " '3. Jacob G. Zuma (@SAPresident)'",
 " '2. Gareth Cliff (@GarethCliff)'",
 " '1. Trevor Noah (@Trevornoah)'"]

In [60]:
Influentials[:5]
Influentials = pd.Series(Influentials)
user_twitter_handle = [i.split('(')[-1].strip(")") for i in Influentials]
user_twitter_handle

["@Computicket)'",
 "@loyisogola)'",
 "@5FM)'",
 "@mailandguardian)'",
 "@helenzille)'",
 "@Julius_S_Malema)'",
 "@News24)'",
 "@SAPresident)'",
 "@GarethCliff)'",
 "@Trevornoah)'"]

In [63]:
import csv
from pandas import DataFrame
df = DataFrame (user_twitter_handle,columns=['Top 10 Influentials'])
print(df)

  Top 10 Influentials
0      @Computicket)'
1       @loyisogola)'
2              @5FM)'
3  @mailandguardian)'
4       @helenzille)'
5  @Julius_S_Malema)'
6           @News24)'
7      @SAPresident)'
8      @GarethCliff)'
9       @Trevornoah)'


In [None]:
df.to_csv(r'tijesunimiolashore@gmail.com_influentials.csv')

### List the Twitter handles of the 10 most influential government officials in Africa in order of their influence (most influential to least influential)

In [None]:
url= 'https://www.atlanticcouncil.org/blogs/africasource/african-leaders-respond-to-coronavirus-on-twitter/#east-africa'
response = simple_get(url)

res = get_elements(response, search={'find_all':{'class_':'wp-block-embed__wrapper'}})
res_gov = get_element(response, search={'find_all':{'class_':'wp-block-embed__wrapper'}}) 
res_gov

### To get the list of the 36 government officials

In [71]:
url= 'https://www.atlanticcouncil.org/blogs/africasource/african-leaders-respond-to-coronavirus-on-twitter/#east-africa'
response = get(url).content
res_gov = get_elements(response, tag='blockquote')
for r in res_gov:
    split_data = r.split('— ',maxsplit=1)[1].rsplit('(',maxsplit=1)
    name = split_data[0].split(',')[0].strip()
    handle =  split_data[1].rsplit(')',maxsplit=1)[0]
    print(f'{name} : {handle}')

Eswatini Government : @EswatiniGovern1
Malawi Government : @MalawiGovt
Hage G. Geingob : @hagegeingob
Seychelles Ministry of Finance : @FinanceSC
PresidencyZA : @PresidencyZA
Ministry of Health Zambia : @mohzambia
President of Zimbabwe : @edmnangagwa
MinSantédj : @MinSantedj
Yemane G. Meskel : @hawelti
State House Kenya : @StateHouseKenya
Paul Kagame : @PaulKagame
Mohamed Farmaajo : @M_Farmaajo
H.E Hussein Abdelbagi Akol : @SouthSudanGov
Abdalla Hamdok : @SudanPMHamdok
TanzaniaSpokesperson : @TZSpokesperson
Yoweri K Museveni : @KagutaMuseveni
MOFA/MRE -(Angola) : @angola_Mirex
Amb. Willy Nyamitwe : @willynyamitwe
Chérif Mahamat Zene : @Cherif_MZ
Présidence RDC 🇨🇩 : @Presidence_RDC
Ali Bongo Ondimba : @PresidentABO
Présidence du Bénin : @PresidenceBenin
Roch KABORE : @rochkaborepf
Presidente Cabo Verde : @PresidenciaCV
Alassane Ouattara : @AOuattara_PRCI
State House of The Gambia : @Presidency_GMB
Nana Akufo-Addo : @NAkufoAddo
Pr. Alpha CONDÉ : @President_GN
Umaro Sissoco Embalo : @USEm

### To get the twitter handles only of the first 10 government officials, we need to get their followers count and deduce the first 10 most influential government officials in Africa from there

In [80]:
url= 'https://www.atlanticcouncil.org/blogs/africasource/african-leaders-respond-to-coronavirus-on-twitter/#east-africa'
response = get(url).content
res_gov = get_elements(response, tag='blockquote')
Gov_official_handles = []
for r in res_gov:
    split_data = r.split('— ',maxsplit=1)[1].rsplit('(',maxsplit=1)
    name = split_data[0].split(',')[0].strip()
    handle =  split_data[1].rsplit(')',maxsplit=1)[0]
    Gov_official_handles.append(f'{handle}')

    MyList = []
for x in range(0, 10):
    name = Gov_official_handles[x]
    MyList.append(Gov_official_handles[x])
MyList

import csv
from pandas import DataFrame
df = DataFrame (MyList,columns=['Top 10 Government Influentials'])
df.to_csv(r'tijesunimiolashore@gmail.com_govinfluentials.csv')

In [None]:
for r in res_gov:
    split_data = r.split('— ',maxsplit=1)[1].rsplit('(',maxsplit=1)
    name = split_data[0].split(',')[0].strip()
    handle =  split_data[1].rsplit(')',maxsplit=1)[0]
    print(f'{name} : {handle}')
    
    import tweepy
    consumer_key = "cd6xXCBVFRKohny14KUvgBcyf"
    consumer_secret = "FvHm7Cd2g73FdZ269VjbZSE3ymngOse3p8RXfpavwbmovupt8d"
    access_token = "1246485711282462722-8JMUgu4vunrqXs04hfASXgZ1rhP1f3"
    access_token_secret = "aLi3JSDTZ79XDutDFXiSVU9kqrm88TKNUZF9bG7pIskoB"

    # Creating the authentication object
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    # Setting your access token and secret
    auth.set_access_token(access_token, access_token_secret)
    # Creating the API object while passing in auth information
    api = tweepy.API(auth) 

    for x in range(0, 36):

        # The Twitter user who we want to get tweets from
        name = Gov_official_handles[x]
        # Number of tweets to pull
        tweetCount = 10

        # Calling the user_timeline function with our parameters
        results = api.user_timeline(id=name, count=tweetCount)

        # foreach through all tweets pulled
        for tweet in results:
           # printing the text stored inside the tweet object
           print(tweet.text)

### Provide the top 5 unique hashtags these that the top influencer used in their top 10 retweets.

In [None]:
import tweepy
consumer_key = "cd6xXCBVFRKohny14KUvgBcyf"
consumer_secret = "FvHm7Cd2g73FdZ269VjbZSE3ymngOse3p8RXfpavwbmovupt8d"
access_token = "1246485711282462722-8JMUgu4vunrqXs04hfASXgZ1rhP1f3"
access_token_secret = "aLi3JSDTZ79XDutDFXiSVU9kqrm88TKNUZF9bG7pIskoB"

# Creating the authentication object
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
# Setting your access token and secret
auth.set_access_token(access_token, access_token_secret)
# Creating the API object while passing in auth information
api = tweepy.API(auth) 

for x in range(0, 10):

    # The Twitter user who we want to get tweets from
    name = "@Trevornoah"
    # Number of tweets to pull
    tweetCount = 10

    # Calling the user_timeline function with our parameters
    results = api.user_timeline(id=name, count=tweetCount)

    # foreach through all tweets pulled
    for tweet in results:
       # printing the text stored inside the tweet object
       print(tweet.text)

### Provide the top 5 unique hashtags these that the top government official used in their top 10 retweets.

In [None]:
# The Twitter user who we want to get tweets from
name = "gettleman"
# Number of tweets to pull
tweetCount = 20

# Calling the user_timeline function with our parameters
results = api.user_timeline(id=name, count=tweetCount)

# foreach through all tweets pulled
for tweet in results:
   # printing the text stored inside the tweet object
   print(tweet.text)

In [None]:

consumer_key = "cd6xXCBVFRKohny14KUvgBcyf"
consumer_secret = "FvHm7Cd2g73FdZ269VjbZSE3ymngOse3p8RXfpavwbmovupt8d"
access_token = "1246485711282462722-8JMUgu4vunrqXs04hfASXgZ1rhP1f3"
access_token_secret = "aLi3JSDTZ79XDutDFXiSVU9kqrm88TKNUZF9bG7pIskoB"

## Web scrapping using bash script
If the web site has a quite simple HTML, you can easily use curl to perform the request and then extract the needed values using bash commands grep, cut , sed, ..

This tutorial is adapted from [this](https://medium.com/@LiliSousa/web-scraping-with-bash-690e4ee7f98d) medium article

In [None]:
%%bash 

# curl the page and save content to tmp_file
#url = "https://www.atlanticcouncil.org/blogs/africasource/african-leaders-respond-to-coronavirus-on-twitter/#east-africa"
#curl -X GET $url -o tmp_file


#!/bin/bash

# write headers to CSV file
echo "Name, twitter_id" >> extractData.csv
n="1"
while [ $n -lt 2 ]
do
  
  #get title
  title=$(cat tmp_file | grep "class=\"twitter-tweet\"" | cut -d ';' -f1 )
  echo $title
  #get author
  #twitter_id=$(cat tmp_file |grep -A1 "class=\"css-901oao css-16my406 r-1qd0xha r-ad9z0x r-bcqeeo r-qvutc0\"" | tail -1)

  #echo "$title, $twitter_id" >> extractData.csv
  #echo "$title, $twitter_id"
    
  n=$[$n+1]

done