# Web Scraping from Wikipédia

The goal is to collect information on 6 companies by scraping the wikipedia.org website. The companies are:
- Microsoft
- Salesforce
- BNP Paribas
- HSBC
- Dataiku
- Bouygues Construction

The information we are interested in is:
- Returned
- Number of employees
- The headquarters
- Social media links
- Website links

To solve the problem we will use the Selenium library. This involves creating a Dashboard in the form of a web application where all the information collected will be represented. The application must contain a map to locate the head offices

In [1]:
from selenium import webdriver
import pandas as pd
import numpy as np
import re

## Web Scraping

In [2]:
#store website in a variable
url = 'https://en.wikipedia.org/wiki/'
#chromedriver
driver = webdriver.Chrome('chromedriver.exe')

  driver = webdriver.Chrome('chromedriver.exe')


In [3]:
companies = ['Microsoft','Salesforce','BNP Paribas','HSBC','Dataiku','Bouygues Construction']
effectif = []
site_web = []
siege_social = []
revenue = []

for company in companies :
    
    #wikipedia extraction
    #go to the wikipage
    driver.get(url + company.replace(" ", "_"))
    
    #collect raw information
    wiki_data = driver.find_elements_by_xpath('//table[@class="infobox vcard"]/tbody')
    
    #collect information we need using RegEx
    effectif.append(re.findall(r"Number of employees(.*?)\n",wiki_data[0].text)[0])
    site_web.append(re.findall(r"Website\s\w+\.\w+\.\w+|Website\s\w+\.\w+|Website\s\.*",wiki_data[0].text)[0])
    siege_social.append(re.findall(r"Headquarters(.*?)\n(.*?)\n",wiki_data[0].text)[0])
    revenue.append(re.findall(r"Revenue(.*?)\n",wiki_data[0].text)[0])
       

  wiki_data = driver.find_elements_by_xpath('//table[@class="infobox vcard"]/tbody')


## From raw data to preprocessed CSV file

In [4]:
#create dataframe
dataframe = pd.DataFrame(data=np.array([companies,effectif,site_web,revenue,siege_social]).T, columns=["Name", "Effectif", "Site Web", "Revenue", "Adresse"])

#preprocessing 
dataframe["Site Web"] = dataframe["Site Web"].apply(lambda x : "https://www." + x.replace("Website","").replace(" ",""))
dataframe["Effectif"] = dataframe["Effectif"].apply(lambda x : int(x.split("(")[0].replace(',','').replace('+','')))
dataframe['Revenue'] = dataframe['Revenue'].apply(lambda x : x.split("(")[0])
dataframe["Adresse"] = dataframe["Adresse"].apply(lambda x : " ".join(map(str, x)).split("Key")[0].split("Area")[0])
dataframe["Revenue"] = dataframe["Revenue"].apply(lambda x : x.replace(".","").replace(" million","").replace(" billion", "000"))
dataframe["Revenue"] = dataframe["Revenue"].apply(lambda x : int(x.replace("US$", "") if "US$" in x else 1.04*int(x.replace("€", ""))))
dataframe["LinkedIn"] = dataframe["Name"].apply(lambda x : "https://www.linkedin.com/company/" + x.lower().replace(" ","-") +"/")

  dataframe = pd.DataFrame(data=np.array([companies,effectif,site_web,revenue,siege_social]).T, columns=["Name", "Effectif", "Site Web", "Revenue", "Adresse"])


For the map we need to find the geographical coordinates (lontitude, latitude) from the scraped addresses.

In [5]:
from geopy import Nominatim
from geopy.extra.rate_limiter import RateLimiter


locator = Nominatim(user_agent="myGeocoder")
location = locator.geocode("Washington, U.S.")

# 1 - conveneint function to delay between geocoding calls
geocode = RateLimiter(locator.geocode, min_delay_seconds=2)

def get_geocode(address):
    """
    Get longitude and latitude. If the 
    starting address does not allow it.
    Words are deleted until the address
    is understood by the 
    geopy.Nominatim.geocode function.
    
    Parameters :
    ------------
    
    address : str
    
    Return 
    ------
    
    longitude, latitude : float
    
    """
    
    while geocode(address) == None :
        address = " ".join(address.split()[1:])


        if address == "":
            return None, None
        
    return geocode(address).longitude, geocode(address).latitude

In [None]:
longitude = []
latitude = []

for address in dataframe["Adresse"] :
    
    long, lat = get_geocode(address)
    
    longitude.append(long)
    latitude.append(lat)

RateLimiter caught an error, retrying (0/2 tries). Called with (*('Tower San Francisco, California, U.S.',), **{}).
Traceback (most recent call last):
  File "C:\Anaconda\lib\site-packages\urllib3\connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "C:\Anaconda\lib\site-packages\urllib3\connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "C:\Anaconda\lib\http\client.py", line 1347, in getresponse
    response.begin()
  File "C:\Anaconda\lib\http\client.py", line 307, in begin
    version, status, reason = self._read_status()
  File "C:\Anaconda\lib\http\client.py", line 268, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\Anaconda\lib\socket.py", line 669, in readinto
    return self._sock.recv_into(b)
  File "C:\Anaconda\lib\ssl.py", line 1241, in recv_into
    return self.read(nbytes, buffer)
  File "C:\Anaconda\lib\ssl.py", line 1

RateLimiter swallowed an error after 2 retries. Called with (*('Tower San Francisco, California, U.S.',), **{}).
Traceback (most recent call last):
  File "C:\Anaconda\lib\site-packages\urllib3\connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "C:\Anaconda\lib\site-packages\urllib3\connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "C:\Anaconda\lib\http\client.py", line 1347, in getresponse
    response.begin()
  File "C:\Anaconda\lib\http\client.py", line 307, in begin
    version, status, reason = self._read_status()
  File "C:\Anaconda\lib\http\client.py", line 268, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\Anaconda\lib\socket.py", line 669, in readinto
    return self._sock.recv_into(b)
  File "C:\Anaconda\lib\ssl.py", line 1241, in recv_into
    return self.read(nbytes, buffer)
  File "C:\Anaconda\lib\ssl.py", line 1099

RateLimiter caught an error, retrying (1/2 tries). Called with (*('San Francisco, California, U.S.',), **{}).
Traceback (most recent call last):
  File "C:\Anaconda\lib\site-packages\urllib3\connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "C:\Anaconda\lib\site-packages\urllib3\connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "C:\Anaconda\lib\http\client.py", line 1347, in getresponse
    response.begin()
  File "C:\Anaconda\lib\http\client.py", line 307, in begin
    version, status, reason = self._read_status()
  File "C:\Anaconda\lib\http\client.py", line 268, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\Anaconda\lib\socket.py", line 669, in readinto
    return self._sock.recv_into(b)
  File "C:\Anaconda\lib\ssl.py", line 1241, in recv_into
    return self.read(nbytes, buffer)
  File "C:\Anaconda\lib\ssl.py", line 1099, i

RateLimiter caught an error, retrying (0/2 tries). Called with (*('Francisco, California, U.S.',), **{}).
Traceback (most recent call last):
  File "C:\Anaconda\lib\site-packages\urllib3\connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "C:\Anaconda\lib\site-packages\urllib3\connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "C:\Anaconda\lib\http\client.py", line 1347, in getresponse
    response.begin()
  File "C:\Anaconda\lib\http\client.py", line 307, in begin
    version, status, reason = self._read_status()
  File "C:\Anaconda\lib\http\client.py", line 268, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\Anaconda\lib\socket.py", line 669, in readinto
    return self._sock.recv_into(b)
  File "C:\Anaconda\lib\ssl.py", line 1241, in recv_into
    return self.read(nbytes, buffer)
  File "C:\Anaconda\lib\ssl.py", line 1099, in re

RateLimiter swallowed an error after 2 retries. Called with (*('Francisco, California, U.S.',), **{}).
Traceback (most recent call last):
  File "C:\Anaconda\lib\site-packages\urllib3\connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "C:\Anaconda\lib\site-packages\urllib3\connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "C:\Anaconda\lib\http\client.py", line 1347, in getresponse
    response.begin()
  File "C:\Anaconda\lib\http\client.py", line 307, in begin
    version, status, reason = self._read_status()
  File "C:\Anaconda\lib\http\client.py", line 268, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\Anaconda\lib\socket.py", line 669, in readinto
    return self._sock.recv_into(b)
  File "C:\Anaconda\lib\ssl.py", line 1241, in recv_into
    return self.read(nbytes, buffer)
  File "C:\Anaconda\lib\ssl.py", line 1099, in read


RateLimiter caught an error, retrying (1/2 tries). Called with (*('California, U.S.',), **{}).
Traceback (most recent call last):
  File "C:\Anaconda\lib\site-packages\urllib3\connectionpool.py", line 445, in _make_request
    six.raise_from(e, None)
  File "<string>", line 3, in raise_from
  File "C:\Anaconda\lib\site-packages\urllib3\connectionpool.py", line 440, in _make_request
    httplib_response = conn.getresponse()
  File "C:\Anaconda\lib\http\client.py", line 1347, in getresponse
    response.begin()
  File "C:\Anaconda\lib\http\client.py", line 307, in begin
    version, status, reason = self._read_status()
  File "C:\Anaconda\lib\http\client.py", line 268, in _read_status
    line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
  File "C:\Anaconda\lib\socket.py", line 669, in readinto
    return self._sock.recv_into(b)
  File "C:\Anaconda\lib\ssl.py", line 1241, in recv_into
    return self.read(nbytes, buffer)
  File "C:\Anaconda\lib\ssl.py", line 1099, in read
    retu

In [None]:
dataframe["Longitude"] = longitude
dataframe["Latitude"] = latitude

In [None]:
dataframe

In [118]:
dataframe.to_csv("C:\Users\Yacine Mam'd\Desktop\2IA\delpha\collected_data\delpha_data.txt")

SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape (<ipython-input-118-06676fb32b11>, line 1)