# Web Scraping project 

## 1. Import libraries and set options for visualization:

In [1]:
import requests 
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import time
import os

# Configure how pandas renders DataFrames in the cell outputs:
pd.set_option("display.max_columns", 25)
pd.set_option("display.width", 1000)

## 2. Creating final dataframe and defining a random sleep interval function and a random user agent function:

Setting path for export:

In [2]:
# Current working directory; used at the end of the notebook as the base path of the CSV export.
PATH = os.getcwd()

Individual dataframes for each year will be appended here:

In [3]:
# Starts out empty; it is filled with the scraped rows by the scraping loop further down.
hits_spain = pd.DataFrame()

A random sleep interval generator is defined:

In [4]:
def delay():
    """Return a randomly chosen integer from 0 to 12 (inclusive), meant as a sleep interval."""
    candidates = list(range(13))
    return np.random.choice(candidates)

A random user agent generator is defined:

In [5]:
def get_random_ua():
    """
    Return one randomly chosen user-agent line from 'ua_file.txt'.

    The file is read from the current working directory and is expected to
    hold one user agent per line; blank lines are ignored. The chosen line is
    returned exactly as stored in the file (including its trailing newline,
    if any), so callers should strip it.

    This is a best-effort helper: if the file is missing, unreadable or has
    no usable line, the problem is printed and an empty string is returned.
    """
    random_ua = ''
    ua_file = 'ua_file.txt'
    try:
        with open(ua_file) as f:
            lines = [line for line in f if line.strip()]
        if lines:
            prng = np.random.RandomState()
            # randint's upper bound is exclusive, so every line can be picked,
            # including the last one and the only line of a one-line file.
            random_ua = lines[int(prng.randint(len(lines)))]
    except Exception as ex:
        print('Exception in random_ua')
        print(str(ex))
    # Returning here (not from a `finally` clause) lets KeyboardInterrupt and
    # other non-Exception errors propagate instead of being swallowed.
    return random_ua

## 3. Loop to build the urls for the years 2011 to 2019:

A dictionary is created containing the year as a key and the url as a value.

In [6]:
# Map every year from 2011 to 2019 to its Wikipedia page.
# The page name changes in 2016 ('Los_40_Principales' becomes 'LOS40'),
# so the matching name is picked for each year:
url_dict = {}
for i in range(2011, 2020):
    chart_name = 'Los_40_Principales' if i < 2016 else 'LOS40'
    url_dict[i] = ('https://es.wikipedia.org/wiki/Anexo:Los_n%C3%BAmeros_uno_de_'
                   + chart_name + '_(Espa%C3%B1a)_' + str(i))

In [7]:
# Display the year -> url mapping to check the generated addresses.
url_dict

{2011: 'https://es.wikipedia.org/wiki/Anexo:Los_n%C3%BAmeros_uno_de_Los_40_Principales_(Espa%C3%B1a)_2011',
 2012: 'https://es.wikipedia.org/wiki/Anexo:Los_n%C3%BAmeros_uno_de_Los_40_Principales_(Espa%C3%B1a)_2012',
 2013: 'https://es.wikipedia.org/wiki/Anexo:Los_n%C3%BAmeros_uno_de_Los_40_Principales_(Espa%C3%B1a)_2013',
 2014: 'https://es.wikipedia.org/wiki/Anexo:Los_n%C3%BAmeros_uno_de_Los_40_Principales_(Espa%C3%B1a)_2014',
 2015: 'https://es.wikipedia.org/wiki/Anexo:Los_n%C3%BAmeros_uno_de_Los_40_Principales_(Espa%C3%B1a)_2015',
 2016: 'https://es.wikipedia.org/wiki/Anexo:Los_n%C3%BAmeros_uno_de_LOS40_(Espa%C3%B1a)_2016',
 2017: 'https://es.wikipedia.org/wiki/Anexo:Los_n%C3%BAmeros_uno_de_LOS40_(Espa%C3%B1a)_2017',
 2018: 'https://es.wikipedia.org/wiki/Anexo:Los_n%C3%BAmeros_uno_de_LOS40_(Espa%C3%B1a)_2018',
 2019: 'https://es.wikipedia.org/wiki/Anexo:Los_n%C3%BAmeros_uno_de_LOS40_(Espa%C3%B1a)_2019'}

## 4. Defining functions to clean the information that is going to be retrieved:

In [8]:
def list_cleaner(input_list):
    '''
    Split every artist string of input_list into a list of individual names.

    The connectors ' y ' and ' con ' are first turned into ', ' and each
    string is then split on ', '. One list of names is returned per input
    string, in the same order.
    '''
    cleaned = []
    for entry in input_list:
        normalised = entry.replace(' y ', ', ').replace(' con ', ', ')
        cleaned.append(normalised.split(', '))
    return cleaned

def info_finder(items, info):
    '''
    Extract the title or the artists text for every data row of a table.

    Parameters
    ----------
    items : list
        One list of cells per table row. Every cell must provide a
        get_text() method. items[0] is skipped (it is an empty list).
    info : str
        'titles' to read cell 1 of each row, 'artists' to read cell 2.

    Returns
    -------
    list of str
        One entry per row of items[1:], with line breaks removed.
        A row with 4 cells provides its own value. A row with any other
        number of cells is treated as a further week of the same number 1,
        so it repeats the value of the closest previous 4-cell row, however
        many weeks the run lasts. If no 4-cell row has been seen yet, an
        empty string is used so the result stays aligned with the rows.
    str
        An error message when info is neither 'titles' nor 'artists'.
    '''
    if info == 'titles':
        subindex = 1
    elif info == 'artists':
        subindex = 2
    else:
        return 'Error: for now you can only extract info for "titles" or "artists"'

    info_list = []
    last_full_row = None
    for row in items[1:]:  # items[0] is an empty list.
        if len(row) == 4:
            last_full_row = row
        if last_full_row is None:
            # Continuation row with nothing before it to copy from.
            info_list.append('')
        else:
            info_list.append(last_full_row[subindex].get_text().replace('\n', ''))
    return info_list

## 4. Loop to parse every url in url_dict and extract the information:

This loop parses each url, looks for tables (class: 'wikitable'), extracts the information, inserts it in a dataframe and appends it to the hits_spain dataframe created initially:

In [9]:
# One DataFrame per year is collected here and concatenated once at the end,
# so re-running this cell rebuilds hits_spain instead of duplicating its rows.
yearly_frames = []

for key, value in url_dict.items():
    # Requesting information from url and parsing it.
    # 'User-Agent' is the actual HTTP header name; when no user agent could be
    # loaded the header is left out so requests falls back to its default.
    user_agent = get_random_ua().replace('\n', '').strip()
    headers = {'User-Agent': user_agent} if user_agent else {}
    response = requests.get(value, headers=headers, timeout=30)
    # Stop on HTTP errors instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Retrieving all tables:
    tables = soup.find_all('table', {'class': 'wikitable'})

    # The first table ([0]) is the one that contains the needed info:
    rows = tables[0].find_all('tr')

    # Retrieving content of the table (one list of <td> cells per row):
    items = [row.find_all('td') for row in rows]

    # Retrieving list of dates:
    # cell [0] of every row is the date ({0: date, 1: title, 2: artist}),
    # .replace deletes the linebreak,
    # starts in 1 because first element of items list is an empty list.
    dates = [items[i][0].get_text().replace('\n', '') for i in range(1, len(items))]

    # Retrieving list of titles and artists using info_finder function:
    titles = info_finder(items, 'titles')
    artists = info_finder(items, 'artists')

    # Cleaning artist list:
    artists = list_cleaner(artists)

    # df is created and info is included:
    info = {'year': key, 'week': range(1, len(dates) + 1), 'date': dates,
            'track': titles, 'artists': artists, 'url': value}
    yearly_frames.append(pd.DataFrame(info))

    # Actually wait for the random interval before the next request:
    time.sleep(float(delay()))

# Building the main dataframe from the yearly dataframes:
hits_spain = pd.concat(yearly_frames)

## 5. Explore and export final dataframe

In [10]:
# Preview the first rows of the combined dataframe.
hits_spain.head()

Unnamed: 0,year,week,date,track,artists,url
0,2011,1,1 de enero,Loca,"[Shakira, El Cata]",https://es.wikipedia.org/wiki/Anexo:Los_n%C3%B...
1,2011,2,8 de enero,Barbie de extrarradio,[Melendi],https://es.wikipedia.org/wiki/Anexo:Los_n%C3%B...
2,2011,3,15 de enero,Love the Way You Lie,"[Eminem, Rihanna]",https://es.wikipedia.org/wiki/Anexo:Los_n%C3%B...
3,2011,4,22 de enero,The Time (Dirty Bit),[The Black Eyed Peas],https://es.wikipedia.org/wiki/Anexo:Los_n%C3%B...
4,2011,5,29 de enero,The Time (Dirty Bit),[The Black Eyed Peas],https://es.wikipedia.org/wiki/Anexo:Los_n%C3%B...


Resetting df index:

In [11]:
# Replace the per-year row labels with a single running index:
hits_spain = hits_spain.reset_index(drop=True)

In [12]:
# Summary of the index, the column dtypes and the non-null counts.
hits_spain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 447 entries, 0 to 446
Data columns (total 6 columns):
year       447 non-null int64
week       447 non-null int64
date       447 non-null object
track      447 non-null object
artists    447 non-null object
url        447 non-null object
dtypes: int64(2), object(4)
memory usage: 21.0+ KB


Cleaning the quotation marks («») from the song titles in the 2012 list:

In [13]:
# Shape before cleaning the titles (to compare with the shape shown after cleaning).
hits_spain.shape

(447, 6)

In [14]:
# Demonstrating the problem: the 2012 track titles are wrapped in «» quotation marks.
hits_spain.loc[hits_spain.year == 2012].head()

Unnamed: 0,year,week,date,track,artists,url
53,2012,1,7 de enero,«Someone like You»,[Adele],https://es.wikipedia.org/wiki/Anexo:Los_n%C3%B...
54,2012,2,14 de enero,«Someone like You»,[Adele],https://es.wikipedia.org/wiki/Anexo:Los_n%C3%B...
55,2012,3,21 de enero,«Ai se eu te pego!»,[Michel Teló],https://es.wikipedia.org/wiki/Anexo:Los_n%C3%B...
56,2012,4,28 de enero,«Someone like You»,[Adele],https://es.wikipedia.org/wiki/Anexo:Los_n%C3%B...
57,2012,5,4 de febrero,«Got 2 Luv U»,"[Sean Paul, Alexis Jordan]",https://es.wikipedia.org/wiki/Anexo:Los_n%C3%B...


In [15]:
# Helper that removes the «» quotation marks wrapping a track title
def string_cleaner(i):
    """Return i without any trailing '»' and without any leading '«' characters."""
    return i.rstrip('»').lstrip('«')

# Run the cleaner over every track title:
hits_spain['track'] = hits_spain['track'].map(string_cleaner)

In [16]:
# Same 2012 rows again, to check that the «» marks are gone from the titles.
hits_spain.loc[hits_spain.year == 2012].head()

Unnamed: 0,year,week,date,track,artists,url
53,2012,1,7 de enero,Someone like You,[Adele],https://es.wikipedia.org/wiki/Anexo:Los_n%C3%B...
54,2012,2,14 de enero,Someone like You,[Adele],https://es.wikipedia.org/wiki/Anexo:Los_n%C3%B...
55,2012,3,21 de enero,Ai se eu te pego!,[Michel Teló],https://es.wikipedia.org/wiki/Anexo:Los_n%C3%B...
56,2012,4,28 de enero,Someone like You,[Adele],https://es.wikipedia.org/wiki/Anexo:Los_n%C3%B...
57,2012,5,4 de febrero,Got 2 Luv U,"[Sean Paul, Alexis Jordan]",https://es.wikipedia.org/wiki/Anexo:Los_n%C3%B...


In [17]:
# Shape after cleaning, to check that no rows or columns were lost.
hits_spain.shape

(447, 6)

Export dataframe

In [19]:
# to_csv does not create missing folders, so make sure the output folder exists first.
output_dir = os.path.join(PATH, 'output')
os.makedirs(output_dir, exist_ok=True)
# index=False keeps the row labels out of the file.
# When a path is given, to_csv writes the file and returns None.
hits_spain_export = hits_spain.to_csv(os.path.join(output_dir, 'wiki_hits_spain.csv'), index=False)