# Get data from lightning talks

The following code scrapes the HTML of the lightning talks sites and writes the data (title, speaker, description, votes) to one CSV file per year. Note that the older lightning talks (2010-2014) come from a different website that is now defunct; their HTML files were retrieved from the Internet Archive's Wayback Machine. Links to the sites are in the code comments.  

In [1]:
import csv
from bs4 import BeautifulSoup

In [2]:
# http://lightningtalks.ire.org/2015.html - 2015
# http://lightningtalks.ire.org/2016.html - 2016
# https://github.com/ireapps/lightning-talks/blob/master/static/2017.html - 2017
# http://lightningtalks.ire.org/ - 2018

def get_talks_data(year):
    """Extract talk data from ``<year>.html`` and write it to ``<year>.csv``.

    Each ``div.content-box`` element yields one CSV row. The vote count is
    taken from the ``div.num`` element at the same position in the document,
    so both lists are expected to be in the same order.

    Args:
        year: Year as a string (e.g. ``'2015'``); used as the base name of
            both the input HTML file and the output CSV file.

    Raises:
        IndexError: If there are fewer ``div.num`` elements than
            ``div.content-box`` elements.
        AttributeError: If a content box has no ``h3`` or no ``p`` element.
    """
    # Binary mode lets BeautifulSoup detect the document's encoding itself
    # instead of depending on the platform's default text encoding.
    with open(year + '.html', 'rb') as html_file:
        soup = BeautifulSoup(html_file, 'html.parser')

    content = soup.find_all('div', class_='content-box')
    votes_content = soup.find_all('div', class_='num')

    # newline='' is what the csv module requires to avoid blank lines between
    # rows on Windows; the context manager guarantees the rows are flushed.
    with open(year + '.csv', 'w', encoding='utf-8', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['title', 'speaker', 'description', 'votes'])

        for i, con in enumerate(content):
            # Write text (not bytes): the csv module would otherwise emit the
            # repr of each bytes object, i.e. "b'...'".
            title = con.find('h3').get_text().strip()
            speaker = con.find('p').get_text()

            # Every <p> whose class is not 'speaker'; the last one is used as
            # the description. Fall back to an empty string so a talk without
            # such a paragraph does not reuse the previous talk's description.
            description_full = con.find_all(
                'p', class_=lambda x: x != 'speaker')
            if description_full:
                description = description_full[-1].get_text()
            else:
                description = ''

            votes = votes_content[i].get_text()
            writer.writerow([title, speaker, description, votes])

In [3]:
# Generate the CSV for each year handled by get_talks_data, in order.
for talk_year in ('2015', '2016', '2017', '2018'):
    get_talks_data(talk_year)

In [4]:
# https://web.archive.org/web/20100428202810/http://ire.aronpilhofer.com/ - 2010
# https://web.archive.org/web/20110403205043/http://ire.aronpilhofer.com/ - 2011
# https://web.archive.org/web/20120310122142/http://ire.aronpilhofer.com:80/proposals/sort/popular - 2012
# https://web.archive.org/web/20130511043212/http://ire.aronpilhofer.com/proposals/sort/popular - 2013
# https://web.archive.org/web/20140323004809/http://ire.aronpilhofer.com/ - 2014

def get_talks_data_old(year):
    """Extract talk data from ``<year>.html`` and write it to ``<year>.csv``.

    Each ``div.proposal_text`` element yields one CSV row. The vote count is
    taken from the ``p#vote_number_number`` element at the same position in
    the document, so both lists are expected to be in the same order.

    Args:
        year: Year as a string (e.g. ``'2010'``); used as the base name of
            both the input HTML file and the output CSV file.

    Raises:
        IndexError: If there are fewer vote elements than proposal elements.
        AttributeError: If a proposal lacks an ``h2``, a ``p.byline`` or a
            ``p.description`` element.
    """
    # Binary mode lets BeautifulSoup detect the document's encoding itself
    # instead of depending on the platform's default text encoding.
    with open(year + '.html', 'rb') as html_file:
        soup = BeautifulSoup(html_file, 'html.parser')

    content = soup.find_all('div', class_='proposal_text')
    votes_content = soup.find_all('p', id='vote_number_number')

    # newline='' is what the csv module requires to avoid blank lines between
    # rows on Windows; the context manager guarantees the rows are flushed.
    with open(year + '.csv', 'w', encoding='utf-8', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(['title', 'speaker', 'description', 'votes'])

        for i, con in enumerate(content):
            title = con.find('h2').get_text()
            speaker = con.find('p', class_='byline').get_text()
            description = con.find('p', class_='description').get_text()
            votes = votes_content[i].get_text()

            writer.writerow([title, speaker, description, votes])

In [5]:
# Generate the CSV for each year handled by get_talks_data_old, in order.
for talk_year in ('2010', '2011', '2012', '2013', '2014'):
    get_talks_data_old(talk_year)