In [10]:
#import beautifulsoup4
from bs4 import BeautifulSoup
import pandas as pd
import requests
from collections import defaultdict
from pathlib import Path
import os

In [2]:
"""
    This function gets the html from the dblp website
"""
def get_html(year):
    url = f'https://dblp.org/db/conf/lats/lats{year}.html'

    # get the html
    response = requests.get(url)
    html = response.content

    # extract the html from https://dblp.org/db/conf/lats/lats2017.html
    soup = BeautifulSoup(html, "html.parser")

    return soup

In [3]:
# Fetch the parsed proceedings page for every year assigned to me
# (2016, 2017 and 2022), one request per year, in chronological order.
soup_2016, soup_2017, soup_2022 = map(get_html, (2016, 2017, 2022))

In [6]:
"""
    This function creates creates adds a row to the dataframe for each paper.
    Each row contains the following columns:
        - Year
        - Session Name
        - Full Paper or WIP?
        - Authors
        - Paper Title
        - First Author Last Name
        - First Author First Name
        - Link
        - DOI
        - File Name
"""

def add_rows(df, soup, year):
    # Find the "ul class='publ-list" immediately after each H2. 
    for ul in soup.find_all("ul", class_="publ-list"):
        
        # Then loop through each "li class='entry inproceedings'" and print out the text
        for li in ul.find_all("li", class_="entry inproceedings"):
            paper_dict = defaultdict(str)
            paper_dict['Year'] = year 
            paper_dict['Session Name'] = li.find_previous('h2').text
            
            # Ignore keynote and demonstrations. You might need to add more, like Tutorials, Posters and Workshops that you'd like to ignore
            if paper_dict['Session Name'] in ['Keynote Address', 'Demonstrations']:
                continue

            if paper_dict['Session Name'] == 'Work in Progress':
                paper_dict['Full Paper or WIP?'] = 'WIP'
            else:
                paper_dict['Full Paper or WIP?'] = 'Full Paper'
            
            for cite in li.find_all("cite"):
                paper_dict['Authors'] = cite.text.split(':')[0]
                paper_dict['Paper Title'] = ':'.join(cite.text.split(':')[1:]).strip().split('.')[0]
                
                first_author = paper_dict['Authors'].split(',')[0]
                
                paper_dict['First Author Last Name'] = first_author.split(' ')[-1]
                
                paper_dict['First Author First Name'] = ' '.join(first_author.split(' ')[:-1])
                
            for nav_publ in li.find_all("nav", class_="publ"):
                for dropdown in nav_publ.find_all("li", class_="drop-down"):
                    for div in dropdown.find_all("div", class_="head"):
                        for a in div.find_all("a"):
                            if 'https://doi.org' in a['href']:

                                paper_dict['Link'] = a['href']
                                paper_dict['DOI']  = a['href'].split('https://doi.org/')[1]
                                paper_dict['Link'] = 'https://dl.acm.org/doi/pdf/' + paper_dict['DOI'] 

                                paper_dict['File Name'] = f'{year}_{paper_dict["Paper Title"]}.pdf'
                        
                # Add row to dataframe
                df = pd.concat([df,pd.DataFrame([paper_dict])], ignore_index=True)
    return df
                

In [7]:
# Start from an empty frame that fixes the column order of the final table,
# then append the papers of each year in chronological order.
columns = [
    'First Author First Name',
    'First Author Last Name',
    'Paper Title',
    'Year',
    'Full Paper or WIP?',
    'Session Name',
    'Authors',
    'Link',
    'File Name',
    'DOI',
]
df = pd.DataFrame(columns=columns)
for year, soup in ((2016, soup_2016), (2017, soup_2017), (2022, soup_2022)):
    df = add_rows(df, soup, year)
display(df)

Unnamed: 0,First Author First Name,First Author Last Name,Paper Title,Year,Full Paper or WIP?,Session Name,Authors,Link,File Name,DOI
0,Justin,Reich,The Civic Mission of MOOCs: Measuring Engageme...,2016,Full Paper,Global Village,"Justin Reich, Brandon M. Stewart, Kimia Mavon,...",https://dl.acm.org/doi/pdf/10.1145/2876034.287...,2016_The Civic Mission of MOOCs: Measuring Eng...,10.1145/2876034.2876045
1,Cynthia,Breazeal,Mobile Devices for Early Literacy Intervention...,2016,Full Paper,Global Village,"Cynthia Breazeal, Robin Morris, Stephanie Gott...",https://dl.acm.org/doi/pdf/10.1145/2876034.287...,2016_Mobile Devices for Early Literacy Interve...,10.1145/2876034.2876046
2,Ben U.,Gelman,Online Urbanism: Interest-based Subcultures as...,2016,Full Paper,Global Village,"Ben U. Gelman, Chris Beckley, Aditya Johri, Ca...",https://dl.acm.org/doi/pdf/10.1145/2876034.287...,2016_Online Urbanism: Interest-based Subcultur...,10.1145/2876034.2876052
3,Geza,Kovacs,Effects of In-Video Quizzes on MOOC Lecture Vi...,2016,Full Paper,Engagement,Geza Kovacs,https://dl.acm.org/doi/pdf/10.1145/2876034.287...,2016_Effects of In-Video Quizzes on MOOC Lectu...,10.1145/2876034.2876041
4,Eleanor,O'Rourke,Brain Points: A Deeper Look at a Growth Mindse...,2016,Full Paper,Engagement,"Eleanor O'Rourke, Erin Peach, Carol S. Dweck, ...",https://dl.acm.org/doi/pdf/10.1145/2876034.287...,2016_Brain Points: A Deeper Look at a Growth M...,10.1145/2876034.2876040
...,...,...,...,...,...,...,...,...,...,...
208,Dragos,Corlatescu,Where are the Large N Studies in Education?: I...,2022,WIP,Work in Progress,"Dragos Corlatescu, Stefan Ruseti, Irina Toma, ...",https://dl.acm.org/doi/pdf/10.1145/3491140.352...,2022_Where are the Large N Studies in Educatio...,10.1145/3491140.3528315
209,Robert,Stanyon,Demo of Graide: AI Powered Assistive Grading E...,2022,Full Paper,Demos,"Robert Stanyon, Enrico Martello, Manjinder Kai...",https://dl.acm.org/doi/pdf/10.1145/3491140.352...,2022_Demo of Graide: AI Powered Assistive Grad...,10.1145/3491140.3528263
210,Danielle R.,Chine,Development of Scenario-based Mentor Lessons: ...,2022,Full Paper,Demos,"Danielle R. Chine, Pallavi Chhabra, Adetunji A...",https://dl.acm.org/doi/pdf/10.1145/3491140.352...,2022_Development of Scenario-based Mentor Less...,10.1145/3491140.3528262
211,Scott,Bunin,Incorporating Habitats in Conceptual Models an...,2022,Full Paper,Demos,"Scott Bunin, Willventchy Celestin, Andrew Horn...",https://dl.acm.org/doi/pdf/10.1145/3491140.352...,2022_Incorporating Habitats in Conceptual Mode...,10.1145/3491140.3528261


In [8]:
# By default, the name of the downloaded file from Chrome is the last
# segment of the DOI. Record it so the downloads can be matched to rows later.
df['Original File Name'] = df['DOI'].str.split('/').str[-1] + '.pdf'

# Drop ("Year == 2022" and "Full Paper or WIP? == Full Paper") since that is not my part
df = df.drop(df[(df['Year'] == 2022) & (df['Full Paper or WIP?'] == 'Full Paper')].index)
display(df)

# Export the file. to_csv does not create missing directories, so make sure
# the output folder exists before writing.
output_path = Path('./output/2016_2017_2022.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)

Unnamed: 0,First Author First Name,First Author Last Name,Paper Title,Year,Full Paper or WIP?,Session Name,Authors,Link,File Name,DOI,Original File Name
0,Justin,Reich,The Civic Mission of MOOCs: Measuring Engageme...,2016,Full Paper,Global Village,"Justin Reich, Brandon M. Stewart, Kimia Mavon,...",https://dl.acm.org/doi/pdf/10.1145/2876034.287...,2016_The Civic Mission of MOOCs: Measuring Eng...,10.1145/2876034.2876045,2876034.2876045.pdf
1,Cynthia,Breazeal,Mobile Devices for Early Literacy Intervention...,2016,Full Paper,Global Village,"Cynthia Breazeal, Robin Morris, Stephanie Gott...",https://dl.acm.org/doi/pdf/10.1145/2876034.287...,2016_Mobile Devices for Early Literacy Interve...,10.1145/2876034.2876046,2876034.2876046.pdf
2,Ben U.,Gelman,Online Urbanism: Interest-based Subcultures as...,2016,Full Paper,Global Village,"Ben U. Gelman, Chris Beckley, Aditya Johri, Ca...",https://dl.acm.org/doi/pdf/10.1145/2876034.287...,2016_Online Urbanism: Interest-based Subcultur...,10.1145/2876034.2876052,2876034.2876052.pdf
3,Geza,Kovacs,Effects of In-Video Quizzes on MOOC Lecture Vi...,2016,Full Paper,Engagement,Geza Kovacs,https://dl.acm.org/doi/pdf/10.1145/2876034.287...,2016_Effects of In-Video Quizzes on MOOC Lectu...,10.1145/2876034.2876041,2876034.2876041.pdf
4,Eleanor,O'Rourke,Brain Points: A Deeper Look at a Growth Mindse...,2016,Full Paper,Engagement,"Eleanor O'Rourke, Erin Peach, Carol S. Dweck, ...",https://dl.acm.org/doi/pdf/10.1145/2876034.287...,2016_Brain Points: A Deeper Look at a Growth M...,10.1145/2876034.2876040,2876034.2876040.pdf
...,...,...,...,...,...,...,...,...,...,...,...
204,Jungwook,Rhim,Understanding the Relationship Between Student...,2022,WIP,Work in Progress,"Jungwook Rhim, Gahgene Gweon",https://dl.acm.org/doi/pdf/10.1145/3491140.352...,2022_Understanding the Relationship Between St...,10.1145/3491140.3528311,3491140.3528311.pdf
205,Zachary,Felker,Using a Planning Prompt Survey to Encourage Ea...,2022,WIP,Work in Progress,"Zachary Felker, Zhongzhou Chen",https://dl.acm.org/doi/pdf/10.1145/3491140.352...,2022_Using a Planning Prompt Survey to Encoura...,10.1145/3491140.3528297,3491140.3528297.pdf
206,Yu,Li,Using Chatbots to Teach Languages,2022,WIP,Work in Progress,"Yu Li, Chun-Yen Chen, Dian Yu, Sam Davidson, R...",https://dl.acm.org/doi/pdf/10.1145/3491140.352...,2022_Using Chatbots to Teach Languages.pdf,10.1145/3491140.3528329,3491140.3528329.pdf
207,Kimberly,Williamson,Large-scale Analysis of Discussion Networks in...,2022,WIP,Work in Progress,"Kimberly Williamson, René F. Kizilcec",https://dl.acm.org/doi/pdf/10.1145/3491140.352...,2022_Large-scale Analysis of Discussion Networ...,10.1145/3491140.3528321,3491140.3528321.pdf


In [None]:
# NOTE: Manual steps required!!
# The papers cannot be downloaded automatically from here,
# because the publisher's site requires authentication.
# Thus, I manually opened each paper link in the dataframe and saved it to the "papers" folder
# I used the chrome extension "Open Multiple URLs" to open all the links at once.
# I then used the chrome extension "Download All PDFs" to download all the papers at once with the default name.

In [287]:
# Script to rename the file name from "Original File Name" (DOI) to "File Name" (required by Dr Duncan)
# `os` comes from the import cell at the top of the notebook.

# Build the old-name -> new-name lookup once instead of filtering the
# dataframe for every file. The first matching row wins, and rows with a
# missing name (NaN) are left out so nothing is ever renamed to "nan".
rename_map = {}
for original_name, new_name in zip(df['Original File Name'], df['File Name']):
    if isinstance(original_name, str) and isinstance(new_name, str):
        rename_map.setdefault(original_name, new_name)

# Find all files in "./papers"
files = os.listdir('./papers')

# Loop through each file
for file in files:

    # Files that do not belong to a row of the dataframe are left alone
    new_name = rename_map.get(file)
    if new_name is None or new_name == file:
        continue

    target = f'./papers/{new_name}'

    # os.rename silently replaces an existing file on POSIX and raises on
    # Windows, so never rename onto a file that is already there.
    if os.path.exists(target):
        print(f'Skipped {file}: {new_name} already exists')
        continue

    os.rename(f'./papers/{file}', target)


In [13]:
# # Remove files that do not appear in this dataframe
# files = os.listdir('./papers')

# # Loop through each file
# for file in files:

#     # Find the row in the dataframe that matches the file
#     row = df[df['File Name'] == file]

#     # If there is no match, delete the file
#     if len(row) == 0:
#         os.remove(f'./papers/{file}')

#         print('deleted')

deleted
deleted
deleted
deleted
deleted
deleted
deleted
deleted
deleted
deleted
deleted
deleted
deleted
deleted
deleted
deleted
