<a href="https://colab.research.google.com/github/vrathi101/dblp_analysis/blob/main/ResearcherDBLP_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
import requests
import gzip
from lxml import etree
import pandas as pd

**Get DBLP XML Data**
The dblp computer science bibliography provides open bibliographic information on major computer science journals and proceedings

In [23]:
#url of the gzipped XML file
url = "https://dblp.uni-trier.de/xml/dblp.xml.gz"
#make a GET request to fetch the gzipped XML content
response = requests.get(url)
#save the content to a file
with open("dblp.xml.gz", "wb") as f:
  f.write(response.content)

In [24]:
#save the content to a file
with open("dblp.xml.gz", "wb") as f:
  f.write(response.content)

Read the Researchers Google Sheet as a csv  - This CSV contains data about researchers (committee members) from CoNEXT, Sigcomm, and NSDI PC. We want to find their publications in DBLP XML.

In [25]:
from google.colab import drive
drive.mount('/content/gdrive')
proj_path = '/content/gdrive/MyDrive/Colab Notebooks/save_data/'
dfResearcher = pd.read_csv(proj_path + 'researchers.csv')
dfResearcher.head()

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Unnamed: 0,Name,Google Scholar Link,email,Institution,Expertise,Region,Approximate PhD Graduation Year,Female or NB,Minority (Race if known),Industry
0,Sergey Gorinsky,,sergey.gorinsky@imdea.org,IMDEA,"CDN,CLOUD-DC,ECONOMICS,",EU,senior,,,
1,Hitesh Ballani,,hitesh.ballani@microsoft.com,Microsoft Research,"CLOUD-DC,ROUTING,",EU,senior,,,
2,Haya Shulman,,haya.shulman@sit.fraunhofer.de,Fraunhofer and Hebrew university of Jerusalem,"SECURITY,",EU,,1.0,,1.0
3,Rüdiger Birkner,,mail@rbirkner.ch,???,"SDN,",EU,2018,,,
4,Klaus Wehrle,,wehrle@comsys.rwth-aachen.de,Aachen University,"VERIFICATION,SECURITY,",EU,senior,,,


Remove some unnecessary columns

In [26]:
dfResearcher = dfResearcher.drop(['Google Scholar Link', 'Minority (Race if known)'], axis=1, errors='ignore')
dfResearcher.head()

Unnamed: 0,Name,email,Institution,Expertise,Region,Approximate PhD Graduation Year,Female or NB,Industry
0,Sergey Gorinsky,sergey.gorinsky@imdea.org,IMDEA,"CDN,CLOUD-DC,ECONOMICS,",EU,senior,,
1,Hitesh Ballani,hitesh.ballani@microsoft.com,Microsoft Research,"CLOUD-DC,ROUTING,",EU,senior,,
2,Haya Shulman,haya.shulman@sit.fraunhofer.de,Fraunhofer and Hebrew university of Jerusalem,"SECURITY,",EU,,1.0,1.0
3,Rüdiger Birkner,mail@rbirkner.ch,???,"SDN,",EU,2018,,
4,Klaus Wehrle,wehrle@comsys.rwth-aachen.de,Aachen University,"VERIFICATION,SECURITY,",EU,senior,,


Merge the XML and the CSV and print few records

In [28]:
import random
import gzip
from lxml import etree

# Function to extract author names
def extract_authors(element):
    authors = []
    for author_elem in element.xpath("author"):
        authors.append(author_elem.text)
    return authors

# Create a dictionary to store XML records indexed by researcher name
xml_records = {}

# Process the DBLP XML file
with gzip.open("dblp.xml.gz", "rb") as f:
    context = etree.iterparse(f, events=("start", "end"), recover=True)

    for event, element in context:
        if element.tag == 'article':
            # Extract the article key, title, authors, and year
            article_key = element.get('key')
            article_title = element.xpath('title')
            if article_title:
                article_title = article_title[0].text
            else:
                article_title = 'N/A'
            article_authors = extract_authors(element)
            article_year = element.xpath('year')
            if article_year:
                article_year = article_year[0].text
            else:
                article_year = 'N/A'

            # Add the record to the dictionary indexed by each author's name
            for author in article_authors:
                if author not in xml_records:
                    xml_records[author] = []
                xml_records[author].append({
                    'key': article_key,
                    'title': article_title,
                    'authors': article_authors,
                    'year': article_year
                })

        # Clear the processed elements to free up memory
        element.clear()

# Get the common authors between CSV and DBLP
common_authors = set(dfResearcher['Name']).intersection(xml_records.keys())

# Print three random authors from the common set
random_authors = random.sample(common_authors, k=3)

for author in random_authors:
    # Print the researcher name and their corresponding information
    print('Researcher:', author)
    print('Google Sheets Data:')
    print(dfResearcher[dfResearcher['Name'] == author])
    print('XML Records:')
    for record in xml_records[author]:
        print(record)
    print('---')


Researcher: Krishna P. Gummadi
Google Sheets Data:
                   Name                email            Institution  \
496  Krishna P. Gummadi  gummadi@mpi-sws.org  Max Planck Institute    

                     Expertise Region Approximate PhD Graduation Year  \
496  SYSTEMS,SECURITY,NETWORKS     EU                            2005   

     Female or NB  Industry  
496           NaN       1.0  
XML Records:
{'key': 'journals/jmlr/ZafarVGG19', 'title': 'Fairness Constraints: A Flexible Approach for Fair Classification.', 'authors': ['Muhammad Bilal Zafar', 'Isabel Valera', 'Manuel Gomez-Rodriguez', 'Krishna P. Gummadi'], 'year': '2019'}
{'key': 'journals/internet/MondalMGGK17', 'title': 'Longitudinal Privacy Management in Social Media: The Need for Better Controls.', 'authors': ['Mainack Mondal', 'Johnnatan Messias', 'Saptarshi Ghosh 0001', 'Krishna P. Gummadi', 'Aniket Kate'], 'year': '2017'}
{'key': 'journals/popets/MinaeiMLGK19', 'title': 'Lethe: Conceal Content Deletion from Pers

since Python 3.9 and will be removed in a subsequent version.
  random_authors = random.sample(common_authors, k=3)


**Merge the XML and the CSV to reate three output files:**

*   Matched with data from Researcher CSV & DBLP XML - all the publications detailed are pulled from the XML and listed for each researcher
*   Unmatched file - Researcher name does not match with DBLP XML, needs to fix input data file, such as spelling, special characters, etc.
*   Matched file with just name - We can use this file for further fine tuning the program. We will skip records from processing, if found in match file. This enhancement is not there yet.

In [30]:
import gzip
from lxml import etree
from google.colab import drive
import unicodedata

# Mount Google Drive
drive.mount('/content/drive')

# Function to extract author names
def extract_authors(element):
    authors = []
    for author_elem in element.xpath("author"):
        authors.append(author_elem.text)
    return authors

# Create a dictionary to store XML records indexed by researcher name
xml_records = {}

# Process the DBLP XML file
with gzip.open("dblp.xml.gz", "rb") as f:
    context = etree.iterparse(f, events=("start", "end"), recover=True)

    for event, element in context:
        if element.tag == 'article':
            # Extract the article key, title, authors, and year
            article_key = element.get('key')
            article_title = element.xpath('title')
            if article_title:
                article_title = article_title[0].text
            else:
                article_title = 'N/A'
            article_authors = extract_authors(element)
            article_year = element.xpath('year')
            if article_year:
                article_year = article_year[0].text
            else:
                article_year = 'N/A'

            # Add the record to the dictionary indexed by each author's name
            for author in article_authors:
                if author not in xml_records:
                    xml_records[author] = []
                xml_records[author].append({
                    'key': article_key,
                    'title': article_title,
                    'authors': article_authors,
                    'year': article_year
                })

        # Clear the processed elements to free up memory
        element.clear()

# Create files in Google Drive to store the results
output_file_path = '/content/drive/MyDrive/Output.txt'
matches_file_path = '/content/drive/MyDrive/Matches.txt'
non_matches_file_path = '/content/drive/MyDrive/NonMatches.txt'

# Open the files in write mode
with open(output_file_path, 'w') as output_file, \
     open(matches_file_path, 'w') as matches_file, \
     open(non_matches_file_path, 'w') as non_matches_file:

    for _, row in dfResearcher.iterrows():
        researcher_name = row['Name']
        encoded_researcher_name = unicodedata.normalize('NFKD', researcher_name).encode('utf-8', 'ignore').decode('utf-8')

        # Check if the researcher exists in the XML records
        if researcher_name in xml_records:
            # Write the researcher name and their corresponding information to the output file
            output_file.write('Researcher: {}\n'.format(researcher_name))
            output_file.write('Google Sheets Data:\n')
            output_file.write(row.to_string())
            output_file.write('\n')
            output_file.write('XML Records:\n')
            for record in xml_records[researcher_name]:
                output_file.write(str(record))
                output_file.write('\n')
            output_file.write('---\n')

            # Write the researcher name to the matches file
            matches_file.write(str(researcher_name))
            matches_file.write('\n')
        else:
            # Write the researcher name to the non-matches file
            non_matches_file.write(str(researcher_name))
            non_matches_file.write('\n')

# Print the paths of the output files in Google Drive
print('Output file saved to:', output_file_path)
print('Matches file saved to:', matches_file_path)
print('Non-Matches file saved to:', non_matches_file_path)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Output file saved to: /content/drive/MyDrive/Output.txt
Matches file saved to: /content/drive/MyDrive/Matches.txt
Non-Matches file saved to: /content/drive/MyDrive/NonMatches.txt
