<a id="top"></a>

<div class="list-group" id="list-tab" role="tablist">
<h3 class="list-group-item list-group-item-action active" data-toggle="list" style='color:white; background:#7a96ea; border:0' role="tab" aria-controls="home"><center><font color = 'white'>Quick Navigation</font></center></h3>

* [1. Mapping Faculty.csv to DBLP Names and PID](#0)
* [2. Exporting Each XML to CSV](#1)


In [1]:
import pandas as pd
import xml.etree.ElementTree as ET
from collections import defaultdict
import numpy as np

import os
from tqdm import tqdm

<a id="0"></a>
<h2 style='background:#7a96ea; border:0; color:white'><center><font color = 'white'>1. Mapping Faculty.csv to DBLP Names and PID</font></center></h2>

In [2]:
# Load the faculty roster. Names are whitespace-trimmed because later cells
# compare them against the XML file names with an exact string match.
fac_df = pd.read_csv('../data/Faculty.csv')
fac_df['Faculty'] = fac_df['Faculty'].str.strip()
# Empty placeholder for the DBLP person id. It is sized from the frame's own
# index (not a hard-coded row count) and uses object dtype because the values
# assigned later are strings such as '66/549'.
fac_df['author-pid'] = pd.Series(np.nan, index=fac_df.index, dtype=object)
fac_df.head()


Unnamed: 0,Faculty,Position,Gender,Management,DBLP,Area,Unnamed: 6,Unnamed: 7,Unnamed: 8,author-pid
0,A S Madhukumar,Associate Professor,M,N,https://dblp.uni-trier.de/pers/m/Madhukumar:A=...,Computer Networks,,,,
1,Alexei Sourin,Associate Professor,M,N,https://dblp.org/pers/s/Sourin:Alexei.html,Computer Graphics,,,,
2,Anupam Chattopadhyay,Associate Professor,M,N,https://dblp.org/pers/c/Chattopadhyay:Anupam.html,Computer Architecture,,,,
3,Anwitaman Datta,Associate Professor,M,N,https://dblp.org/pers/d/Datta:Anwitaman.html,Distributed Systems,,,,
4,Arijit Khan,Assistant Professor,M,N,https://dblp.org/pers/k/Khan:Arijit.html,Data Management,,,,


In [3]:
# Sanity check: every entry in ../xml/ should be named "<Faculty>.xml", where
# <Faculty> is a value of the Faculty column in Faculty.csv.
# Note that Tay Kian Boon does not have an XML file.
fac = defaultdict(set)  # not used in this cell; kept in case a later cell relies on it
known_faculty = set(fac_df['Faculty'])  # built once instead of filtering the frame per file
found = 0
for f in os.listdir('../xml/'):
    stem = f.replace('.xml', '')
    if stem in known_faculty:
        found += 1
    else:
        print("Unmatched Files:", stem)
print("Total matches found between .XML file names and fac_df:", found)

Unmatched Files: Problematic
Total matches found between .XML file names and fac_df: 84


In [4]:
# Fill 'author-pid' for each faculty member from the `pid` attribute on the
# root element of the matching XML file, then persist the result.

# The pids are strings; make sure the target column can hold them.
fac_df['author-pid'] = fac_df['author-pid'].astype(object)

for f in os.listdir('../xml/'):
    cur_f = f.replace('.xml', '')
    try:
        root = ET.parse('../xml/'+f).getroot()
        pid = root.attrib['pid']
    except (OSError, ET.ParseError, KeyError) as err:
        # Best-effort: entries that cannot be opened, are not well-formed XML,
        # or whose root has no `pid` are skipped -- but reported, not hidden.
        print("Skipped", f, "-", repr(err))
        continue
    fac_df.loc[fac_df['Faculty']==cur_f, 'author-pid'] = pid

print("Number of rows with 'author-pid' filled up:", len(fac_df.loc[~fac_df['author-pid'].isnull()]))
print("Number of rows without 'author-pid' filled up:", len(fac_df.loc[fac_df['author-pid'].isnull()]))
# This cell overwrites the CSV it was loaded from, so on a re-run the
# 'Unnamed: N' columns are already gone; errors='ignore' keeps the drop idempotent.
fac_df = fac_df.drop(columns=['Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8'], errors='ignore')
fac_df.to_csv('../data/Faculty.csv', index=False)
fac_df.head()



Number of rows with 'author-pid' filled up: 84
Number of rows without 'author-pid' filled up: 1


Unnamed: 0,Faculty,Position,Gender,Management,DBLP,Area,author-pid
0,A S Madhukumar,Associate Professor,M,N,https://dblp.uni-trier.de/pers/m/Madhukumar:A=...,Computer Networks,66/549
1,Alexei Sourin,Associate Professor,M,N,https://dblp.org/pers/s/Sourin:Alexei.html,Computer Graphics,15/3108
2,Anupam Chattopadhyay,Associate Professor,M,N,https://dblp.org/pers/c/Chattopadhyay:Anupam.html,Computer Architecture,99/4535
3,Anwitaman Datta,Associate Professor,M,N,https://dblp.org/pers/d/Datta:Anwitaman.html,Distributed Systems,d/AnwitamanDatta
4,Arijit Khan,Assistant Professor,M,N,https://dblp.org/pers/k/Khan:Arijit.html,Data Management,67/2933


[Back to Top](#top)
<a id="1"></a>
<h2 style='background:#7a96ea; border:0; color:white'><center><font color = 'white'>2. Exporting Each XML to CSV</font></center></h2>

In [5]:
def dblp_rows(root):
    """Flatten a parsed DBLP person XML tree into one record per (author, paper).

    Parameters
    ----------
    root : xml.etree.ElementTree.Element
        Root element of the file. Only its direct children tagged ``r`` are read.

    Returns
    -------
    list of dict
        Records with keys 'author', 'author-pid', 'paper', 'conference',
        'year' and 'title'. Both ``author`` and ``editor`` elements produce a row.

    Raises
    ------
    KeyError
        If a publication element has no ``key`` attribute, or an
        author/editor element has no ``pid`` attribute.
    """
    rows = []
    for node1 in root:
        if node1.tag != "r":
            continue
        # Per-record defaults, used when the corresponding child tag is absent.
        paper = ""
        author = []
        author_pid = []
        conference = ""
        year = 0
        title = ""
        for node2 in node1:
            paper = node2.attrib['key']
            for node3 in node2:
                if node3.tag == "author" or node3.tag == "editor":
                    author.append(node3.text)
                    author_pid.append(node3.attrib['pid'])
                elif node3.tag == "booktitle":
                    conference = node3.text
                elif node3.tag == "year":
                    year = node3.text
                elif node3.tag == "title":
                    # .text stops at the first nested element (e.g. <i>, <sub>),
                    # which would truncate the title; itertext() keeps all of it.
                    title = "".join(node3.itertext())

        for name, pid in zip(author, author_pid):
            rows.append({
                'author': name,
                'author-pid': pid,
                'paper': paper,
                'conference': conference,
                'year': year,
                'title': title
            })
    return rows


# NOTE(review): the original wrote to an absolute path on one machine ending in
# ".../Project/data/". '../data/' is assumed to be that same folder (it is where
# Faculty.csv is read from) -- confirm before relying on the output location.
OUTPUT_DIR = '../data/'
cols = ['author', 'author-pid', 'paper', 'conference', 'year', 'title']

for f in tqdm(os.listdir('../xml/')):
    try:
        root = ET.parse('../xml/'+f).getroot()
        rows = dblp_rows(root)
    except (OSError, ET.ParseError, KeyError) as err:
        # Best-effort export: an unreadable or unexpected entry is skipped, as
        # before, but reported so it does not go unnoticed.
        tqdm.write("Skipped {}: {!r}".format(f, err))
        continue
    df = pd.DataFrame(rows, columns=cols)
    name = f.replace('.xml', '.csv')
    df.to_csv(os.path.join(OUTPUT_DIR, name), index=False)

100%|██████████████████████████████████████████████████████████████████████████████████| 85/85 [00:01<00:00, 79.36it/s]
