# Prepare the CORA Dataset

The is the full CORA dataset version 1.0.

A link to the "baby" CORA dataset. Not used and should be the same as the included `.tsv` files from https://relational.fit.cvut.cz/dataset/CORA.

In [1]:
#!wget http://www.cs.umd.edu/~sen/lbc-proj/data/cora.tgz

## Download Raw Data

Download the full CORA dataset version 1.0 from the original source.

In [3]:
from pathlib import Path

!wget -N http://people.cs.umass.edu/~mccallum/data/cora-classify.tar.gz
!tar --skip-old-files -zxf cora-classify.tar.gz
CORA_PATH = Path('cora')

--2023-02-03 04:41:04--  http://people.cs.umass.edu/~mccallum/data/cora-classify.tar.gz
Resolving people.cs.umass.edu (people.cs.umass.edu)... 128.119.240.99
Connecting to people.cs.umass.edu (people.cs.umass.edu)|128.119.240.99|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://people.cs.umass.edu/~mccallum/data/cora-classify.tar.gz [following]
--2023-02-03 04:41:04--  https://people.cs.umass.edu/~mccallum/data/cora-classify.tar.gz
Connecting to people.cs.umass.edu (people.cs.umass.edu)|128.119.240.99|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 264768650 (253M) [application/x-gzip]
Saving to: ‘cora-classify.tar.gz’


2023-02-03 04:41:13 (27.6 MB/s) - ‘cora-classify.tar.gz’ saved [264768650/264768650]



## Process `papers`

Find all of the reference fields.

In [7]:
import re

# Open the file
tag_set = set()

with open(CORA_PATH / "papers", 'r') as file:
    for i, line in enumerate(file):
        # Use a regular expression to find all tags in the line
        tags = re.findall(r'</(.*?)>', line)
        tag_set |= set(tags)

print("Description fields in at least one paper's descriptor:")
print(tag_set)

# Make a reasonable ordering
all_tags = ['author', 'title', 'type', 'institution', 'booktitle', 'publisher', 'editor', 'address', 'journal', 'volume', 'pages', 'month', 'year', 'note']

assert set(all_tags) == set(tag_set)

Description fields in at least one paper's descriptor:
{'author', 'publisher', 'type', 'volume', 'journal', 'pages', 'editor', 'note', 'title', 'year', 'month', 'booktitle', 'institution', 'address'}


Process the entries of `papers` one line at a time, ignoring duplicate entries which have the same id.

In [19]:
import pandas as pd
import string

# Initialize an empty dataframe
df = pd.DataFrame(columns=['id', 'filename', 'reference'] + all_tags) 

missing_val = ""

last_id = None
with open(CORA_PATH / "papers", 'r') as file:
    # Loop through each line in the file
    for i, line in enumerate(file):
        #if i >= 2:
        #    break
        
        parts = line.strip().split("\t")
        
        # Many of the entries only have 2 parts. Ignore these
        if len(parts) != 3:
            continue
        
        id = parts[0]
        # Skip repeated ids
        if id == last_id:
            continue
        last_id = id
        
        filename = parts[1]
        
        # the first group matches a citation, e.g [B & G] if present
        try:
            m = re.match(r'\[(.+)\] (.+)', parts[2])
        except IndexError as e:
            print(f"Error {e} on entry {i}. Parts {parts}")
            raise e

        if m is None:
            reference = missing_val
            tagged_list = parts[2]
            #print(f'entry: {i}, {parts[2]}')
        else:
            reference = m.group(1)
            tagged_list = m.group(2)
        
        #reference = re.search(r'[(.*?)]', parts[2])
        #reference = missing_val if reference is None else reference

        #reference = parts[2].split(" ")[0].strip("[]")
        
        try:
            #details = parts[2].split("]")[1].strip().split("<")[1:]
            details = tagged_list.split("<")[1:]
            details = [x.split(">") for x in details]
            details = {x[0]: x[1].strip() for x in details}
        except IndexError as e:
            print(f"Error {e} on entry {i}. Parts {parts}")
            raise e
        
        row = {'id': id, 'filename': filename, 'reference': reference}
        
        for i, tag in enumerate(all_tags):
            row[tag] = details.get(tag, missing_val).rstrip(".,")
            
        # additional clean up for year field
        row['year'] = row['year'].strip("[()];:").rstrip(string.ascii_letters).rstrip("(),.")
        
        # author = details.get("author", "")
        # title = details.get("title", "")
        # publisher = details.get("publisher", "")
        # address = details.get("address", "")
        # year = details.get("year", "")
        # Add the values to the dataframe
        df = df.append(row, ignore_index=True)
        # {'number': number, 'url': url, 'reference': reference, 
        #                'author': author, 'title': title, 'publisher': publisher, 
        #                'address': address, 'month': month, 'year': year}, ignore_index=True)

df.set_index('id', inplace=True)
# Print the dataframe
display(df)

Unnamed: 0_level_0,filename,reference,author,title,type,institution,booktitle,publisher,editor,address,journal,volume,pages,month,year,note
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2,http:##dimacs.rutgers.edu#techps#1994#94-07.ps,Gar,M.R. Garey & D.S. Johnson,Computers and Intractibility: A Guide to the T...,,,,Freeman,,New York,,,,,1979,
16,http:##www.cs.wisc.edu#~fischer#ftp#pub#tech-r...,DeWitt90,"D. DeWitt, P. Futtersack, D. Maier, F. Velez","""A Study of Three Alternative Workstation-Serv...",,,Proceedings of the 16th International Conferec...,,,"Brisbane, Australia",,,,August,1990,
18,ftp:##ftp.cs.purdue.edu#pub#hosking#papers#oop...,Hoski93a,"A. Hosking, J. E. B. Moss","""Object Fault Handling for Persistent Programm...",,,Proceedings of the 16th International Conferec...,,,,,,pp. 288-303,,1993,
20,http:##www.pmg.lcs.mit.edu#papers#dist-mgmt.ps.gz,Liskov93,"Liskov B., Day M., Shrira L",Distributed Object Management in Thor,,,Distributed Object Management,,In M. Tamer Ozsu and Umesh Dayal and Patrick V...,"San Mateo, California",,,,,1993,
22,http:##www.pmg.lcs.mit.edu#papers#osdi94-opplo...,Otoole94,"J. O\'Toole, L. Shrira","""Opportunistic Log: Efficient Installation Rea...",,,USENIX Symposium on Operating Systems Design a...,,,,,,pp. 39-48,November,1994,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1102114,http:##wwwpub.utdallas.edu#~herve#abdi.josa.ps,29,"Valentin, D. and Abdi, H","""Can a linear autoassociator recognize faces f...",,,,,,,Journal of the Optical Society of America A 13,,717-724,,1996,
1102216,www.cs.bilkent.edu.tr#~oulusoy#jss2.ps.Z,,"Cetintemel U., Zimmermann J., Ulusoy O., and B...",OBJECTIVE: A Benchmark for Object-Oriented Act...,Technical Report BU-CEIS-9610,"Bilkent University, Ankara, Turkey",,,,,,,,,1996,
1102254,http:##www.ri.cmu.edu#afs#cs#user#kseymore#htm...,3,S. F. Chen et al,Topic Adaptation for Language Modeling Using U...,,,in Proc. ICASSP\'98,,,,,Vol. 2,pp. 681-684,May 12-15,1998,
1102262,http:##www.cs.jhu.edu#~junwu#topic-lm.ps,11,S. Khudanpur and J. Wu,"""A Maximum Entropy Language Model to Integrate...",,,Proceedings of ICASSP\'99,,,,,,pp. 553-556,,,


In [26]:
years = df['year']
print(f'There are {len(years)} entries with unique paper ids.')
W = [y for y in years if y == ""]
print(f'There are {len(W)} entries missing the publication year.')
Y = [int(y) for y in years if y.isdigit()]
print(f'There are {len(Y)} valid entries.')

bad = [y for y in years if y != "" and not y.isdigit()]
assert len(years) == len(W) + len(bad) + len(Y)
print(f'There are {len(bad)} entries with badly formed year fields.')
print(bad)
pd.Series(Y).value_counts()
Z = [y for y in Y if y is not None and 1990 <= y and y <= 1999]
#Z = [y for y in Y if y is not None]
print(len(Z))

print(min(Y))
max(Y)

df.loc[lambda df: df.year.isdigit()].shape

There are 19396 entries with unique paper ids.
There are 2957 entries missing the publication year.
There are 16424 valid entries.
There are 15 entries with badly formed year fields.
['1988/89', '1996, 1997', '1991, 1991', '1996. 1996', '19(1996', '1994, 1994', 'Oct.1994', '1994), 1994', '1997, 1997', '1987. ftp://ftp.cs.ruu.nl/pub/RUU/CS/techreps/CS-1986/1986-16.ps', '1994, pp.1901-1905', '1995. ftp://cse.ogi.edu/pub/tech-reports/1995/95-010.ps', '1995, 1995', '1998?', '807-815,1998']
15711
1913


AttributeError: 'Series' object has no attribute 'isdigit'

In [32]:
sum(df.year.apply(str.isdigit))

16424

In [None]:
import plotly.express as px

fig = px.histogram(df, x="year")
fig.show()

In [64]:
PAPERS_FILE = 'papers'

filename = CORA_PATH / PAPERS_FILE

with open(filename, 'r') as file:
    for i, line in enumerate(file):
        if i >= 2:
            break
        # process the line
        parts = line.strip().split("\t")
        print("Line number:", parts[0])
        print("URL:", parts[1])
        
        print("parts[2]:", parts[2])
        m = re.match(r'\[(\w+)\] (.+)', parts[2])
        print(m)
        print(m.group(1))
        print(m.group(2))

        m = re.match(r'[(\w+)] (.+)', parts[2])
        print(m.group(1))
        print(m.group(2))

        reference = parts[2].split(" ")[0].strip("[]")
        print(f'reference: {reference}')

        details = parts[2].split("]")[1].strip().split("<")[1:]
        details = [x.split(">") for x in details]
        details = {x[0]: x[1].strip() for x in details}
        print("Author(s):", details.get("author", ""))
        print("Title:", details.get("title", ""))
        print("Publisher:", details.get("publisher", ""))
        print("Address:", details.get("address", ""))
        print("Year:", details.get("year", ""))
        print("\n")
        
 

Line number: 2
URL: http:##dimacs.rutgers.edu#techps#1994#94-07.ps
parts[2]: [Gar] <author> M.R. Garey & D.S. Johnson, </author> <title> Computers and Intractibility: A Guide to the Theory of NP-Completeness, W.H. </title> <publisher> Freeman, </publisher> <address> New York, </address> <year> 1979. </year>
<re.Match object; span=(0, 231), match='[Gar] <author> M.R. Garey & D.S. Johnson, </autho>
Gar
<author> M.R. Garey & D.S. Johnson, </author> <title> Computers and Intractibility: A Guide to the Theory of NP-Completeness, W.H. </title> <publisher> Freeman, </publisher> <address> New York, </address> <year> 1979. </year>


AttributeError: 'NoneType' object has no attribute 'group'

In [20]:
# Open the file
with open(filename, 'r') as file:
    # Loop through each line in the file
    for i, line in enumerate(file):
        if i >= 5:
            break
        # Split the line by tab
        line_split = line.strip().split('\t')
        # Extract the tags from the line
        tags = [x.split('<')[1].split('>')[0] for x in line_split[3].split('<')[1:]]
        # Print the list of tags
        print(tags)


IndexError: list index out of range

In [10]:
tags = elements[2].split(' ')


In [11]:
tags

['[Gar]',
 '<author>',
 'M.R.',
 'Garey',
 '&',
 'D.S.',
 'Johnson,',
 '</author>',
 '<title>',
 'Computers',
 'and',
 'Intractibility:',
 'A',
 'Guide',
 'to',
 'the',
 'Theory',
 'of',
 'NP-Completeness,',
 'W.H.',
 '</title>',
 '<publisher>',
 'Freeman,',
 '</publisher>',
 '<address>',
 'New',
 'York,',
 '</address>',
 '<year>',
 '1979.',
 '</year>']

In [12]:
tags[2].strip('<author>').strip('</author>')

'M.R.'

In [13]:
tags[2]

'M.R.'