In [1]:
from datetime import datetime

from sqlalchemy import create_engine
from sqlalchemy import Column, Integer, String, DateTime, ForeignKey
from sqlalchemy.orm import sessionmaker, relationship
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.engine.url import URL

import pandas as pd

In [2]:
# Used full file path for testing purpose
json_file = pd.read_json("/Users/EmilyWang/Desktop/ops/paper-collector/DeepLearningArticles.json", orient='index')
json_file.head()

Unnamed: 0,arxiv_comment,arxiv_primary_category,author,authors,id,published,summary,tags,title,updated
0,,"{'term': 'cs.AI', 'scheme': 'http://arxiv.org/...",Aaron Hertzmann,[Aaron Hertzmann],http://arxiv.org/abs/1903.05696v2,2019-03-13T19:45:54Z,This paper proposes a way to understand neural...,"[{'term': 'cs.AI', 'scheme': 'http://arxiv.org...",Aesthetics of Neural Network Art,2019-03-18T17:58:15Z
1,,"{'term': 'cs.CV', 'scheme': 'http://arxiv.org/...",Ali Farhadi,"[Eric Kolve, Roozbeh Mottaghi, Winson Han, Eli...",http://arxiv.org/abs/1712.05474v3,2017-12-14T23:17:24Z,"We introduce The House Of inteRactions (THOR),...","[{'term': 'cs.CV', 'scheme': 'http://arxiv.org...",AI2-THOR: An Interactive 3D Environment for Vi...,2019-03-15T18:29:15Z
2,arXiv admin note: substantial text overlap wit...,"{'term': 'cs.CV', 'scheme': 'http://arxiv.org/...",Kazuhiro Fukui,"[Naoya Sogi, Rui Zhu, Jing-Hao Xue, Kazuhiro F...",http://arxiv.org/abs/1903.06549v1,2019-03-14T07:14:34Z,"In this paper, we propose a method for image-s...","[{'term': 'cs.CV', 'scheme': 'http://arxiv.org...",Constrained Mutual Convex Cone Method for Imag...,2019-03-14T07:14:34Z
3,In submission,"{'term': 'cs.LG', 'scheme': 'http://arxiv.org/...",Ambuj Tewari,"[Aditya Modi, Ambuj Tewari]",http://arxiv.org/abs/1903.06187v1,2019-03-14T18:02:09Z,We consider the recently proposed reinforcemen...,"[{'term': 'cs.LG', 'scheme': 'http://arxiv.org...",Contextual Markov Decision Processes using Gen...,2019-03-14T18:02:09Z
4,Updated version: Refined analysis of primary a...,"{'term': 'cs.LG', 'scheme': 'http://arxiv.org/...",James Robinson,"[Mark Herbster, James Robinson]",http://arxiv.org/abs/1806.06439v2,2018-06-17T20:17:33Z,We address the problem of predicting the label...,"[{'term': 'cs.LG', 'scheme': 'http://arxiv.org...",Predicting Switching Graph Labelings with Clus...,2019-03-14T18:21:05Z


In [3]:
# Extracting information
json_file['published_date'] = json_file['published'].str.extract('(\d\d\d\d-\d\d-\d\d)', expand=True)
json_file['published_time'] = json_file['published'].str.extract('(\d\d:\d\d:\d\d)', expand=True)
json_file['updated_date'] = json_file['updated'].str.extract('(\d\d\d\d-\d\d-\d\d)', expand=True)
json_file['updated_time'] = json_file['updated'].str.extract('(\d\d:\d\d:\d\d)', expand=True)
json_file['unique_id'] = json_file['id'].str.extract('(\d\d\d\d\.\d\d\d\d\d)', expand=True)
json_file['version_number'] = json_file['id'].str.extract('(\d$)', expand=True)

final_json = json_file[['unique_id', 'version_number', 'author', 
                        'title', 'summary', 'arxiv_comment', 
                        'published_date', 'published_time', 
                        'updated_date', 'updated_time', 
                        'tags', 'authors']]

final_json = final_json.drop_duplicates(subset='unique_id', keep='first', inplace=False)

In [4]:
# Local db url
db_url = {'drivername': 'postgres',
          'username': 'postgres',
          'password': 'postgres',
          'host': '127.0.0.1',
          'port': 5432}
engine = create_engine(URL(**db_url))

Base = declarative_base()

In [5]:
class PaperTable(Base):
    __tablename__ = 'PaperTable'
    id = Column(String, primary_key=True)
    version = Column(Integer, nullable=False)
    author = Column(String, nullable=False)
    authors = relationship('AuthorTable', backref='PaperTable')
    tags = relationship('TagTable', backref='PaperTable')
    title = Column(String, nullable=False)
    summary = Column(String)
    arxiv_comment = Column(String)
    published_date = Column(String)
    published_time = Column(String)
    updated_date = Column(String)
    updated_time = Column(String)
    timestamp = Column(DateTime, default=datetime.utcnow)

class AuthorTable(Base):
    __tablename__ = 'AuthorTable'
    id = Column(String, primary_key=True)
    author = Column(String, nullable=False)
    paper_id = Column(String, ForeignKey('PaperTable.id'))

class TagTable(Base):
    __tablename__ = 'TagTable'
    id = Column(String, primary_key=True)
    paper_tag = Column(String, nullable=False)
    paper_id = Column(String, ForeignKey('PaperTable.id'))

# create tables
Base.metadata.create_all(bind=engine)

# create session
Session = sessionmaker()
Session.configure(bind=engine)
session = Session()

In [6]:
# Adding records into Paper Table
json_dict = final_json.to_dict('records')

for i in range(len(json_dict)):
    row = PaperTable(id = json_dict[i]['unique_id'], 
                    version = json_dict[i]['version_number'], 
                    author = json_dict[i]['author'], 
                    title = json_dict[i]['title'], 
                    summary = json_dict[i]['summary'], 
                    arxiv_comment = json_dict[i]['arxiv_comment'], 
                    published_date = json_dict[i]['published_date'], 
                    published_time = json_dict[i]['published_time'], 
                    updated_date = json_dict[i]['updated_date'], 
                    updated_time = json_dict[i]['updated_time'])
    session.add(row)
    
session.commit()

In [7]:
def extract_category(json_file):
    ids = json_file['unique_id'].tolist()
    tags = json_file['tags'].tolist()
    tags_list = []
    for i in range(len(tags)):
        tags_list.append([tags[i][j]['term'] for j in range(len(tags[i]))])
    
    tags_dict = {}
    for i in range(len(ids)):
        tags_dict[ids[i]] = tags_list[i]
    return tags_dict

tags_dict = extract_category(final_json)
tags_dict

{'1903.05696': ['cs.AI', 'cs.CV'],
 '1712.05474': ['cs.CV', 'cs.AI', 'cs.LG'],
 '1903.06549': ['cs.CV', 'cs.LG'],
 '1903.06187': ['cs.LG', 'cs.AI', 'stat.ML'],
 '1806.06439': ['cs.LG', 'stat.ML'],
 '1809.09061': ['cs.CV'],
 '1707.02469': ['cs.LG', 'cs.NE'],
 '1903.06236': ['cs.LG', 'stat.ML'],
 '1901.04555': ['cs.SD', 'cs.LG', 'cs.MM', 'eess.AS', 'stat.ML'],
 '1903.06275': ['cs.CV'],
 '1903.06278': ['cs.RO', 'cs.AI', 'cs.LG'],
 '1903.06282': ['cs.RO', 'cs.AI', 'cs.LG'],
 '1810.12278': ['cs.LG', 'stat.ML'],
 '1903.06315': ['cs.CV'],
 '1903.05926': ['cs.LG', 'cs.AI', 'stat.ML'],
 '1903.06333': ['cs.IT', 'cs.LG', 'math.IT'],
 '1903.06336': ['stat.ML', 'cs.LG'],
 '1903.06342': ['cs.CV'],
 '1810.05943': ['cs.CV'],
 '1810.06118': ['cond-mat.mtrl-sci', 'cs.LG', 'physics.data-an', 'stat.ML'],
 '1903.06372': ['cs.LG', 'stat.ML'],
 '1811.01506': ['cs.LG', 'stat.ML'],
 '1903.05980': ['cs.SI', 'cs.LG', 'stat.ML'],
 '1903.00875': ['cs.CV'],
 '1903.06399': ['cs.CV'],
 '1809.03408': ['cs.CL', 'cs.CV'

In [8]:
def extract_authors(json_file):
    ids = json_file['unique_id'].tolist()
    authors = json_file['authors'].tolist()
    
    authors_dict = {}
    for i in range(len(ids)):
        authors_dict[ids[i]] = authors[i]
    return authors_dict

author_dict = extract_authors(final_json)

In [9]:
# Adding records into Author Table
for key, val in author_dict.items():
    for i in range(len(val)):
        id_str = str(key) + "-" + str(i)
        row = AuthorTable(id = id_str, 
                         author = val[i], 
                         paper_id = key)
        
        session.add(row)
    
session.commit()

In [10]:
# Adding records into Tag Table
for key, val in tags_dict.items():
    for i in range(len(val)):
        id_str = str(key) + "-" + str(i)
        row = TagTable(id = id_str, 
                       paper_tag = val[i], 
                       paper_id = key)
        
        session.add(row)
    
session.commit()

In [12]:
# Testing
result = engine.execute('SELECT * FROM '
                        '"PaperTable" LIMIT 5')
for _r in result:
   print(_r)

('1903.05696', 2, 'Aaron Hertzmann', 'Aesthetics of Neural Network Art', 'This paper proposes a way to understand neural network artworks as\njuxtapositions of natural image cues. It is hypothesized that images with\nunusua ... (270 characters truncated) ... ariations. This analysis is applied to\nneural art based on Generative Adversarial Networks, image stylization, Deep\nDreams, and Perception Engines.', None, '2019-03-13', '19:45:54', '2019-03-18', '17:58:15', datetime.datetime(2019, 3, 21, 4, 4, 6, 283248))
('1712.05474', 3, 'Ali Farhadi', 'AI2-THOR: An Interactive 3D Environment for Visual AI', 'We introduce The House Of inteRactions (THOR), a framework for visual AI\nresearch, available at http://ai2thor.allenai.org. AI2-THOR consists of nea ... (401 characters truncated) ... rning models of cognition. The goal of AI2-THOR is to\nfacilitate building visually intelligent models and push the research forward\nin this domain.', None, '2017-12-14', '23:17:24', '2019-03-15', '18:29:1

In [13]:
# Testing if foreign keys/relationships work and it does!
query = session.query(PaperTable, AuthorTable, TagTable).limit(10)
for _p, _a, _t in query.all():
    print(_p.id, _a.author, _t.paper_tag)

1903.05696 Aaron Hertzmann cs.AI
1903.05696 Eric Kolve cs.AI
1903.05696 Roozbeh Mottaghi cs.AI
1903.05696 Winson Han cs.AI
1903.05696 Eli VanderBilt cs.AI
1903.05696 Luca Weihs cs.AI
1903.05696 Alvaro Herrasti cs.AI
1903.05696 Daniel Gordon cs.AI
1903.05696 Yuke Zhu cs.AI
1903.05696 Abhinav Gupta cs.AI
