In [2]:
import os
from datetime import datetime

from sqlalchemy import create_engine
from sqlalchemy import Column, Integer, String, DateTime
from sqlalchemy import text
from sqlalchemy.orm import sessionmaker
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.engine.url import URL

import pandas as pd

In [3]:
# Location of the scraped article dump. The default is the original full local
# path used for testing; set ARTICLES_JSON_PATH to point at another copy without
# editing the notebook.
json_path = os.environ.get(
    "ARTICLES_JSON_PATH",
    "/Users/EmilyWang/Desktop/ops/paper-collector/DeepLearningArticles.json",
)

# orient='index' treats each top-level key of the JSON object as one row.
json_file = pd.read_json(json_path, orient='index')

In [4]:
# Extracting information
# Patterns are raw strings so that `\d` reaches the regex engine untouched
# (a non-raw '\d' is an invalid string escape and triggers a warning).
date_pattern = r'(\d{4}-\d{2}-\d{2})'
time_pattern = r'(\d{2}:\d{2}:\d{2})'
# New-style arXiv identifiers are YYMM.NNNN (older) or YYMM.NNNNN (newer),
# so accept 4 or 5 digits after the dot.
arxiv_id_pattern = r'(\d{4}\.\d{4,5})'
# Capture every digit after the trailing "v", so v10 is read as 10 and not 0.
# NOTE(review): assumes ids end in "v<number>" (e.g. ".../1903.05696v2") -- confirm.
version_pattern = r'v(\d+)$'

# expand=False returns a Series for a single capture group, which is what a
# column assignment expects.
json_file['published_date'] = json_file['published'].str.extract(date_pattern, expand=False)
json_file['published_time'] = json_file['published'].str.extract(time_pattern, expand=False)
json_file['updated_date'] = json_file['updated'].str.extract(date_pattern, expand=False)
json_file['updated_time'] = json_file['updated'].str.extract(time_pattern, expand=False)
json_file['unique_id'] = json_file['id'].str.extract(arxiv_id_pattern, expand=False)
json_file['version_number'] = json_file['id'].str.extract(version_pattern, expand=False)

final_json = json_file[['unique_id', 'version_number', 'author',
                        'title', 'summary', 'arxiv_comment',
                        'published_date', 'published_time',
                        'updated_date', 'updated_time']]

# unique_id becomes the table's primary key, so rows where no identifier could
# be extracted are dropped, and only the first row seen per identifier is kept.
final_json = (
    final_json
    .dropna(subset=['unique_id'])
    .drop_duplicates(subset='unique_id', keep='first')
)

In [5]:
# Local db url
# Connection settings fall back to the original local-development values but
# can be overridden through the standard libpq environment variables, so real
# credentials never need to be written into the notebook.
db_url = {'drivername': 'postgresql',  # 'postgres' is a legacy alias rejected by SQLAlchemy 1.4+
          'username': os.environ.get('PGUSER', 'postgres'),
          'password': os.environ.get('PGPASSWORD', 'postgres'),
          'host': os.environ.get('PGHOST', '127.0.0.1'),
          'port': int(os.environ.get('PGPORT', 5432))}

# SQLAlchemy 1.4+ builds URLs with URL.create(); older releases only support
# calling URL(...) directly. Use whichever constructor this install provides.
_make_url = getattr(URL, 'create', URL)
engine = create_engine(_make_url(**db_url))

# Declarative base class that the ORM models below inherit from.
Base = declarative_base()

In [6]:
class TestTable(Base):
    """ORM model for one article row stored in the "Test Table" table.

    The insertion cell fills ``id`` from the extracted ``unique_id`` and
    ``version`` from ``version_number``; the remaining article columns take
    the DataFrame columns of the same name.
    """
    # The name contains a space, so raw SQL must double-quote it ("Test Table").
    __tablename__ = 'Test Table'
    # Article identifier; primary key, so each identifier can be stored once.
    id = Column(String, primary_key=True)
    # Version of the article; required.
    version = Column(Integer, nullable=False)
    # Required descriptive fields.
    author = Column(String, nullable=False)
    title = Column(String, nullable=False)
    # Optional free-text fields.
    summary = Column(String)
    arxiv_comment = Column(String)
    # Date and time parts are kept as separate plain strings, not date/time types.
    published_date = Column(String)
    published_time = Column(String)
    updated_date = Column(String)
    updated_time = Column(String)
    # Populated by calling datetime.utcnow when no value is supplied
    # (the callable itself is passed, not its result).
    timestamp = Column(DateTime, default=datetime.utcnow)

# Create the tables for every model declared on Base, using our engine.
Base.metadata.create_all(engine)

# Session factory bound to the engine at construction time, plus one working
# session opened from it.
Session = sessionmaker(bind=engine)
session = Session()

In [7]:
# Convert the cleaned frame to one dict per article. pandas marks missing
# values with NaN; swap those for None first so they are stored as SQL NULL
# rather than being sent to the database as a float.
json_dict = (
    final_json
    .astype(object)
    .where(final_json.notna(), None)
    .to_dict('records')
)

rows = []
for record in json_dict:
    version_number = record['version_number']
    rows.append(TestTable(
        id=record['unique_id'],
        # The regex extraction yields text; the column is an Integer.
        version=int(version_number) if version_number is not None else None,
        author=record['author'],
        title=record['title'],
        summary=record['summary'],
        arxiv_comment=record['arxiv_comment'],
        published_date=record['published_date'],
        published_time=record['published_time'],
        updated_date=record['updated_date'],
        updated_time=record['updated_time'],
    ))

# Insert everything in a single transaction. On failure (e.g. re-running this
# cell against rows that already exist) roll back so the session stays usable,
# then re-raise so the error is still visible.
try:
    session.add_all(rows)
    session.commit()
except SQLAlchemyError:
    session.rollback()
    raise

In [8]:
# Testing
# Run the sanity-check query on an explicit connection: Engine.execute() and
# raw-string statements were removed in SQLAlchemy 2.0, and the context manager
# returns the connection to the pool once the rows have been printed.
with engine.connect() as connection:
    result = connection.execute(text('SELECT * FROM '
                                     '"Test Table" LIMIT 5'))
    for _r in result:
        print(_r)

('1903.05696', 2, 'Aaron Hertzmann', 'Aesthetics of Neural Network Art', 'This paper proposes a way to understand neural network artworks as\njuxtapositions of natural image cues. It is hypothesized that images with\nunusua ... (270 characters truncated) ... ariations. This analysis is applied to\nneural art based on Generative Adversarial Networks, image stylization, Deep\nDreams, and Perception Engines.', None, '2019-03-13', '19:45:54', '2019-03-18', '17:58:15', datetime.datetime(2019, 3, 20, 1, 44, 37, 106419))
('1712.05474', 3, 'Ali Farhadi', 'AI2-THOR: An Interactive 3D Environment for Visual AI', 'We introduce The House Of inteRactions (THOR), a framework for visual AI\nresearch, available at http://ai2thor.allenai.org. AI2-THOR consists of nea ... (401 characters truncated) ... rning models of cognition. The goal of AI2-THOR is to\nfacilitate building visually intelligent models and push the research forward\nin this domain.', None, '2017-12-14', '23:17:24', '2019-03-15', '18:29