In [1]:
from datetime import datetime

from sqlalchemy import create_engine
from sqlalchemy import Column, Integer, String, DateTime, ForeignKey
from sqlalchemy.orm import sessionmaker, relationship, backref
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.engine.url import URL

import pandas as pd

In [2]:
# Used full file path for testing purpose
json_file = pd.read_json("/Users/EmilyWang/Downloads/ops-master/paper-collector/DeepLearningArticles.json", orient='index')
json_file.head()

Unnamed: 0,arxiv_comment,arxiv_primary_category,author,authors,id,published,summary,tags,title,updated
0,"21 pages, 6 figures","{'term': 'cs.LG', 'scheme': 'http://arxiv.org/...",Raman Arora,"[Nikita Ivkin, Daniel Rothchild, Enayat Ullah,...",http://arxiv.org/abs/1903.04488v1,2019-03-12T17:59:48Z,Large-scale distributed training of neural net...,"[{'term': 'cs.LG', 'scheme': 'http://arxiv.org...",Communication-efficient distributed SGD with S...,2019-03-12T17:59:48Z
1,Accepted At THE 1ST WORKSHOP ON ENERGY EFFICIE...,"{'term': 'cs.CV', 'scheme': 'http://arxiv.org/...",Mickey Aleksic,"[Tao Sheng, Chen Feng, Shaojie Zhuo, Xiaopeng ...",http://arxiv.org/abs/1803.08607v3,2018-03-22T23:06:38Z,As deep learning (DL) is being rapidly pushed ...,"[{'term': 'cs.CV', 'scheme': 'http://arxiv.org...",A Quantization-Friendly Separable Convolution ...,2019-03-12T17:58:19Z
2,,"{'term': 'cs.NE', 'scheme': 'http://arxiv.org/...",Pavlos Protopapas,"[Jacob Reinier Maat, Nikos Gianniotis, Pavlos ...",http://arxiv.org/abs/1903.05071v1,2019-03-12T17:27:19Z,Echo State Networks (ESNs) are recurrent neura...,"[{'term': 'cs.NE', 'scheme': 'http://arxiv.org...",Efficient Optimization of Echo State Networks ...,2019-03-12T17:27:19Z
3,,"{'term': 'cs.LG', 'scheme': 'http://arxiv.org/...",Daniel Wintz,"[Michael Lingzhi Li, Elliott Wolf, Daniel Wintz]",http://arxiv.org/abs/1903.05063v1,2019-03-12T17:12:07Z,Optimizing storage assignment is a central pro...,"[{'term': 'cs.LG', 'scheme': 'http://arxiv.org...",Application of Duration-of-Stay Storage Assign...,2019-03-12T17:12:07Z
4,CVPR 2019,"{'term': 'cs.CV', 'scheme': 'http://arxiv.org/...",Andrei Bursuc,"[Yann Lifchitz, Yannis Avrithis, Sylvaine Pica...",http://arxiv.org/abs/1903.05050v1,2019-03-12T16:58:08Z,Training deep neural networks from few example...,"[{'term': 'cs.CV', 'scheme': 'http://arxiv.org...",Dense Classification and Implanting for Few-Sh...,2019-03-12T16:58:08Z


In [3]:
# Extracting information
json_file['published_date'] = json_file['published'].str.extract('(\d\d\d\d-\d\d-\d\d)', expand=True)
json_file['published_time'] = json_file['published'].str.extract('(\d\d:\d\d:\d\d)', expand=True)
json_file['updated_date'] = json_file['updated'].str.extract('(\d\d\d\d-\d\d-\d\d)', expand=True)
json_file['updated_time'] = json_file['updated'].str.extract('(\d\d:\d\d:\d\d)', expand=True)
json_file['unique_id'] = json_file['id'].str.extract('(\d\d\d\d\.\d\d\d\d\d)', expand=True)
json_file['version_number'] = json_file['id'].str.extract('(\d$)', expand=True)

final_json = json_file[['unique_id', 'version_number', 'author', 
                        'title', 'summary', 'arxiv_comment', 
                        'published_date', 'published_time', 
                        'updated_date', 'updated_time', 
                        'tags', 'authors']]

final_json = final_json.drop_duplicates(subset='unique_id', keep='first', inplace=False)

In [4]:
# Local db url
db_url = {'drivername': 'postgres',
          'username': 'postgres',
          'password': 'postgres',
          'host': '127.0.0.1',
          'port': 5432}
engine = create_engine(URL(**db_url))

Base = declarative_base()

In [5]:
class AuthorTable(Base):
    __tablename__ = 'AuthorTable'
    id = Column(String, primary_key=True)
    author = Column(String, nullable=False)
    paper_id = Column(String, ForeignKey('PaperTable.id'))

class TagTable(Base):
    __tablename__ = 'TagTable'
    id = Column(String, primary_key=True)
    paper_tag = Column(String, nullable=False)
    paper_id = Column(String, ForeignKey('PaperTable.id'))

class PaperTable(Base):
    __tablename__ = 'PaperTable'
    id = Column(String, primary_key=True)
    version = Column(Integer, nullable=False)
    author = Column(String, nullable=False)
    authors = relationship(AuthorTable, 
                           primaryjoin=id==AuthorTable.paper_id, 
                           post_update=True)
    tags = relationship(TagTable, 
                        primaryjoin=id==TagTable.paper_id, 
                        post_update=True)
    title = Column(String, nullable=False)
    summary = Column(String)
    arxiv_comment = Column(String)
    published_date = Column(String)
    published_time = Column(String)
    updated_date = Column(String)
    updated_time = Column(String)
    timestamp = Column(DateTime, default=datetime.utcnow)

# create tables
Base.metadata.create_all(bind=engine)

# create session
Session = sessionmaker()
Session.configure(bind=engine)
session = Session()

In [6]:
# Adding records into Paper Table
json_dict = final_json.to_dict('records')

for i in range(len(json_dict)):
    row = PaperTable(id = json_dict[i]['unique_id'], 
                    version = json_dict[i]['version_number'], 
                    author = json_dict[i]['author'], 
                    title = json_dict[i]['title'], 
                    summary = json_dict[i]['summary'], 
                    arxiv_comment = json_dict[i]['arxiv_comment'], 
                    published_date = json_dict[i]['published_date'], 
                    published_time = json_dict[i]['published_time'], 
                    updated_date = json_dict[i]['updated_date'], 
                    updated_time = json_dict[i]['updated_time'])
    session.add(row)
    
session.commit()

In [7]:
def extract_category(json_file):
    ids = json_file['unique_id'].tolist()
    tags = json_file['tags'].tolist()
    tags_list = []
    for i in range(len(tags)):
        tags_list.append([tags[i][j]['term'] for j in range(len(tags[i]))])
    
    tags_dict = {}
    for i in range(len(ids)):
        tags_dict[ids[i]] = tags_list[i]
    return tags_dict

tags_dict = extract_category(final_json)
tags_dict

{'1903.04488': ['cs.LG', 'cs.AI', 'cs.DC', 'math.OC', 'stat.ML'],
 '1803.08607': ['cs.CV'],
 '1903.05071': ['cs.NE', 'cs.LG', 'stat.ML'],
 '1903.05063': ['cs.LG', 'stat.ML'],
 '1903.05050': ['cs.CV'],
 '1903.04268': ['math.OC', 'cs.LG', 'stat.ML'],
 '1903.04988': ['cs.CV', 'cs.LG'],
 '1903.04959': ['cs.LG', 'cs.AI', 'cs.MA', 'stat.ML'],
 '1902.09868': ['cs.CV'],
 '1903.04932': ['cs.RO', 'cs.LG'],
 '1903.03911': ['cs.CV', 'cs.GR'],
 '1902.10467': ['cs.CV'],
 '1809.09970': ['cs.CV'],
 '1902.09980': ['cs.AI', 'cs.LG', 'I.2.6; I.2.8'],
 '1903.04797': ['stat.ML', 'cs.LG', 'stat.CO'],
 '1902.01194': ['cs.LG', 'stat.ML'],
 '1903.04778': ['cs.CV'],
 '1903.04774': ['cs.LG', 'stat.ML'],
 '1903.04772': ['cs.CV'],
 '1811.11615': ['cs.RO', 'cs.LG'],
 '1903.01003': ['cs.LG', 'cs.AI'],
 '1903.04717': ['cs.LG', 'cs.CR', 'stat.ML'],
 '1903.04714': ['cs.LG', 'cs.AI', 'cs.MA'],
 '1903.04711': ['cs.CV', 'cs.LG', 'cs.NE'],
 '1903.04019': ['cs.CV'],
 '1903.03313': ['cs.CV'],
 '1903.04704': ['cs.CV'],
 '1806

In [8]:
def extract_authors(json_file):
    ids = json_file['unique_id'].tolist()
    authors = json_file['authors'].tolist()
    
    authors_dict = {}
    for i in range(len(ids)):
        authors_dict[ids[i]] = authors[i]
    return authors_dict

author_dict = extract_authors(final_json)

In [9]:
# Adding records into Author Table
for key, val in author_dict.items():
    for i in range(len(val)):
        id_str = str(key) + "-" + str(i)
        row = AuthorTable(id = id_str, 
                         author = val[i], 
                         paper_id = key)
        
        session.add(row)
    
session.commit()

In [10]:
# Adding records into Tag Table
for key, val in tags_dict.items():
    for i in range(len(val)):
        id_str = str(key) + "-" + str(i)
        row = TagTable(id = id_str, 
                       paper_tag = val[i], 
                       paper_id = key)
        
        session.add(row)
    
session.commit()

In [11]:
# Testing
result = engine.execute('SELECT * FROM '
                        '"PaperTable" LIMIT 5')
for _r in result:
   print(_r)

('1903.04488', 1, 'Raman Arora', 'Communication-efficient distributed SGD with Sketching', 'Large-scale distributed training of neural networks is often limited by\nnetwork bandwidth, wherein the communication time overwhelms the local\ncomp ... (894 characters truncated) ... compression\nratio of 4, or about 1 percentage point drop with a compression ratio of 8. We\nalso demonstrate that our method scales to many workers.', '21 pages, 6 figures', '2019-03-12', '17:59:48', '2019-03-12', '17:59:48', datetime.datetime(2019, 3, 24, 18, 19, 51, 204973))
('1803.08607', 3, 'Mickey Aleksic', 'A Quantization-Friendly Separable Convolution for MobileNets', 'As deep learning (DL) is being rapidly pushed to edge computing, researchers\ninvented various ways to make inference computation more efficient on\n ... (730 characters truncated) ... eNet2012 dataset, our modified MobileNetV1\nmodel can archive 8-bit inference top-1 accuracy in 68.03%, almost closed the\ngap to the float pipeline.', 'Accep

In [12]:
# Testing if foreign keys/relationships work and it does!
query = session.query(PaperTable, AuthorTable, TagTable).limit(10)
for _p, _a, _t in query.all():
    print(_p.id, _a.author, _t.paper_tag)

1903.04488 Nikita Ivkin cs.LG
1903.04488 Daniel Rothchild cs.LG
1903.04488 Enayat Ullah cs.LG
1903.04488 Vladimir Braverman cs.LG
1903.04488 Ion Stoica cs.LG
1903.04488 Raman Arora cs.LG
1903.04488 Tao Sheng cs.LG
1903.04488 Chen Feng cs.LG
1903.04488 Shaojie Zhuo cs.LG
1903.04488 Xiaopeng Zhang cs.LG


In [13]:
# Import modules
from sqlalchemy import create_engine, MetaData, Table, update
from sqlalchemy import Column, Integer, String, DateTime, ForeignKey
from sqlalchemy.orm import sessionmaker, relationship
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.engine.url import URL

import arxiv
import pandas as pd
from datetime import datetime

def create_tables():
    # Initiate base
    Base = declarative_base()

    # Create tables
    class PaperTable(Base):
        __tablename__ = 'PaperTable'
        id = Column(String, primary_key=True)
        version = Column(Integer, nullable=False)
        author = Column(String, nullable=False)
        authors = relationship(AuthorTable, 
                               primaryjoin=id==AuthorTable.paper_id, 
                               post_update=True)
        tags = relationship(TagTable, 
                            primaryjoin=id==TagTable.paper_id, 
                            post_update=True)
        title = Column(String, nullable=False)
        summary = Column(String)
        arxiv_comment = Column(String)
        published_date = Column(String)
        published_time = Column(String)
        updated_date = Column(String)
        updated_time = Column(String)
        timestamp = Column(DateTime, default=datetime.utcnow)

    class AuthorTable(Base):
        __tablename__ = 'AuthorTable'
        id = Column(String, primary_key=True)
        author = Column(String, nullable=False)
        paper_id = Column(String, ForeignKey('PaperTable.id'))

    class TagTable(Base):
        __tablename__ = 'TagTable'
        id = Column(String, primary_key=True)
        paper_tag = Column(String, nullable=False)
        paper_id = Column(String, ForeignKey('PaperTable.id'))

    Base.metadata.create_all(bind=engine)

def obtain_new_articles():
    new_articles = arxiv.query(search_query, max_results=5000,
                               sort_by="lastUpdatedDate", sort_order="descending")
    new_articles_df = pd.DataFrame.from_dict(new_articles)
    ordered_new_articles = new_articles_df.reindex(
                            columns=['title', 'author', 'authors', 'id', 'arxiv_comment',
                                     'arxiv_primary_category', 'published', 'summary',
                                     'tags', 'updated'])
    return ordered_new_articles

def extract_column(df_file):
    # Extracting information
    df_file['published_date'] = df_file['published'].str.extract('(\d\d\d\d-\d\d-\d\d)', expand=True)
    df_file['published_time'] = df_file['published'].str.extract('(\d\d:\d\d:\d\d)', expand=True)
    df_file['updated_date'] = df_file['updated'].str.extract('(\d\d\d\d-\d\d-\d\d)', expand=True)
    df_file['updated_time'] = df_file['updated'].str.extract('(\d\d:\d\d:\d\d)', expand=True)
    df_file['unique_id'] = df_file['id'].str.extract('(\d\d\d\d\.\d\d\d\d\d)', expand=True)
    df_file['version_number'] = df_file['id'].str.extract('(\d$)', expand=True)

    final_df = df_file[['unique_id', 'version_number', 'author', 
                        'title', 'summary', 'arxiv_comment', 
                        'published_date', 'published_time', 
                        'updated_date', 'updated_time', 
                        'tags', 'authors']]

    final_df = final_df.drop_duplicates(subset='unique_id', keep='first', inplace=False)
    return final_df

def check_existence(session, PaperTable, id_string):
    query = session.query(PaperTable).filter(PaperTable.id==id_string)
    
    if query.one_or_none():
        return True
    else:
        return False

def update_existing_articles(session, PaperTable, id_string, df):
    update(PaperTable).where(PaperTable.id == id_string).\
            values(version = df.iloc[0, 1], \
                   summary = df.iloc[0, 4], \
                   arxiv_comment = df.iloc[0, 5], \
                   updated_date = df.iloc[0, 8], \
                   updated_time = df.iloc[0, 9], \
                   timestamp=datetime.utcnow)

    session.commit()

def extract_category(df):
    tags = df.iloc[0, 10]
    tags_list = []
    for i in range(len(tags)):
        tags_list.append([tags[i]['term']])
    return tags_list

def insert_new_articles(session, PaperTable, AuthorTable, TagTable, id_string, df):
    # Adding records into Paper Table
    paper_row = PaperTable(id = id_string, 
                           version = df.iloc[0, 1], 
                           author = df.iloc[0, 2], 
                           title = df.iloc[0, 3], 
                           summary = df.iloc[0, 4], 
                           arxiv_comment = df.iloc[0, 5], 
                           published_date = df.iloc[0, 6], 
                           published_time = df.iloc[0, 7], 
                           updated_date = df.iloc[0, 8], 
                           updated_time = df.iloc[0, 9], 
                           timestamp = Column(DateTime, default=datetime.utcnow))
    session.add(paper_row)
    session.commit()
    
    # Adding records into Author Table
    authors = df.iloc[0, 11]
    for i in range(len(authors)):
        id_str = id_string + "-" + str(i)
        author_row = AuthorTable(id = id_str, 
                                 author = authors[i], 
                                 paper_id = id_string)
        session.add(author_row)
    session.commit()
    
    # Adding records into Tag Table
    tags_list = extract_category(df)
    for i in range(tags_list):
        id_str = id_string + "-" + str(i)
        tag_row = TagTable(id = id_str, 
                           paper_tag = tags_list[i], 
                           paper_id = id_string)
        session.add(tag_row)
    session.commit()


def main():
    db_url = {'drivername': 'postgres',
              'username': 'postgres',
              'password': 'postgres',
              'host': '127.0.0.1',
              'port': 5432}
    
    # Initiate engine
    engine = create_engine(URL(**db_url))
    
    metadata = MetaData()

    Base = automap_base(metadata=metadata)
    
    # reflect the tables
    Base.prepare(engine, reflect=True)

    # Mapped classes with names matching that of the table name
    PaperTable = Base.classes.PaperTable
    AuthorTable = Base.classes.AuthorTable
    TagTable = Base.classes.TagTable
    
    Session = sessionmaker(bind=engine)
    session = Session()
    
    # article_df = extract_column(obtain_new_articles())
    article_df = extract_column(pd.read_json("/Users/EmilyWang/Desktop/ops/paper-collector/DeepLearningArticles.json", orient='index'))
    article_id_lst = article_df['unique_id'].values
    
    for id_string in article_df['unique_id'].values:
        row_df = article_df[article_df['unique_id']==id_string]
        
        if check_existence(session, PaperTable, id_string):
            exist_version = session.query(PaperTable.version).\
                                    filter(PaperTable.id==id_string)
            if int(article_df[article_df['unique_id']==id_string]['version_number']) \
                    > exist_version:
                update_existing_articles(session, PaperTable, id_string, row_df)
                
        else:
            insert_new_articles(session, PaperTable, AuthorTable, TagTable, id_string, row_df)

main()

CompileError: Cannot compile Column object until its 'name' is assigned.

In [None]:
# Discarded
def load_session():
    # Local db url
    db_url = {'drivername': 'postgres',
              'username': 'postgres',
              'password': 'postgres',
              'host': '127.0.0.1',
              'port': 5432}

    # Initiate engine
    engine = create_engine(URL(**db_url))
    metadata = MetaData()
    metadata.reflect(bind=engine)
    PaperTable = metadata.tables['PaperTable']
    AuthorTable = metadata.tables['AuthorTable']
    TagTable = metadata.tables['TagTable']
    
    # Create session
    Session = sessionmaker(bind=engine)
    session = Session()
    return session, PaperTable, AuthorTable, TagTable