### Create a dict from magv1 papers

In [1]:
import json
import sys
import time

import pandas as pd
from tqdm import tqdm

In [None]:
# Make ID-title dict from magv1 papers 
def json_to_dict(files):
    """Build a paper-id -> title lookup from JSON-lines files.

    Every non-blank line of every file is parsed as a JSON object and its
    'id' and 'title' fields are stored in the result. When the same id
    appears more than once, the title read last wins.

    Args:
        files: iterable of paths to JSON-lines text files.

    Returns:
        dict mapping each paper id to its title.

    Raises:
        KeyError: if a record has no 'id' or no 'title' field.
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    id_dict = dict()
    for file in files:
        print(file)
        start_time = time.time()

        with open(file) as f:
            # Iterate the file object directly so the whole file is never
            # held in memory next to the growing dictionary.
            for line in f:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline at EOF).
                    continue
                jsondata = json.loads(line)
                id_dict[jsondata['id']] = jsondata['title']

        # Measured once per file, after reading, so the value is defined
        # (and not stale) even when the file contains no records.
        seconds_elapsed = time.time() - start_time
        print("Added papers from file to dict in {:.2f} seconds.".format(seconds_elapsed))
        print("Total of " + str(len(id_dict)) + " lines in the dictionary after running")

    return id_dict

# NOTE(review): `files` is not defined in any cell above this one; it looks like
# this call relies on the list built in the "Replace the ids" cell further down.
# Confirm, and define `files` before this call so the notebook runs top-to-bottom.
id_dict = json_to_dict(files)  

/tmp/data/magone/mag_papers_0.txt
Added papers from file to dict in 28.02 seconds.
Total of 1000000 lines in the dictionary after running
/tmp/data/magone/mag_papers_1.txt
Added papers from file to dict in 29.06 seconds.
Total of 2000000 lines in the dictionary after running
/tmp/data/magone/mag_papers_2.txt
Added papers from file to dict in 28.58 seconds.
Total of 3000000 lines in the dictionary after running
/tmp/data/magone/mag_papers_3.txt
Added papers from file to dict in 28.27 seconds.
Total of 4000000 lines in the dictionary after running
/tmp/data/magone/mag_papers_4.txt
Added papers from file to dict in 28.55 seconds.
Total of 5000000 lines in the dictionary after running
/tmp/data/magone/mag_papers_5.txt
Added papers from file to dict in 28.69 seconds.
Total of 6000000 lines in the dictionary after running
/tmp/data/magone/mag_papers_6.txt
Added papers from file to dict in 28.16 seconds.
Total of 7000000 lines in the dictionary after running
/tmp/data/magone/mag_papers_7.txt


In [12]:
# Replace the ids with the titles and write the file back
#
# For every input file mag_papers_<n>.txt this writes mag_papers_<n>_clean.txt
# in which each entry of a paper's 'references' list is replaced by the title
# found in `id_dict` (paper id -> title, built by json_to_dict above).
# Reference ids missing from `id_dict` are kept unchanged and counted as key errors.

data_dir = '/tmp/data/magone/'
files = [data_dir + 'mag_papers_{}.txt'.format(i) for i in range(167)]

n_key_errors = 0
n_no_citations = 0

# enumerate() supplies the output file number; no separate counter to maintain.
for file_num, file in enumerate(files):
    file_out = data_dir + 'mag_papers_{}_clean.txt'.format(file_num)
    n_file_key_errors = 0
    n_file_no_citations = 0

    print("Creating {}...".format(file_out))
    with open(file) as f, open(file_out, 'w') as f1:
        # Stream line by line rather than loading the whole file.
        for line in f:
            if not line.strip():
                continue
            jsondata = json.loads(line)
            new_line = line
            if 'references' in jsondata:
                titled_refs = []
                for ref_id in jsondata['references']:
                    if ref_id in id_dict:
                        titled_refs.append(id_dict[ref_id])
                    else:
                        # Unknown id: leave it in place and record the miss.
                        titled_refs.append(ref_id)
                        n_file_key_errors += 1
                # Substitute on the parsed record and re-serialise. Textual
                # replacement of the id inside the raw line would nest quotes
                # (producing invalid JSON) and could also hit the same id
                # where it appears outside the references list.
                jsondata['references'] = titled_refs
                new_line = json.dumps(jsondata) + '\n'
            else: # this paper doesn't cite any other papers
                n_file_no_citations += 1
            f1.write(new_line)

    print("\tKey Errors: {}".format(n_file_key_errors))
    print("\tNo Citations: {}".format(n_file_no_citations))

    n_key_errors += n_file_key_errors
    n_no_citations += n_file_no_citations

# Report the running totals, not the counters of the last file processed.
print("\nDONE!\nTotal Key Errors: {}".format(n_key_errors))
print("Total No Citations: {}".format(n_no_citations))

/u/skokada/data/magone/mag_papers_0_clean.txt


FileNotFoundError: [Errno 2] No such file or directory: '/u/skokada/data/magone/mag_papers_0_clean.txt'

### SQL

In [None]:
import sqlite3

# Database location: ':memory:' keeps the database in RAM only; a file name
# such as 'example.db' can be used instead to store it on disk.
db_filename = ':memory:'

# Open the connection and create the cursor used by the SQL cells below.
conn = sqlite3.connect(database=db_filename)
c = conn.cursor()

In [None]:
# Remove any previous copy of the table. IF EXISTS keeps this cell from
# failing on a fresh database where `mag` has not been created yet.
c.execute('''DROP TABLE IF EXISTS mag''')

In [None]:
# Create the mag table with three columns: id, title and refs.
create_mag_table_sql = '''CREATE TABLE mag
         (id varchar(15), title text, refs text)'''
c.execute(create_mag_table_sql)

In [None]:
# Read the JSON-lines files into one dataframe and insert every paper into `mag`.
# Uses the cursor `c` and connection `conn` opened in the sqlite3 cell above.
# TODO Update this to an optimized df
files = ['/Users/timholdsworth/code/scaling-science/notebooks/data/v1/mag_1e3.txt']

# Read each file once and concatenate at the end, so every listed file ends up
# in the frame (previously only the last file was actually loaded).
frames = []
for file in files:
    frames.append(pd.read_json(file, lines=True))
big_frame = pd.concat(frames, ignore_index=True)
df = big_frame

n_failed_inserts = 0
for _, row in df.iterrows():
    refs = row['references']
    # sqlite3 cannot bind a Python list, so the reference list is stored as
    # JSON text; papers without references (NaN in the frame) are stored as NULL.
    refs_text = json.dumps(refs) if isinstance(refs, (list, tuple)) else None
    try:
        c.execute("insert into mag values (?, ?, ?)", [row['id'], row['title'], refs_text])
    except sqlite3.Error as err:
        # Report the real database error instead of guessing at its cause.
        n_failed_inserts += 1
        print("Could not insert paper {}: {}".format(row['id'], err))

conn.commit()
print("Rows that failed to insert: {}".format(n_failed_inserts))

# Close db connection
# 36 characters long for id field


In [None]:
# Fetch every row currently in the mag table and display the result.
all_rows = c.execute('SELECT * FROM mag').fetchall()
all_rows

### Creating the MAGv1 with titles in references using Pandas

In [None]:
import pandas as pd
import math
import dask.dataframe as dd
import numpy as np
from numpy import nan


In [None]:
# For importing everything into one big dataframe
start_time = time.time()
paper_files = ['/Users/timholdsworth/code/scaling-science/notebooks/data/v1/mag_papers_8/mag_papers_100k_165.txt']

# Iterate over paper_files (the list defined in this cell) rather than a
# `files` variable left over from another cell, and keep every file that is
# read: the frames are collected and concatenated once at the end.
frames = []
for file in paper_files:
    frames.append(pd.read_json(file, lines=True))
big_frame = pd.concat(frames, ignore_index=True)
df = big_frame

end_time = time.time()
seconds_elapsed = end_time-start_time
print("Query completed in {:.2f} seconds.".format(seconds_elapsed))

In [None]:
# Index the frame by paper id. drop=False keeps 'id' available as a regular
# column as well, so later cells that filter on df['id'] keep working, the
# column positions are not shifted, and this cell can be re-run safely.
df = df.set_index('id', drop=False)
df.head()

In [None]:
# Write one JSON record per line for every paper in `df`:
#   {"id": ..., "title": ...}               when no cited paper could be resolved
#   {"id": ..., "title": ..., "refs": [..]} with the titles of the cited papers found in `df`
file_out = "/Users/timholdsworth/code/scaling-science/notebooks/data/v1/mag_papers_clean_0.txt"
start_time = time.time()

# Work whether 'id' is a regular column or has been moved into the index.
papers = df if 'id' in df.columns else df.reset_index()

# Build the id -> title lookup once instead of scanning the whole frame for
# every reference. setdefault keeps the first title seen for a duplicated id.
title_by_id = {}
for known_id, known_title in zip(papers['id'], papers['title']):
    title_by_id.setdefault(known_id, known_title)

with open(file_out, "w") as f:
    # Columns are addressed by name rather than by position, so the result
    # does not depend on the column order produced by read_json.
    for paper_id, title, refs_list in zip(papers['id'], papers['title'], papers['references']):
        record = {"id": paper_id, "title": title}

        # A missing reference list shows up as NaN (a float), not as a list.
        if isinstance(refs_list, (list, tuple)):
            # Once/if all the ids are in the database this filter can be removed
            refs_titled = [title_by_id[ref] for ref in refs_list if ref in title_by_id]
            if refs_titled:
                record["refs"] = refs_titled

        # json.dumps + newline yields valid JSON-lines output; str(dict) wrote
        # Python reprs with no separator between records.
        f.write(json.dumps(record) + "\n")

end_time = time.time()
seconds_elapsed = end_time-start_time
minutes_elapsed = (end_time-start_time)/60
print("Query completed in {:.2f} seconds.".format(seconds_elapsed))

### SQLite

In [None]:
# Import the data from the json
# Only the first line of the file is read; it is parsed as one JSON record
# and that record's title is printed.
with open('/Users/timholdsworth/code/scaling-science/notebooks/data/v1/mag_1e3.txt') as f:
    data = f.readline()

jsondata = json.loads(data)
print(jsondata['title'])

### Dask - Task Scheduler

In [None]:
import dask.dataframe as dd
# NOTE(review): `file` is not assigned in this cell; it is whatever path the
# last `for file in ...` loop of a previously executed cell left behind.
# Confirm which file is intended and set it explicitly.
# This rebinds `df` to the result of dd.read_json for the cells below.
df = dd.read_json(file, lines=True)
df.head()

In [None]:
# Takes in a given row (paper), returns the data to be written to the file

def make_paper(row):
    """Return the fields of one paper that are written to the output file.

    Args:
        row: one dataframe row, as passed by DataFrame.apply(..., axis=1);
            it is read by column name ('id', 'title', 'references').

    Returns:
        dict with the keys "id", "title" and "refs".
    """
    return {"id": row["id"],
            "title": row["title"],
            "refs": row["references"]}

# Pass the function itself (not the result of calling it) so that apply()
# invokes it once per row.
# NOTE(review): if `df` is the dask frame from the cell above, the result is
# lazy and dask may warn that no `meta` was given — confirm the intended usage.
res = df.apply(make_paper, axis=1)
res


In [None]:
# sys.getsizeof(id_dict) alone covers only the dict's own table, not the key
# and value objects it refers to, so their sizes are added to the total.
id_dict_bytes = sys.getsizeof(id_dict) + sum(
    sys.getsizeof(key) + sys.getsizeof(value) for key, value in id_dict.items()
)
print ("The dict takes up GB:", id_dict_bytes/1000000000)