### Download Dataset

In [0]:
%sh wget https://originalstatic.aminer.cn/misc/dblp.v13.7z

--2022-05-14 06:40:42--  https://originalstatic.aminer.cn/misc/dblp.v13.7z
Resolving originalstatic.aminer.cn (originalstatic.aminer.cn)... 159.27.2.14
Connecting to originalstatic.aminer.cn (originalstatic.aminer.cn)|159.27.2.14|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2568255035 (2.4G) [application/x-7z-compressed]
Saving to: ‘dblp.v13.7z’

     0K .......... .......... .......... .......... ..........  0%  266K 2h37m
    50K .......... .......... .......... .......... ..........  0%  131K 3h57m
   100K .......... .......... .......... .......... ..........  0% 19.5M 2h39m
   150K .......... .......... .......... .......... ..........  0%  261K 2h39m
   200K .......... .......... .......... .......... ..........  0% 12.3M 2h8m
   250K .......... .......... .......... .......... ..........  0%  274K 2h12m
   300K .......... .......... .......... .......... ..........  0% 32.7M 1h53m
   350K .......... .......... .......... .......... ..........  0% 10.

Unzipping

In [0]:
%sh sudo apt-get install p7zip

Reading package lists...
Building dependency tree...
Reading state information...
Suggested packages:
  p7zip-full
The following NEW packages will be installed:
  p7zip
0 upgraded, 1 newly installed, 0 to remove and 24 not upgraded.
Need to get 358 kB of archives.
After this operation, 1,010 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu focal/universe amd64 p7zip amd64 16.02+dfsg-7build1 [358 kB]
debconf: delaying package configuration, since apt-utils is not installed
Fetched 358 kB in 1s (591 kB/s)
Selecting previously unselected package p7zip.
(Reading database ... 
(Reading database ... 5%
(Reading database ... 10%
(Reading database ... 15%
(Reading database ... 20%
(Reading database ... 25%
(Reading database ... 30%
(Reading database ... 35%
(Reading database ... 40%
(Reading database ... 45%
(Reading database ... 50%
(Reading database ... 55%
(Reading database ... 60%
(Reading database ... 65%
(Reading database ... 70%
(Reading database ... 75%


### Decompress Data

In [0]:
%sh p7zip -d dblp.v13.7z


7-Zip (a) [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=C.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU E5-2686 v4 @ 2.30GHz (406F1),ASM,AES-NI)

Scanning the drive for archives:
1 file, 2568255035 bytes (2450 MiB)

Extracting archive: dblp.v13.7z
--
Path = dblp.v13.7z
Type = 7z
Physical Size = 2568255035
Headers Size = 130
Method = LZMA2:24
Solid = -
Blocks = 1

Everything is Ok

Size:       17352640799
Compressed: 2568255035


Uncomment to count lines (takes 5-6 minutes). The count is 409,129,302.

In [0]:
#%sh wc -l dblpv13.json 

### Preprocess and Split

Remove NumberInt(...) and split into files with 250,000 objects per file

In [0]:
%pip install tqdm

Python interpreter will be restarted.
Collecting tqdm
  Downloading tqdm-4.64.0-py2.py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.64.0
Python interpreter will be restarted.


In [0]:
import json
import re
import time
from tqdm import tqdm
import logging

# configuration
object_threshold = 250_000
validate_json = True
log_level = logging.INFO

# constants
file_path = "dblpv13.json"
file_lines = 409_129_302

logging.basicConfig(level=log_level)
logger = logging.getLogger(__name__)

replace_pattern = re.compile(r"NumberInt\(([0-9]+)\)")
def correct_text(text):
    return replace_pattern.sub(r"\1", text)

def is_object_start(line):
    return line == "{\n"

def is_object_end(line):
    return line == "},\n" or line == "}\n"

def write_objects(file_name, objects):
    with open(file_name, "w", encoding="utf-8") as out_file:
        logger.info(f"Wrote {len(objects)} json objects to {file_name}")
        out_file.write("[\n" + ',\n'.join(objects) + "\n]")
        
def check_json(text):
    try:
        json.loads(text)
    except:
        print(text)
        raise
    

with open(file_path, "r", encoding="utf-8") as f:
    objects = []
    current_object = []
    count = 0
    
    # skip first line
    next(f)
    
    for line in tqdm(f, total = file_lines - 1):
        assert not is_object_start(line) or len(current_object) == 0

        if is_object_end(line):
            assert len(current_object) > 0
            
            current_object.append("}")
            
            raw_json_object = "".join(current_object)
            
            # check if object is parsed
            if validate_json:
                check_json(raw_json_object)
            
            objects.append(raw_json_object)
            
            if len(objects) >= object_threshold:
                write_objects(f"dblpv13.{count}.json", objects)
                count += 1
                objects = []

            current_object = []

        else:
            current_object.append(correct_text(line))
        
    # write the remaining objects in our "buffer"
    if len(objects) > 0:
        write_objects(f"dblpv13.{count}.json", objects)


  0%|          | 0/409129301 [00:00<?, ?it/s]  0%|          | 13586/409129301 [00:00<50:11, 135854.30it/s]  0%|          | 27559/409129301 [00:00<49:21, 138130.87it/s]  0%|          | 41373/409129301 [00:00<50:56, 133824.64it/s]  0%|          | 55286/409129301 [00:00<50:10, 135875.22it/s]  0%|          | 70051/409129301 [00:00<48:40, 140062.06it/s]  0%|          | 84863/409129301 [00:00<47:44, 142775.30it/s]  0%|          | 99152/409129301 [00:00<48:36, 140251.41it/s]  0%|          | 113193/409129301 [00:00<49:07, 138776.78it/s]  0%|          | 127607/409129301 [00:00<48:32, 140417.42it/s]  0%|          | 142361/409129301 [00:01<47:48, 142585.97it/s]  0%|          | 156631/409129301 [00:01<51:53, 131354.83it/s]  0%|          | 170995/409129301 [00:01<50:32, 134855.30it/s]  0%|          | 185285/409129301 [00:01<49:41, 137181.45it/s]  0%|          | 199222/409129301 [00:01<49:27, 137816.69it/s]  0%|          | 213486/409129301 [00:01<48:56, 139236.84it/s]  0%|          

In [0]:
%sh ls -lh dblpv13.*.json

-rw-r--r-- 1 root root 655M May 14 07:09 dblpv13.0.json
-rw-r--r-- 1 root root 734M May 14 07:12 dblpv13.1.json
-rw-r--r-- 1 root root 759M May 14 07:34 dblpv13.10.json
-rw-r--r-- 1 root root 813M May 14 07:37 dblpv13.11.json
-rw-r--r-- 1 root root 679M May 14 07:39 dblpv13.12.json
-rw-r--r-- 1 root root 717M May 14 07:42 dblpv13.13.json
-rw-r--r-- 1 root root 706M May 14 07:44 dblpv13.14.json
-rw-r--r-- 1 root root 798M May 14 07:46 dblpv13.15.json
-rw-r--r-- 1 root root 818M May 14 07:49 dblpv13.16.json
-rw-r--r-- 1 root root 837M May 14 07:52 dblpv13.17.json
-rw-r--r-- 1 root root 818M May 14 07:54 dblpv13.18.json
-rw-r--r-- 1 root root 804M May 14 07:57 dblpv13.19.json
-rw-r--r-- 1 root root 744M May 14 07:14 dblpv13.2.json
-rw-r--r-- 1 root root 747M May 14 07:59 dblpv13.20.json
-rw-r--r-- 1 root root 211M May 14 08:00 dblpv13.21.json
-rw-r--r-- 1 root root 747M May 14 07:17 dblpv13.3.json
-rw-r--r-- 1 root root 733M May 14 07:19 dblpv13.4.json
-rw-r--r-- 1 root root 731M May 14 0

In [0]:
import fnmatch
import os

json_path = "file:/databricks/driver/"

for name in fnmatch.filter(map(lambda x: x.name, dbutils.fs.ls(json_path)), "dblpv13.*.json"):
    i = name.split(".")[-2]
    print(f"writing split {i}")
    
    df = spark.read.option("multiline", True).json(os.path.join(json_path, f'dblpv13.{i}.json'))
    df.write.mode('overwrite').parquet(f'dbfs:/user/dblpv13/dblpv13.{i}.parquet')

writing split 12
writing split 19
writing split 20
writing split 7
writing split 18
writing split 17
writing split 9
writing split 4
writing split 5
writing split 10
writing split 15
writing split 14
writing split 21
writing split 16
writing split 11
writing split 0
writing split 13
writing split 3
writing split 8
writing split 1
writing split 2
writing split 6
