In [1]:
#import transformers
#import datasets

import gzip
import json

from pathlib import Path
import os
import math

import itertools as itr



from tqdm.auto import tqdm

import numpy as np
import pandas as pd
import sklearn.preprocessing as skl_prep
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [2]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every expression statement in a cell rather than only
# the last one; later cells rely on this to show values from inside loops.
InteractiveShell.ast_node_interactivity = "all"

In [9]:
def count_encode_errors_in_file(infile_name, size_hint=1):
    """Scan a gzipped JSON-lines file and tally records with unusable full text.

    Each line is parsed as JSON. A record is classified as:

    * "cid"   -- its stripped ``fulltext`` starts with ``'(cid'``;
    * "empty" -- its stripped ``fulltext`` is shorter than 10 characters, or
      the record has no ``fulltext`` key at all.

    Lines that are not valid JSON are skipped without being counted.

    Parameters
    ----------
    infile_name : str or path-like
        Path of the gzip-compressed input file (one JSON document per line).
    size_hint : int, default 1
        Used as the ``total`` of the three progress bars.

    Returns
    -------
    dict
        ``cid_count`` / ``empty_count``: number of records in each class.
        ``cid_ids`` / ``empty_ids``: the full parsed records (not just their
        ids) in file order. Records that lack a ``fulltext`` key are counted
        in ``empty_count`` but are NOT added to ``empty_ids``, so every entry
        of ``empty_ids`` is guaranteed to carry a ``fulltext`` value.
    """
    res = dict(
        cid_count=0,
        empty_count=0,
        cid_ids=[],
        empty_ids=[],
    )

    with gzip.open(infile_name) as infile, \
         tqdm(total=size_hint, desc="CID") as cid_bar, \
         tqdm(total=size_hint, desc="Empty") as empty_bar:

        for line in tqdm(infile, total=size_hint):
            try:
                parsed_item = json.loads(line)
            except json.JSONDecodeError:
                # Best effort: malformed lines are ignored.
                continue

            if 'fulltext' not in parsed_item:
                res['empty_count'] += 1
                # Keep the bar in step with the counter (it was never
                # advanced for this case before).
                empty_bar.update(1)
                continue

            fulltext = parsed_item['fulltext'].strip()
            if fulltext.startswith('(cid'):
                res['cid_count'] += 1
                # tqdm.update() takes an increment, not the running total.
                cid_bar.update(1)
                res['cid_ids'].append(parsed_item)
            elif len(fulltext) < 10:
                res['empty_count'] += 1
                empty_bar.update(1)
                res['empty_ids'].append(parsed_item)
    return res

In [7]:
# Sizes per dataset split ('train'/'test') and category granularity
# ('major'/'minor'); each value is forwarded as `size_hint` below
# (presumably the record count of the matching file -- confirm).
size_dict = dict(
    train=dict(major=31753, minor=111860),
    test=dict(major=8000, minor=27967),
)

# One keyword-argument dict per input file, ordered by split, then granularity.
file_pairs = [
    {
        "infile_name": f"./data/fulltext/{ds_type}_{m_type}_cats_full.json.gz",
        "size_hint": size,
    }
    for ds_type, m_dict in size_dict.items()
    for m_type, size in m_dict.items()
]

file_pairs

[{'infile_name': './data/fulltext/train_major_cats_full.json.gz',
  'size_hint': 31753},
 {'infile_name': './data/fulltext/train_minor_cats_full.json.gz',
  'size_hint': 111860},
 {'infile_name': './data/fulltext/test_major_cats_full.json.gz',
  'size_hint': 8000},
 {'infile_name': './data/fulltext/test_minor_cats_full.json.gz',
  'size_hint': 27967}]

In [30]:
# Scan every input file. `res` is rebound on each pass, so after the loop it
# only holds the last file's result; the per-file counts are therefore also
# stored in `error_counts` and printed so that no file's tally is lost.
error_counts = {}
for file_pair_args in file_pairs:
    print(file_pair_args['infile_name'])
    res = count_encode_errors_in_file(**file_pair_args)
    error_counts[file_pair_args['infile_name']] = {
        'cid_count': res['cid_count'],
        'empty_count': res['empty_count'],
    }
    print(f"\tcid_count={res['cid_count']} empty_count={res['empty_count']}")

./data/fulltext/train_major_cats_full.json.gz


CID:   0%|          | 0/31753 [00:00<?, ?it/s]

Empty:   0%|          | 0/31753 [00:00<?, ?it/s]

  0%|          | 0/31753 [00:00<?, ?it/s]

./data/fulltext/train_minor_cats_full.json.gz


CID:   0%|          | 0/111860 [00:00<?, ?it/s]

Empty:   0%|          | 0/111860 [00:00<?, ?it/s]

  0%|          | 0/111860 [00:00<?, ?it/s]

./data/fulltext/test_major_cats_full.json.gz


CID:   0%|          | 0/8000 [00:00<?, ?it/s]

Empty:   0%|          | 0/8000 [00:00<?, ?it/s]

  0%|          | 0/8000 [00:00<?, ?it/s]

./data/fulltext/test_minor_cats_full.json.gz


CID:   0%|          | 0/27967 [00:00<?, ?it/s]

Empty:   0%|          | 0/27967 [00:00<?, ?it/s]

  0%|          | 0/27967 [00:00<?, ?it/s]

In [24]:
# Scan only the first configured file; `res` keeps its result for the
# inspection cells that follow. Slicing makes this a no-op on an empty list.
for file_pair_args in file_pairs[:1]:
    print(file_pair_args['infile_name'])
    res = count_encode_errors_in_file(**file_pair_args)

./data/fulltext/train_major_cats_full.json.gz


CID:   0%|          | 0/31753 [00:00<?, ?it/s]

Empty:   0%|          | 0/31753 [00:00<?, ?it/s]

  0%|          | 0/31753 [00:00<?, ?it/s]

In [20]:
# Show the records flagged as "empty" whose raw `fulltext` is nevertheless
# non-empty (e.g. it consists only of whitespace/control characters).
for q in res['empty_ids']:
    if len(q['fulltext']) > 0:
        # Work on a trimmed copy so the records stored in `res` are left
        # intact and re-running this cell gives the same result.
        p = {
            key: value
            for key, value in q.items()
            if key not in ('abstract', 'prime_category', 'major_category')
        }
        print(json.dumps(p, indent=4))
        # Bare expression: displayed as a repr so control characters are visible.
        p['fulltext']
    

{
    "paper_id": "1207.4145",
    "version": "1",
    "yymm": "1207",
    "created": "2012-07-11T14:55:26",
    "title": "Joint discovery of haplotype blocks and complex trait associations from SNP sequences",
    "abs_categories": "q-bio.GN cs.CE stat.ME",
    "fulltext": "\f\f\f\f\f\f\f"
}


'\x0c\x0c\x0c\x0c\x0c\x0c\x0c'

In [28]:
# Paper ids of every record whose full text starts with '(cid'.
[record['paper_id'] for record in res['cid_ids']]

['1102.1185',
 '0909.3340',
 '1907.07491',
 '1309.7889',
 '0803.3591',
 '1604.03591',
 '1005.1992',
 '1902.00053',
 '1908.05994',
 '1304.6232',
 '1210.5886',
 '1802.10495',
 '1702.06987',
 '1103.5660']

In [26]:
# Print a compact preview of each '(cid'-flagged record: bulky/label fields
# removed and the full text cut to its first 7*12 = 84 characters.
for q in res['cid_ids']:
    p = dict(q)  # shallow copy; the stored record is not modified
    if len(p['fulltext']) == 0:
        continue
    for hidden_key in ('abstract', 'prime_category', 'major_category'):
        p.pop(hidden_key, None)
    p['fulltext'] = p['fulltext'][:7 * 12]
    print(json.dumps(p, indent=4))

    

{
    "paper_id": "1102.1185",
    "version": "2",
    "yymm": "1102",
    "created": "2011-02-09T12:39:51",
    "title": "Delta-like singularity in the Radial Laplace Operator and the Status of the Radial Schrodinger Equation",
    "abs_categories": "math-ph math.MP quant-ph",
    "fulltext": "(cid:68)(cid:101)(cid:108)(cid:116)(cid:97)(cid:45)(cid:108)(cid:105)(cid:107)(cid:1"
}
{
    "paper_id": "0909.3340",
    "version": "2",
    "yymm": "0909",
    "created": "2010-06-22T15:05:27",
    "title": "Deconfined SU(2) vector fields at zero temperature",
    "abs_categories": "hep-lat hep-th",
    "fulltext": "(cid:0)(cid:2)(cid:3)\u0003\u0002(cid:6)\u0002(cid:2)(cid:7) (cid:8)(cid:9)\u0004(cid:11)\u0005 (cid:13)(cid:"
}
{
    "paper_id": "1907.07491",
    "version": "1",
    "yymm": "1907",
    "created": "2019-07-17T13:12:43",
    "title": "The cyclicality of loan loss provisions under three different accounting models: the United Kingdom, Spain, and Brazil",
    "abs_categories": "ec