# Analysis of numeric coverage in CSKG

This notebook:
* computes numbers of statements in CSKG that contain numeric information, either exact or approximate; either in English or as digits
* extracts various collections and stores them as KGTK files

## Analysis

In [23]:
import gzip
from tqdm import tqdm

In [7]:
graph='../output/cskg.tsv.gz'

In [1]:
units = [
    "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
    "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
    "sixteen", "seventeen", "eighteen", "nineteen",
]

tens = ["twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]

scales = ["hundred", "thousand", "million", "billion", "trillion"]

In [2]:
keywords=set(units+tens+scales)

In [3]:
len(keywords)

33

In [20]:
def num_there(s, kw):
    return any(i.isdigit() for i in s) or any(k in s for k in kw)

In [15]:
def line2edge(l):
    return l.decode().strip().split('\t')

In [29]:
only_n1=[]
only_n2=[]
both=[]
with gzip.open(graph, 'r') as f:
    header=line2edge(next(f))
    for line in tqdm(f, total=6001531):
        edge=line2edge(line)
        n1_label=edge[4]
        n2_label=edge[5]
        
        num_n1=num_there(n1_label, keywords)
        num_n2=num_there(n2_label, keywords)

        if num_n1 and num_n2:
            both.append(line)
        elif num_n1:
            only_n1.append(line)
        elif num_n2:
            only_n2.append(line)


  0%|          | 0/6001531 [00:00<?, ?it/s][A
  0%|          | 11337/6001531 [00:00<00:52, 113360.92it/s][A
  0%|          | 22227/6001531 [00:00<00:53, 111984.03it/s][A
  1%|          | 32861/6001531 [00:00<00:54, 110226.23it/s][A
  1%|          | 44283/6001531 [00:00<00:53, 111390.80it/s][A
  1%|          | 55611/6001531 [00:00<00:53, 111948.96it/s][A
  1%|          | 67103/6001531 [00:00<00:52, 112823.33it/s][A
  1%|▏         | 78160/6001531 [00:00<00:52, 112135.99it/s][A
  1%|▏         | 89648/6001531 [00:00<00:52, 112945.05it/s][A
  2%|▏         | 100913/6001531 [00:00<00:52, 112848.23it/s][A
  2%|▏         | 111804/6001531 [00:01<00:52, 111636.07it/s][A
  2%|▏         | 123051/6001531 [00:01<00:52, 111881.95it/s][A
  2%|▏         | 134020/6001531 [00:01<00:52, 111131.36it/s][A
  2%|▏         | 145315/6001531 [00:01<00:52, 111669.46it/s][A
  3%|▎         | 156677/6001531 [00:01<00:52, 112247.27it/s][A
  3%|▎         | 168199/6001531 [00:01<00:51, 113121.14it/s][A


In [37]:
print('BOTH HEAD AND TAIL CONTAIN NUMBERS:', len(both))
print('ONLY HEAD CONTAINS NUMBER:', len(only_n1)) 
print('ONLY TAIL CONTAINS NUMBER:',  len(only_n2))

BOTH HEAD AND TAIL CONTAIN NUMBERS: 78928
ONLY HEAD CONTAINS NUMBER: 174269
ONLY TAIL CONTAINS NUMBER: 144686


In [41]:
def write_to_file(filename, data, header):
    with open(filename, 'w') as w:
        w.write('\t'.join(header) + '\n')
        for line in data:
            w.write(line.decode())

In [42]:
write_to_file('../tmp/numeric_n1.tsv', only_n1, header)
write_to_file('../tmp/numeric_n2.tsv', only_n2, header)
write_to_file('../tmp/numeric_both.tsv', both, header)