Document similarity:
1. Compress and save the dictionary.
2. Compare dictionaries.
3. Simulated order to language.

Dictionary resemblance:
1. Does the combination appear?

How many times does the combination appear?

Compress is based on https://rosettacode.org/wiki/LZW_compression#Python

In [70]:
import json
import os
from random import shuffle
from tqdm import tqdm

In [68]:
def compress(uncompressed):
    """Compress a string to a list of output symbols."""
    
    last_char_id = 256
    compression_dictionary = {chr(i): i for i in range(last_char_id)}
    
    w = ""
    result = []
    for c in uncompressed:
        wc = w + c
        if wc in compression_dictionary:
            w = wc
        else:
            if not w in compression_dictionary:
                compression_dictionary[w] = last_char_id
                last_char_id += 1

            result.append(compression_dictionary[w])
            compression_dictionary[wc] = last_char_id
            last_char_id += 1
            w = c

    # Output the code for w.
    if w:
        if not w in compression_dictionary:
            compression_dictionary[w] = last_char_id
        result.append(compression_dictionary[w])
    return result,compression_dictionary

 
def decompress(compressed):
    """Decompress a list of output ks to a string."""
    from io import StringIO
 
    # Build the dictionary.
    dict_size = 256
    dictionary = dict((i, chr(i)) for i in range(dict_size))
    # in Python 3: dictionary = {i: chr(i) for i in range(dict_size)}
 
    # use StringIO, otherwise this becomes O(N^2)
    # due to string concatenation in a loop
    result = StringIO()
    w = chr(compressed.pop(0))
    result.write(w)
    for k in compressed:
        if k in dictionary:
            entry = dictionary[k]
        elif k == dict_size:
            entry = w + w[0]
        else:
            raise ValueError('Bad compressed k: %s' % k)
        result.write(entry)
 
        # Add w+entry[0] to the dictionary.
        dictionary[dict_size] = w + entry[0]
        dict_size += 1
 
        w = entry
    return result.getvalue()

In [17]:
# download the data
!kaggle datasets download -d allen-institute-for-ai/CORD-19-research-challenge

Downloading CORD-19-research-challenge.zip to /home/ido/data/idc/advanced ml/clustering_compression
100%|█████████████████████████████████████▉| 8.56G/8.56G [14:51<00:00, 11.1MB/s]
100%|██████████████████████████████████████| 8.56G/8.56G [14:51<00:00, 10.3MB/s]


In [18]:
!ls

 CORD-19-research-challenge.zip  'first hack.ipynb'   README.md


In [3]:
!unzip -q CORD-19-research-challenge.zip

In [8]:
with open(r'document_parses/pmc_json/PMC7683814.xml.json') as f:
    data = json.load(f)

In [22]:
def get_body_text(path:str):
    with open(path) as f:
        data = json.load(f)
        result =' '.join(body_part for body_part in (i['text'] for i in data['body_text']))
        return result

In [23]:
ex = get_body_text(r'document_parses/pmc_json/PMC7683814.xml.json')
ex

'Temperatures were measured with TAT-2000 and TAT-5000 TemporalScanner thermometers (Exergen Corp, Watertown, MA) and the Athena Elevated Temperature Detection System (Athena Security, Austin, TX). Exergen reports their instruments to be accurate within 0.2°C and 0.1°C, respectively.3,4 The Athena telethermographic system uses artificial intelligence to detect human faces by measuring the temperature of multiple points on the face relative to a blackbody temperature reference source.5 According to Athena Security, the system is accurate within 0.3°C.5 Systems were purchased from Athena Security. Accepting manufacturer specifications, detecting 0.2°C difference between devices (assuming standard deviation of ±0.3°C) required 26 measurements from each device. One subject was measured 104 times with 4 different TAT-2000s (26 measurements per device) and 104 times with 4 different TAT-5000s (26 measurements per device) by a single operator, and 13 times with the Athena system at a single l

In [31]:
folder = os.listdir(r'document_parses/pmc_json/')
folder[0]

'PMC7683814.xml.json'

In [60]:
def get_n_file_paths_from_foler(folder_path:str,n_files:int=10_000):
    folder = os.listdir(folder_path)
    shuffle(folder)
    result = [folder_path+file for file in folder[:n_files]]
    return result

In [72]:
files = get_n_file_paths_from_foler(r'document_parses/pmc_json/')
files[0].split('.')[0]

'document_parses/pmc_json/PMC7400710'

In [74]:
os.mkdir('data_for_compression')

In [75]:
for file in files:
    text = get_body_text(file)
    output_path = 'data_for_compression/' + file.split('pmc_json/')[-1].split('.')[0] +'.txt'
    with open(output_path,'w') as f:
        f.write(text)

Exception ignored in: <function tqdm.__del__ at 0x7f287d1d53a0>
Traceback (most recent call last):
  File "/home/ido/anaconda3/envs/aml3/lib/python3.9/site-packages/tqdm/std.py", line 1145, in __del__
    self.close()
  File "/home/ido/anaconda3/envs/aml3/lib/python3.9/site-packages/tqdm/notebook.py", line 283, in close
    self.disp(bar_style='danger', check_delay=False)
AttributeError: 'tqdm_notebook' object has no attribute 'disp'


In [56]:
os.mkdir('compresed')

In [62]:
files = get_n_file_paths_from_foler(r'data_for_compression/')
files[0].split('/')[1].split('.')[0]

'PMC7126579'

In [79]:
files = get_n_file_paths_from_foler(r'data_for_compression/')
write_to = r'compresed/'
for file in tqdm(files):
    with open(file,'r') as f:
        data = f.read()
    compressed,dict_representation = compress(data)
    file_output_name = file.split('/')[1].split('.')[0]
    file_name = write_to + file_output_name +'.json'
    with open(file_name, 'w') as fp:
        json.dump(dict_representation, fp)

100%|██████████| 10000/10000 [01:30<00:00, 110.40it/s]


In [None]:
def read_all_jsons_in_path(path:str):
    pass

def compare_single(first:dict,second:dict):
    pass

def compere_single_to_many_file(path:str):
    # read new file
    # list_of_dicts = read_all_jsons_in_path()
    # l =[compare_single(f1,f2) for f2 in list_of_dicts]
    # sort l by simelarty metrix
    # print file name of first file
    pass