# Data optimization

In [102]:
import json
import os

import numpy as np
import pandas as pd

## Reading data

In [103]:
# Load the JSON Lines file: one JSON object per line.
# Each line is parsed on its own and whitespace-only lines are skipped, so a
# blank or trailing empty line no longer produces invalid JSON (the previous
# approach joined the raw lines with commas into one big array string).
with open('data/basic_data.jsonl', "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]
# `data` is a list of parsed records; `df` is the tabular view of it.
df = pd.DataFrame(data)

In [104]:
# Display the loaded DataFrame (pandas truncates the rendered rows).
df

Unnamed: 0,cluster_id,key_concepts,num_recent_articles,cluster_category,growth_rating,x,y
0,84872,"[historical performance practice, Don Juan, Ba...",228,humanities,15.43,-90.2114,-71.3960
1,72062,"[Early Modern Art, John Singleton Copley, Scot...",239,humanities,3.22,-76.1376,-37.2588
2,62380,"[Malay gamelan music, folk music, Music Theory...",283,humanities,33.33,-160.4460,20.6144
3,65319,"[Rite of Spring, Indonesian piano music, piano...",242,humanities,3.11,-78.1228,-68.1877
4,72024,"[Mediterranean Marine Caves, Marine Caves, mar...",104,biology,74.75,-45.6533,178.6693
...,...,...,...,...,...,...,...
85638,6047,"[Tissue Engineering, cell culture, Cell Cultur...",686,materials science,52.10,320.7678,181.4553
85639,129,"[graphene oxide, Reduced Graphene, Functionali...",1982,materials science,33.59,230.4844,350.5407
85640,41756,"[OLED display, crystalline oxide semiconductor...",376,materials science,44.26,168.4628,392.2027
85641,213,"[chemical vapor deposition, CVD graphene films...",1651,materials science,19.63,98.1491,416.9048


## Counting Clusters

In [25]:
# Distinct category labels found in the raw records, in sorted order.
cats = sorted({record["cluster_category"] for record in data})

In [31]:
def cluster_name_to_index(cluster_name):
    """Translate a cluster category name into its integer code.

    The code is the position of the name in the fixed category table below:
    0 for 'biology' up to 10 for 'social science'.

    Returns -1 when ``cluster_name`` equals none of the known names.
    """
    known_names = (
        'biology',
        'chemistry',
        'computer science',
        'earth science',
        'engineering',
        'humanities',
        'materials science',
        'mathematics',
        'medicine',
        'physics',
        'social science',
    )
    # Compare by equality so any kind of input (even an unhashable one)
    # simply falls through to the -1 sentinel.
    for code, name in enumerate(known_names):
        if cluster_name == name:
            return code
    return -1

def cluster_index_to_name(cluster_index):
    """Translate an integer category code back into its category name.

    Codes 0 through 10 correspond to the entries of the fixed category table
    below, in order ('biology' is 0, 'social science' is 10).

    Returns an empty string when ``cluster_index`` equals none of the codes.
    """
    names_by_code = (
        'biology',
        'chemistry',
        'computer science',
        'earth science',
        'engineering',
        'humanities',
        'materials science',
        'mathematics',
        'medicine',
        'physics',
        'social science',
    )
    # Compare by equality instead of indexing the tuple, so negative or
    # non-integer input yields '' rather than wrapping around or raising.
    for code, name in enumerate(names_by_code):
        if cluster_index == code:
            return name
    return ''

### Counting key concepts

In [116]:
# Replace missing `key_concepts` entries with an empty list.
# The previous fill value was the *string* "[]"; iterating over it in the
# following cells yields the characters "[" and "]", which then get
# registered as key concepts. A real empty list contributes no keys.
missing_concepts = df["key_concepts"].isna()
df["key_concepts"] = pd.Series(
    [
        [] if is_missing else concepts
        for concepts, is_missing in zip(df["key_concepts"], missing_concepts)
    ],
    index=df.index,
    dtype=object,  # keep each cell as a Python list
)

In [117]:
# Every distinct key concept across all rows, in sorted order.
unique_keys = {concept for concepts in df["key_concepts"] for concept in concepts}
keys = sorted(unique_keys)

In [122]:
# Map each key concept to its position in the sorted `keys` list.
keys_dict = dict(zip(keys, range(len(keys))))
keys_dict

{'12-20210031': 0,
 '1790-1920': 1,
 '19-9 levels': 2,
 '1‒3 November': 3,
 '2012-13 French pill': 4,
 '2012-2013 Central African': 5,
 '2018-2020 Period': 6,
 '25-6981': 7,
 '2950:1974 Brown coal': 8,
 '4,4′': 9,
 'A-Ring Substituted Anthraquinone': 10,
 'A-Site Cation Ordering': 11,
 'A-Site Columnar-Ordered Quadruple': 12,
 'A-induced Liver Injury': 13,
 'A-induced autoimmune hepatitis': 14,
 'A-kinase anchor protein': 15,
 'A-kinase anchoring protein': 16,
 'A-kinase interacting protein': 17,
 'A-level geology': 18,
 'A-level scenic spots': 19,
 'A-numerical radius': 20,
 'A-sandwich radome': 21,
 'A-separable modules': 22,
 'A-share listed companies': 23,
 'A-site non-stoichiometry': 24,
 'A-site spinel': 25,
 'A-type PACs': 26,
 'A-type carbonate': 27,
 'A-type cost factors': 28,
 'A-type lamins': 29,
 'A.chinensis adults': 30,
 'AAA': 31,
 'AAA Annual Meeting': 32,
 'AAA formation': 33,
 'AAA growth': 34,
 'AAA growth rate': 35,
 'AAA patients': 36,
 'AAA repair': 37,
 'AAA rupt

In [118]:
# Number of distinct key concepts.
len(keys)

274422

## Optimization

In [128]:
# Pull each column out of the DataFrame as a plain Python list.
X, Y, ID, SIZE, GROWTH = (
    df[column].to_list()
    for column in ("x", "y", "cluster_id", "num_recent_articles", "growth_rating")
)
# Category names are converted to their integer codes (-1 for unknown names).
CATEGORY = list(map(cluster_name_to_index, df["cluster_category"].to_list()))

In [129]:
# For every row, replace each key concept by its integer index in `keys_dict`.
KEYS = [
    [keys_dict[concept] for concept in concepts]
    for concepts in df["key_concepts"].to_list()
]

### Export to CSV and TSV

In [130]:
# Assemble the encoded columns and write them as a comma-separated file
# without the pandas row index.
processed0_columns = {
    "cluster_id": ID,
    "x": X,
    "y": Y,
    "num_recent_articles": SIZE,
    "cluster_category": CATEGORY,
    "growth_rating": GROWTH,
    "key_concepts": KEYS,
}
pd.DataFrame(processed0_columns).to_csv("data/processed0_data.csv", index=False)

In [131]:
# Assemble the encoded columns and write them as a tab-separated file
# without the pandas row index.
processed0_frame = pd.DataFrame(dict(
    cluster_id=ID,
    x=X,
    y=Y,
    num_recent_articles=SIZE,
    cluster_category=CATEGORY,
    growth_rating=GROWTH,
    key_concepts=KEYS,
))
processed0_frame.to_csv("data/processed0_data.tsv", sep="\t", index=False)

In [132]:
!ls -la ./data

total 90044
drwxrwxr-x 2 szymon szymon     4096 lip  5 18:46 .
drwxrwxr-x 3 szymon szymon     4096 lip  5 18:49 ..
-rw-r--r-- 1 szymon szymon 23155895 lip  5 09:33 basic_data_fixed.jsonl
-rw-r--r-- 1 szymon szymon 23070251 lip  5 09:25 basic_data.jsonl
-rw-rw-r-- 1 szymon szymon 27062067 lip  5 09:25 data_obj.json
-rw-rw-r-- 1 szymon szymon  6582948 lip  5 18:57 processed0_data.csv
-rw-rw-r-- 1 szymon szymon  6411666 lip  5 18:57 processed0_data.tsv
-rw-rw-r-- 1 szymon szymon  5899376 lip  5 18:49 processed1_data.tsv


In [133]:
# Shrink the TSV by removing every "[", "]" and space character from the
# file text, so a serialized list such as "[1, 2, 3]" becomes "1,2,3".
# Both files are opened with context managers so the handles are closed
# deterministically (the bare open() calls used before never closed them).
with open("data/processed0_data.tsv", "r", encoding="utf-8") as source_file:
    processed0_text = source_file.read()

processed1 = (
    processed0_text
    .replace("[", "")
    .replace("]", "")
    .replace(" ", "")
)

with open("data/processed1_data.tsv", "w", encoding="utf-8") as target_file:
    written_chars = target_file.write(processed1)

# Keep the character count as the cell output, as the bare write() call did.
written_chars

5899376

In [134]:
!ls -la ./data

total 90044
drwxrwxr-x 2 szymon szymon     4096 lip  5 18:46 .
drwxrwxr-x 3 szymon szymon     4096 lip  5 18:49 ..
-rw-r--r-- 1 szymon szymon 23155895 lip  5 09:33 basic_data_fixed.jsonl
-rw-r--r-- 1 szymon szymon 23070251 lip  5 09:25 basic_data.jsonl
-rw-rw-r-- 1 szymon szymon 27062067 lip  5 09:25 data_obj.json
-rw-rw-r-- 1 szymon szymon  6582948 lip  5 18:57 processed0_data.csv
-rw-rw-r-- 1 szymon szymon  6411666 lip  5 18:57 processed0_data.tsv
-rw-rw-r-- 1 szymon szymon  5899376 lip  5 18:57 processed1_data.tsv


In [139]:
# Lookup table from integer index to key concept text, written tab-separated.
key_indices = [keys_dict[concept] for concept in keys]
df_keys = pd.DataFrame({"index": key_indices, "key": keys})
df_keys.to_csv("data/processed0_keys.csv", sep="\t", index=False)

In [140]:
!ls -la ./data

total 97428
drwxrwxr-x 2 szymon szymon     4096 lip  5 18:59 .
drwxrwxr-x 3 szymon szymon     4096 lip  5 18:49 ..
-rw-r--r-- 1 szymon szymon 23155895 lip  5 09:33 basic_data_fixed.jsonl
-rw-r--r-- 1 szymon szymon 23070251 lip  5 09:25 basic_data.jsonl
-rw-rw-r-- 1 szymon szymon 27062067 lip  5 09:25 data_obj.json
-rw-rw-r-- 1 szymon szymon  6582948 lip  5 18:57 processed0_data.csv
-rw-rw-r-- 1 szymon szymon  6411666 lip  5 18:57 processed0_data.tsv
-rw-rw-r-- 1 szymon szymon  7558356 lip  5 18:59 processed0_keys.csv
-rw-rw-r-- 1 szymon szymon  5899376 lip  5 18:57 processed1_data.tsv


In [141]:
# Combined size in bytes of the key lookup file and the compacted data file.
# The sizes are read from disk instead of being copied by hand from the
# directory listing, so the total stays correct when the files are regenerated.
os.path.getsize("data/processed0_keys.csv") + os.path.getsize("data/processed1_data.tsv")

13457732

In [142]:
# Lookup table from integer category code to category name, written tab-separated.
category_codes = list(map(cluster_name_to_index, cats))
df_cats = pd.DataFrame({"index": category_codes, "category": cats})
df_cats.to_csv("data/processed0_categories.csv", sep="\t", index=False)