In [4]:
import glob
import io
import itertools
import json
import re
import string
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import regex as re
import tensorflow as tf
import torch_geometric
import tqdm
from keras.layers import Embedding, TextVectorization
from nltk import word_tokenize
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
nltk.download('omw-1.4')

SEED = 42
AUTOTUNE = tf.data.experimental.AUTOTUNE
%load_ext tensorboard
import io

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


[nltk_data] Error loading wordnet: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1129)>
[nltk_data] Error loading omw-1.4: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1129)>


## Import Metadata

In [2]:
path = "/Users/xavierevans/nasa/metadata"
json_files = glob.glob(path + "/*.json")

In [3]:
json_keys = []
for file in json_files:
    with open(file) as f:
        data = json.load(f)
        json_keys += data.keys()
json_keys = list(set(json_keys))

In [4]:
json_values = []
for file in json_files:
    with open(file) as f:
        data = json.load(f)
        json_values.append(
            [data[key] if key in data.keys() else None for key in json_keys]
        )

In [5]:
df = pd.DataFrame(json_values, columns=json_keys)
columns = df.columns

## Process Text

In [6]:
def clean_abstract(abstract):
    return abstract.replace("\n", "").replace("\t", "")

In [7]:
nltk.download('stopwords')
stop_words = set(nltk.corpus.stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/xavierevans/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
lemmatizer = WordNetLemmatizer()

In [9]:
def clean_text(text):
    # Remove extra white space
    text = re.sub(r'\s+', ' ', text, flags=re.I)      
    # Remove special characters
    text = re.sub(r'\W', ' ', str(text))
    # Remove single characters
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)

    # Convert to lowercase
    text = text.lower()

    # Tokenize    
    tokens = text.split()
    # Remove stop words
    tokens_no_stop = [word for word in tokens if word not in stop_words]
    # Drop short words
    tokens_no_stop = [word for word in tokens_no_stop if len(word) > 3]

    clean_text = ' '.join(tokens_no_stop)

    return clean_text

In [10]:
def clean_title(title):
    return re.sub(r"[()]", "", title)

In [11]:
abstracts_list = df["Abstract"].to_list()
titles_list = df["EntryTitle"].to_list()

clean_abstracts_list = [clean_text(abstract) for abstract in abstracts_list]
clean_titles_list = [clean_text(title) for title in titles_list]

In [12]:
def create_vocab(tokens):
    vocab = set(tokens)

    token2index = dict(zip(vocab, range(1, len(vocab) + 1)))
    token2index['OOV'] = -1
    token2index['PAD'] = 0

    index2token = {index: token for token, index in token2index.items()}

    return vocab, token2index, index2token

In [13]:
abstract_tokens = ' '.join(clean_abstracts_list).split()
title_tokens = ' '.join(clean_titles_list).split()

In [14]:
tokens = list(set(abstract_tokens + title_tokens))
vocab, token2index, index2token = create_vocab(tokens)

In [15]:
abstract_tokens = [abstract.split() for abstract in clean_abstracts_list]
abstract_sequences = [[token2index[token] for token in abstract] for abstract in abstract_tokens]
title_tokens = [title.split() for title in clean_titles_list]
title_sequences = [[token2index[token] for token in title] for title in title_tokens]

In [16]:
padded_sequences = tf.keras.utils.pad_sequences(
    abstract_sequences + title_sequences,
    padding='post'
)

## Word Embeddings 

In [17]:
from gensim.models.fasttext import FastText

In [18]:
dimensions = 300
window_size = 5
min_count = 1
 
model = FastText(
    abstract_tokens + title_tokens,
    vector_size=dimensions,
    window=window_size, 
    min_count=min_count
)

In [19]:
token2embed = {token: model.wv[token].tolist() for token in tokens}
token2embed['OOV'] = np.zeros(dimensions, dtype=int)

## Synonyms

In [20]:
def get_synonyms(word):
    synsets = wn.synsets(word)
    lemma_names = []
    for synset in synsets:
        for lemma in synset.lemma_names():
            spaced_lemma = lemma.replace("_", " ")
            if spaced_lemma != word:
                lemma_names.append(spaced_lemma)
    return list(set(lemma_names))

In [21]:
token2synon = {token: get_synonyms(token) for token in tokens}

In [22]:
synons = [item for sublist in list(token2synon.values()) for item in sublist]
tokens_synons = list(set(tokens + synons))

In [23]:
token2graph = dict(zip(tokens_synons, range(len(tokens_synons))))

In [24]:
def create_synon_edges(synon_dict):
    synon_edges = []
    for token, synons in synon_dict.items():
        token_id = token2graph[token]
        for synon in synons:
            synon_id = token2graph[synon]
            synon_edges.append((token_id, synon_id, 1.0))
    return synon_edges

In [25]:
synon_edges = create_synon_edges(token2synon)

## Word Similarity

In [None]:
from itertools import permutations

from numpy.linalg import norm

In [None]:
token_perms = list(permutations(tokens, 2))

In [None]:
def cosine_similarity(t1, t2):
    v1, v2 = np.array(model.wv[t1]), np.array(model.wv[t2])
    return np.dot(v1, v2) / (norm(v1)*norm(v2))

In [None]:
weights = [cosine_similarity(t1, t2) for t1, t2 in token_perms]

In [None]:
cosine_edges = [(token2graph[t1], token2graph[t2], weight) for (t1, t2), weight in zip(token_perms, weights)]

## Graph Creation

In [None]:
edges = synon_edges + cosine_edges

In [None]:
with open('/users/xavierevans/nasa/notebooks/07-18-2022_EdgeListWeights.txt', 'w') as fp:
    for edge in edges:
        fp.write(f"{edge}")
    print('Done')

In [None]:
import networkx as nx

graph = nx.Graph()

for source, target, weight in edges:
    graph.add_edge(source, target, weight=weight)

In [None]:
import networkx as nx
from node2vec import Node2Vec

node2vec = Node2Vec(graph, dimensions=3, walk_length=2, num_walks=1, workers=10)

## tf–idf

In [26]:
# Create list of combined title and abstract text
title_abstract_list = list(zip(clean_titles_list, clean_abstracts_list))
title_abstract_list = [' '.join(title_abstract) for title_abstract in title_abstract_list]

In [27]:
# Give each dataset a unique ID
title2index = dict(zip(titles_list, range(len(titles_list))))
index2title = {index: title for title, index in title2index.items()}

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [29]:
# Calculate tf–idf for all words
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(title_abstract_list)
feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
dense_list = dense.tolist()
df_tfidf = pd.DataFrame(dense_list, columns=feature_names)



## Relevance

In [30]:
query = "measuring precipitation passive microwave sensors"

In [31]:
index2title

{0: 'Gridded Monthly Time-Mean Observation minus Analysis (oma) Values 0.5 x 0.667 degree V001 (MA_HIRS2_TIROSN_OMA) at GES DISC',
 1: 'Suomi NPP CrIS Level 1B Full Spectral Resolution V2 (SNPPCrISL1B) at GES DISC',
 2: 'GPM SSM/I on F13 (GPROF) Climate-based Radiometer Precipitation Profiling L2 1.5 hours 12 km V05 (GPM_2AGPROFF13SSMI_CLIM) at GES DISC',
 3: 'Sentinel-5P TROPOMI Radiance product band 7 (SWIR detector) L1B 5.5km x 7km V1 (S5P_L1B_RA_BD7_HiR) at GES DISC',
 4: 'OMI/Aura NO2 Tropospheric, Stratospheric & Total Columns MINDS Daily L2 Global Gridded 0.25 degree x 0.25 degree V1.1 (OMI_MINDS_NO2G) at GES DISC',
 5: 'MLS/Aura Level 3 Monthly Binned Hydroxyl (OH) Mixing Ratio on Assorted Grids V004 (ML3MBOH) at GES DISC',
 6: 'Gridded Monthly Time-Mean Observation minus Analysis (oma) Values 0.5 x 0.667 degree V001 (MA_SSU_NOAA07_OMA) at GES DISC',
 7: 'Suomi NPP CrIS Level 1B Normal Spectral Resolution V1 (SNPPCrISL1BNSR) at GES DISC',
 8: 'MODIS/Aqua Aerosol 5-Min L2 Swath 

In [32]:
list(df_tfidf.columns)[:5]

['00003', '00004', '00007', '0001', '00010']

In [33]:
relevance_dict = {index: 0 for index in range(len(titles_list))}

In [34]:
split_query = clean_text(query).split()
for word in split_query:
    tfidf_scores = df_tfidf[word].tolist()
    for index, score in enumerate(tfidf_scores):
        relevance_dict[index] += score

In [35]:
sorted_relevance_dict = sorted(relevance_dict.items(), key=lambda item: item[1], reverse=True)

In [36]:
def get_top_datasets(n):
    top_n = sorted_relevance_dict[:n]
    return [index2title[index] for index, score in top_n]

In [37]:
get_top_datasets(10)

['GPM DPR and GMI (Combined Precipitation) L3 1 day 0.25 degree x 0.25 degree V06 (GPM_3CMB_DAY) at GES DISC',
 'GPM DPR and GMI (Combined Precipitation) L3 1 day 0.25 degree x 0.25 degree V07 (GPM_3CMB_DAY) at GES DISC',
 'GPM DPR and GMI (Combined Precipitation) L3 1 month 0.25 degree x 0.25 degree V06 (GPM_3CMB) at GES DISC',
 'GPM DPR and GMI (Combined Precipitation) L3 1 month 0.25 degree x 0.25 degree V07 (GPM_3CMB) at GES DISC',
 'GPM MHS on METOP-C (GPROF) Radiometer Precipitation Profiling L3 1 day 0.25 degree x 0.25 degree V05 (GPM_3GPROFMETOPCMHS_DAY) at GES DISC',
 'GPM ATMS on SUOMI-NPP (GPROF) Radiometer Precipitation Profiling L3 1 month 0.25 degree x 0.25 degree V05 (GPM_3GPROFNPPATMS) at GES DISC',
 'GPM MHS on METOP-C (GPROF) Radiometer Precipitation Profiling L3 1 month 0.25 degree x 0.25 degree V05 (GPM_3GPROFMETOPCMHS) at GES DISC',
 'GPM ATMS on NOAA-20 (GPROF) Radiometer Precipitation Profiling L3 1 month 0.25 degree x 0.25 degree V05 (GPM_3GPROFNOAA20ATMS) at GE

In [38]:
title2index

{'Gridded Monthly Time-Mean Observation minus Analysis (oma) Values 0.5 x 0.667 degree V001 (MA_HIRS2_TIROSN_OMA) at GES DISC': 0,
 'Suomi NPP CrIS Level 1B Full Spectral Resolution V2 (SNPPCrISL1B) at GES DISC': 1,
 'GPM SSM/I on F13 (GPROF) Climate-based Radiometer Precipitation Profiling L2 1.5 hours 12 km V05 (GPM_2AGPROFF13SSMI_CLIM) at GES DISC': 2,
 'Sentinel-5P TROPOMI Radiance product band 7 (SWIR detector) L1B 5.5km x 7km V1 (S5P_L1B_RA_BD7_HiR) at GES DISC': 3,
 'OMI/Aura NO2 Tropospheric, Stratospheric & Total Columns MINDS Daily L2 Global Gridded 0.25 degree x 0.25 degree V1.1 (OMI_MINDS_NO2G) at GES DISC': 4,
 'MLS/Aura Level 3 Monthly Binned Hydroxyl (OH) Mixing Ratio on Assorted Grids V004 (ML3MBOH) at GES DISC': 5,
 'Gridded Monthly Time-Mean Observation minus Analysis (oma) Values 0.5 x 0.667 degree V001 (MA_SSU_NOAA07_OMA) at GES DISC': 6,
 'Suomi NPP CrIS Level 1B Normal Spectral Resolution V1 (SNPPCrISL1BNSR) at GES DISC': 7,
 'MODIS/Aqua Aerosol 5-Min L2 Swath Sub

## Embedding Datasets

### Edge Index

In [39]:
science_keywords = df['ScienceKeywords'].tolist()

In [40]:
len(science_keywords)

1738

In [41]:
"""
Create the hierarchy of keywords for each dataset.
Take each keyword and backtrack all the way to the root, Earth Science.
Now, for each keyword, we can get the list of all superfield keywords so we know the structure of keywords for making the graph.
"""
keyword_dict = {}
for dataset in science_keywords:
    for keyword in dataset:
        title = lambda x: x.title()
        hierarchy, deepest_class = list(map(title, list(keyword.values())))[:-1], list(keyword.keys())[-1]
        hierarchy.reverse()
        deepest_keyword = keyword[deepest_class].title()
        if deepest_keyword not in keyword_dict.keys():
            keyword_dict[deepest_keyword] = hierarchy

In [42]:
all_keywords = list(set(list(keyword_dict.keys()) + [item for sublist in list(keyword_dict.values()) for item in sublist]))

In [43]:
kywrd2kywid = dict(zip(all_keywords, range(len(index2title), len(index2title) + len(all_keywords))))
kywid2kywrd = {index: keyword for keyword, index in kywrd2kywid.items()}

In [44]:
kywrd2kywid

{'Solar Activity': 1738,
 'Precipitation Amount': 1739,
 'Cloud Radiative Forcing': 1740,
 'Energy Deposition': 1741,
 'Methyl Cyanide': 1742,
 'Biosphere': 1743,
 'Trace Gases/Trace Species': 1744,
 'Temperature Indicators': 1745,
 'Ionosphere/Magnetosphere Dynamics': 1746,
 'Canopy Characteristics': 1747,
 'Cloud Fraction': 1748,
 'Eruption Dynamics': 1749,
 'Air Quality': 1750,
 'Vegetation': 1751,
 'Antenna Temperature': 1752,
 'Aerosol Optical Depth': 1753,
 'Infrared Flux': 1754,
 'Glaciers': 1755,
 'Average Flow': 1756,
 'Halocarbons And Halogens': 1757,
 'Ozone (O3) Profile': 1758,
 'Phosphate': 1759,
 'Earth Radiation Budget': 1760,
 'Soil Moisture': 1761,
 'Radar': 1762,
 'Cloud Precipitable Water': 1763,
 'Reflected Flux': 1764,
 '12.1 Micron Extinction': 1765,
 'Flight Data Logs': 1766,
 'Sensible Heat Flux': 1767,
 'Air Temperature Profile': 1768,
 'Common Sense Climate Index': 1769,
 'Atmospheric Chemistry': 1770,
 'Snow Water Equivalent': 1771,
 'Atmospheric Temperature'

In [45]:
titles_keywords_raw = df[['EntryTitle', 'ScienceKeywords']].to_numpy()

In [46]:
def get_deepest_keywords(dataset):
    deepest_keywords = []
    for keyword in dataset:
        deepest_class = list(keyword.keys())[-1]
        deepest_keywords.append(keyword[deepest_class].title())
    return deepest_keywords

In [47]:
title2kywrd = {}
for title, keywords_raw in titles_keywords_raw:
    keywords = get_deepest_keywords(keywords_raw)
    title2kywrd[title] = keywords

In [48]:
index2kywrd = {title2index[title]: keywords for title, keywords in title2kywrd.items()}

In [50]:
get_kywrd_index = lambda x: kywrd2kywid[x]
keyword_index_dict = {kywrd2kywid[key]: list(map(get_kywrd_index, value)) for key, value in keyword_dict.items()}

In [51]:
keyword_edges = set()
for key, value in keyword_index_dict.items():
    val_len = len(value)
    if val_len > 0:
        keyword_edges.add((key, value[0]))
        keyword_edges.add((value[0], key))
        for i in range(val_len - 1):
            keyword_edges.add((value[i], value[i+1]))
            keyword_edges.add((value[i+1], value[i]))

In [53]:
to_name = lambda x: (kywid2kywrd[x[0]], kywid2kywrd[x[1]])
list(map(to_name, keyword_edges))[:10]

[('Cloud Properties', 'Cloud Height'),
 ('Snow', 'Solid Precipitation'),
 ('Atmospheric Winds', 'Surface Winds'),
 ('Particulate Matter', 'Aerosols'),
 ('Cloud Properties', 'Clouds'),
 ('Cloud Top Pressure', 'Cloud Properties'),
 ('Chlorofluorocarbons', 'Halocarbons And Halogens'),
 ('Solid Precipitation', 'Snow'),
 ('Cryosphere', 'Sea Ice'),
 ('Aerosol Optical Depth/Thickness', 'Aod')]

In [54]:
dataset_edges = set()
for index, keywords in index2kywrd.items():
    if keywords:
        dataset_edges.add((index, kywrd2kywid[keywords[0]]))
        dataset_edges.add((kywrd2kywid[keywords[0]], index))
        for i in range(len(keywords) - 1):
            dataset_edges.add((kywrd2kywid[keywords[i]], kywrd2kywid[keywords[i+1]]))
            dataset_edges.add((kywrd2kywid[keywords[i+1]], kywrd2kywid[keywords[i]]))

In [57]:
list(dataset_edges)[:10]

[(1954, 1803),
 (1838, 1857),
 (1964, 1996),
 (1196, 2124),
 (800, 2124),
 (1727, 1760),
 (1928, 1750),
 (2084, 2034),
 (1815, 1994),
 (1920, 61)]

In [58]:
edges = keyword_edges | dataset_edges
edges = sorted(edges, key=lambda x: x[0])

In [59]:
edges[:10]

[(0, 2124),
 (1, 2034),
 (2, 2098),
 (3, 2136),
 (4, 1941),
 (5, 1805),
 (6, 2124),
 (7, 2034),
 (8, 2082),
 (9, 2098)]

In [60]:
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric.transforms as T
import umap.umap_ as umap
from sklearn.linear_model import LogisticRegression
from torch_cluster import random_walk
from torch_geometric.datasets import Planetoid
from torch_geometric.loader import NeighborSampler as RawNeighborSampler
from torch_geometric.nn import SAGEConv

In [61]:
from torch_geometric.data import Data

edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
edge_index

tensor([[   0,    1,    2,  ..., 2149, 2149, 2149],
        [2124, 2034, 2098,  ..., 1970, 1609, 1908]])

### Node Features

In [62]:
kywrd2kywid

{'Solar Activity': 1738,
 'Precipitation Amount': 1739,
 'Cloud Radiative Forcing': 1740,
 'Energy Deposition': 1741,
 'Methyl Cyanide': 1742,
 'Biosphere': 1743,
 'Trace Gases/Trace Species': 1744,
 'Temperature Indicators': 1745,
 'Ionosphere/Magnetosphere Dynamics': 1746,
 'Canopy Characteristics': 1747,
 'Cloud Fraction': 1748,
 'Eruption Dynamics': 1749,
 'Air Quality': 1750,
 'Vegetation': 1751,
 'Antenna Temperature': 1752,
 'Aerosol Optical Depth': 1753,
 'Infrared Flux': 1754,
 'Glaciers': 1755,
 'Average Flow': 1756,
 'Halocarbons And Halogens': 1757,
 'Ozone (O3) Profile': 1758,
 'Phosphate': 1759,
 'Earth Radiation Budget': 1760,
 'Soil Moisture': 1761,
 'Radar': 1762,
 'Cloud Precipitable Water': 1763,
 'Reflected Flux': 1764,
 '12.1 Micron Extinction': 1765,
 'Flight Data Logs': 1766,
 'Sensible Heat Flux': 1767,
 'Air Temperature Profile': 1768,
 'Common Sense Climate Index': 1769,
 'Atmospheric Chemistry': 1770,
 'Snow Water Equivalent': 1771,
 'Atmospheric Temperature'

In [63]:
import json

with open('06-27-2022_ScienceKeywordClassificationDict.txt') as f:
    data = f.read()

kywrd2class = json.loads(data)

In [64]:
class2index = {
    'Dataset': 0,
    'Category': 1,
    'Topic': 2,
    'Term': 3,
    'VariableLevel1': 4,
    'VariableLevel2': 5,
    'VariableLevel3': 6,
    'DetailedVariable': 7
}

In [65]:
dataset_classes = torch.zeros(1738)

In [66]:
ordered_kywrds = kywrd2kywid.keys()
keyword_classes = torch.tensor([class2index[kywrd2class[keyword]] for keyword in ordered_kywrds], dtype=torch.long)

In [67]:
y = torch.cat((dataset_classes, keyword_classes))

In [68]:
tfidf_tensor = torch.tensor(df_tfidf.to_numpy(), dtype=torch.long)
keyword_tfidf_tensor = torch.zeros((len(kywrd2kywid), df_tfidf.shape[1]))
x = torch.cat((tfidf_tensor, keyword_tfidf_tensor))

In [69]:
train_mask = [True]*400 + [False]*(len(index2title)-400) + [True]*100 + [False]*(len(kywid2kywrd)-100)
val_mask = [False]*400 + [True]*400 + [False]*(len(index2title)-800) + [False]*100 + [True]*100 + [False]*(len(kywid2kywrd)-200)
test_mask = [False]*800 + [True]*(len(index2title)-800) + [False]*200 + [True]*(len(kywid2kywrd)-200)

In [70]:
data = Data(x=x, edge_index=edge_index, y=y, train_mask=train_mask, val_mask=val_mask, test_mask=test_mask)

### Learn Embeddings

In [146]:
class NeighborSampler(RawNeighborSampler):
    def sample(self, batch):
        batch = torch.tensor(batch)
        row, col, _ = self.adj_t.coo()

        # For each node in `batch`, we sample a direct neighbor (as positive
        # example) and a random node (as negative example):
        pos_batch = random_walk(row, col, batch, walk_length=1, coalesced=False)[:, 1]

        neg_batch = torch.randint(
            0, self.adj_t.size(1), (batch.numel(),), dtype=torch.long
        )

        batch = torch.cat([batch, pos_batch, neg_batch], dim=0)
        return super(NeighborSampler, self).sample(batch)


train_loader = NeighborSampler(
    data.edge_index,
    sizes=[10, 10],
    batch_size=256,
    shuffle=True,
    num_nodes=data.num_nodes,
)


class SAGE(nn.Module):
    def __init__(self, in_channels, hidden_channels, num_layers):
        super(SAGE, self).__init__()
        self.num_layers = num_layers
        self.convs = nn.ModuleList()

        for i in range(num_layers):
            in_channels = in_channels if i == 0 else hidden_channels
            self.convs.append(SAGEConv(in_channels, hidden_channels))

    def forward(self, x, adjs):
        for i, (edge_index, _, size) in enumerate(adjs):
            x_target = x[: size[1]]  # Target nodes are always placed first.
            x = self.convs[i]((x, x_target), edge_index)
            if i != self.num_layers - 1:
                x = x.relu()
                x = F.dropout(x, p=0.5, training=self.training)
        return x

    def full_forward(self, x, edge_index):
        for i, conv in enumerate(self.convs):
            x = conv(x, edge_index)
            if i != self.num_layers - 1:
                x = x.relu()
                x = F.dropout(x, p=0.5, training=self.training)
        return x


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = SAGE(data.num_node_features, hidden_channels=64, num_layers=2)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
x, edge_index = data.x.to(device), data.edge_index.to(device)


def train():
    model.train()

    total_loss = 0
    for batch_size, n_id, adjs in train_loader:
        # `adjs` holds a list of `(edge_index, e_id, size)` tuples.
        adjs = [adj.to(device) for adj in adjs]
        optimizer.zero_grad()

        out = model(x[n_id], adjs)
        out, pos_out, neg_out = out.split(out.size(0) // 3, dim=0)

        pos_loss = F.logsigmoid((out * pos_out).sum(-1)).mean()
        neg_loss = F.logsigmoid(-(out * neg_out).sum(-1)).mean()
        loss = -pos_loss - neg_loss
        loss.backward()
        optimizer.step()

        total_loss += float(loss) * out.size(0)

    return total_loss / data.num_nodes


@torch.no_grad()
def test():
    model.eval()
    out = model.full_forward(x, edge_index).cpu()

    clf = LogisticRegression()
    clf.fit(out[data.train_mask], data.y[data.train_mask])

    val_acc = clf.score(out[data.val_mask], data.y[data.val_mask])
    test_acc = clf.score(out[data.test_mask], data.y[data.test_mask])

    return val_acc, test_acc


for epoch in range(1, 201):
    loss = train()
    val_acc, test_acc = test()
    print(
        f"Epoch: {epoch:03d}, Loss: {loss:.4f}, "
        f"Val: {val_acc:.4f}, Test: {test_acc:.4f}"
    )

Epoch: 001, Loss: 1.3109, Val: 0.8920, Test: 0.8861
Epoch: 002, Loss: 1.1033, Val: 0.8580, Test: 0.8670
Epoch: 003, Loss: 1.0204, Val: 0.8400, Test: 0.8574
Epoch: 004, Loss: 0.9708, Val: 0.8300, Test: 0.8539
Epoch: 005, Loss: 0.9379, Val: 0.8480, Test: 0.8513
Epoch: 006, Loss: 0.9375, Val: 0.8580, Test: 0.8600
Epoch: 007, Loss: 0.8940, Val: 0.8440, Test: 0.8600
Epoch: 008, Loss: 0.8909, Val: 0.8540, Test: 0.8548
Epoch: 009, Loss: 0.9030, Val: 0.8520, Test: 0.8583
Epoch: 010, Loss: 0.9111, Val: 0.8560, Test: 0.8643
Epoch: 011, Loss: 0.8934, Val: 0.8560, Test: 0.8757
Epoch: 012, Loss: 0.8477, Val: 0.8660, Test: 0.8757
Epoch: 013, Loss: 0.9216, Val: 0.8520, Test: 0.8635
Epoch: 014, Loss: 0.9194, Val: 0.8500, Test: 0.8478
Epoch: 015, Loss: 0.8816, Val: 0.8400, Test: 0.8522
Epoch: 016, Loss: 0.9012, Val: 0.8400, Test: 0.8487
Epoch: 017, Loss: 0.8753, Val: 0.8400, Test: 0.8470
Epoch: 018, Loss: 0.8418, Val: 0.8480, Test: 0.8513
Epoch: 019, Loss: 0.8705, Val: 0.8540, Test: 0.8678
Epoch: 020, 

In [147]:
with torch.no_grad():
    model.eval()
    out = model.full_forward(x, edge_index).cpu()

In [148]:
palette = {}

for n, y in enumerate(set(data.y.numpy())):
    palette[y] = f"C{n}"

In [149]:
embd = umap.UMAP().fit_transform(out.cpu().numpy())

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [150]:
kywrd_count = len(kywrd2kywid)

In [215]:
x_vals, y_vals = embd.T[0][:-kywrd_count], embd.T[1][:-kywrd_count]
labels = (list(index2title.values()) + list(kywid2kywrd.values()))[:-kywrd_count]

In [281]:
topics

['Climate Indicators',
 'Oceans',
 'Human Dimensions',
 'Solid Earth',
 'Sun-Earth Interactions',
 'Biosphere',
 'Biological Classification',
 'Terrestrial Hydrosphere',
 'Atmosphere',
 'Agriculture',
 'Spectral/Engineering',
 'Land Surface',
 'Cryosphere']

In [280]:
# Get the set of keywords for each dataset
title2topic = dict(zip(labels, [[]]*len(labels)))
for title in labels:
    topics_for_this_title = set()
    kywrds_for_title = index2kywrd[title2index[title]]
    for keyword in kywrds_for_title:
        topic = keyword_dict[keyword][-2]
        topics_for_this_title.add(topic)
    title2topic[title] = list(topics_for_this_title)[0]
    print(topics_for_this_title)

{'Land Surface', 'Atmosphere'}
{'Spectral/Engineering'}
{'Atmosphere'}
{'Spectral/Engineering', 'Atmosphere'}
{'Atmosphere'}
{'Atmosphere'}
{'Land Surface', 'Atmosphere'}
{'Spectral/Engineering'}
{'Atmosphere'}
{'Atmosphere'}
{'Atmosphere'}
{'Biosphere', 'Land Surface'}
{'Atmosphere'}
{'Land Surface', 'Atmosphere'}
{'Atmosphere'}
{'Spectral/Engineering'}
{'Oceans', 'Land Surface', 'Atmosphere'}
{'Spectral/Engineering'}
{'Sun-Earth Interactions'}
{'Oceans', 'Land Surface', 'Atmosphere'}
{'Oceans', 'Land Surface', 'Atmosphere'}
{'Atmosphere'}
{'Atmosphere'}
{'Atmosphere'}
{'Sun-Earth Interactions'}
{'Land Surface', 'Atmosphere'}
{'Atmosphere'}
{'Oceans', 'Land Surface', 'Atmosphere'}
{'Atmosphere'}
{'Atmosphere'}
{'Atmosphere'}
{'Atmosphere'}
{'Biosphere', 'Terrestrial Hydrosphere', 'Land Surface', 'Atmosphere'}
{'Atmosphere'}
{'Atmosphere'}
{'Atmosphere'}
{'Land Surface', 'Atmosphere'}
{'Biosphere'}
{'Oceans', 'Land Surface', 'Atmosphere'}
{'Atmosphere'}
{'Atmosphere'}
{'Land Surface', 

In [279]:
# Backtrack and get the terms for each of the deepest keywords for each dataset
title2term = dict(zip(labels, [[]]*len(labels)))
for title in labels:
    terms_for_this_title = set()
    kywrds_for_title = index2kywrd[title2index[title]]
    for keyword in kywrds_for_title:
        if len(keyword_dict[keyword]) > 2:
            term = keyword_dict[keyword][-3]
        else:
            term = keyword_dict[keyword][-2]
        terms_for_this_title.add(term)
    title2term[title] = list(terms_for_this_title)[0]
    print(terms_for_this_title)

{'Altitude', 'Topography', 'Land Use/Land Cover'}
{'Infrared Wavelengths'}
{'Atmosphere'}
{'Platform Characteristics', 'Spectral/Engineering', 'Atmospheric Radiation', 'Sensor Characteristics'}
{'Atmospheric Chemistry'}
{'Atmospheric Chemistry'}
{'Altitude', 'Topography', 'Land Use/Land Cover'}
{'Infrared Wavelengths'}
{'Aerosols', 'Atmospheric Radiation'}
{'Atmosphere'}
{'Atmosphere'}
{'Soils', 'Surface Thermal Properties', 'Vegetation'}
{'Atmospheric Water Vapor', 'Atmospheric Chemistry'}
{'Surface Thermal Properties', 'Atmospheric Chemistry', 'Atmosphere', 'Atmospheric Water Vapor', 'Atmospheric Temperature'}
{'Atmospheric Chemistry'}
{'Microwave'}
{'Surface Thermal Properties', 'Atmospheric Chemistry', 'Atmospheric Radiation', 'Sea Surface Topography', 'Atmospheric Water Vapor', 'Atmospheric Pressure', 'Ocean Pressure', 'Clouds', 'Ocean Temperature', 'Precipitation', 'Atmospheric Temperature'}
{'Microwave', 'Infrared Wavelengths'}
{'Solar Activity'}
{'Altitude', 'Surface Thermal Pr

In [263]:
title2deep = dict(zip(labels, [[]]*len(labels)))
for title in labels:
    kywrds_for_title = index2kywrd[title2index[title]]
    title2deep[title] = kywrds_for_title

In [264]:
topics = [keyword for keyword, kywrd_class in kywrd2class.items() if kywrd_class == 'Topic']

In [265]:
# Create DataFrame of x- and y-values of embeddings, the dataset title, and the topic associated with the dataset
df_embeddings_title_topic = pd.DataFrame(list(zip(x_vals, y_vals, labels)), columns=['x', 'y', 'title'])

In [266]:
df_embeddings_title_topic['topic'] = df_embeddings_title_topic['title'].map(title2topic)

In [267]:
df_embeddings_title_topic['term'] = df_embeddings_title_topic['title'].map(title2term)

In [268]:
topic2color = dict(zip(topics, range(len(topics))))

In [269]:
df_embeddings_title_topic['color'] = df_embeddings_title_topic['topic'].map(topic2color)

In [278]:
# Plot the embeddings color-coded based on term
import plotly.express as px

fig = px.scatter(df_embeddings_title_topic, x='x', y='y', hover_name="title", hover_data=["x", "y"], color="term")
fig.show()
fig.write_html('07-28-2022_DatasetEmbeddings_StratifiedByTerm.html')

if not os.path.exists("images"):
    os.mkdir("images")

fig.write_image("images/07-28-2022_DatasetEmbeddings_StratifiedByTerm.html.png")

In [277]:
# Plot the embeddings color-coded based on topic
fig = px.scatter(df_embeddings_title_topic, x='x', y='y', hover_name='title', hover_data=['x', 'y'], color='topic')
fig.show()
fig.write_html('07-28-2022_DatasetEmbeddings_StratifiedByTopic.html')

In [None]:
df_tfidf

 (name, summed_tfidf, count_datasets_summed)   

In [None]:
hierarchy_chains = []
for deepest, hierarchy in keyword_dict.items():
    hierarchy.reverse()
    new_chain = hierarchy + [deepest]
    hierarchy_chains.append(new_chain)

def group_then_create_dict(hierarchies):
    group by first element, now list of groups of hierarchies
    for all groups:
        if len(group) == 1: 
            continue
        else:
            new_dict = {common first elt: group_then_create_dict(lists but shaved off first elt)}
            group_then_create_dict(new_dict.vals())

In [None]:
title2kywrd

In [None]:
keyword_dict

In [73]:
from copy import deepcopy

title2allkw = deepcopy(title2kywrd)
for title, keywords in title2kywrd.items():
    higher_keywords = set()
    for keyword in keywords:
        higher_keywords |= set(keyword_dict[keyword])
    title2allkw[title] += list(higher_keywords)

In [74]:
title2allkw

{'Gridded Monthly Time-Mean Observation minus Analysis (oma) Values 0.5 x 0.667 degree V001 (MA_HIRS2_TIROSN_OMA) at GES DISC': ['Barometric Altitude',
  'Terrain Elevation',
  'Land Use/Land Cover Classification',
  'Altitude',
  'Topography',
  'Land Surface',
  'Atmosphere',
  'Land Use/Land Cover',
  'Earth Science'],
 'Suomi NPP CrIS Level 1B Full Spectral Resolution V2 (SNPPCrISL1B) at GES DISC': ['Brightness Temperature',
  'Infrared Radiance',
  'Infrared Wavelengths',
  'Earth Science',
  'Spectral/Engineering'],
 'GPM SSM/I on F13 (GPROF) Climate-based Radiometer Precipitation Profiling L2 1.5 hours 12 km V05 (GPM_2AGPROFF13SSMI_CLIM) at GES DISC': ['Atmospheric Water Vapor',
  'Precipitation',
  'Earth Science',
  'Atmosphere'],
 'Sentinel-5P TROPOMI Radiance product band 7 (SWIR detector) L1B 5.5km x 7km V1 (S5P_L1B_RA_BD7_HiR) at GES DISC': ['Atmospheric Emitted Radiation',
  'Net Radiation',
  'Radiative Forcing',
  'Scattering',
  'Shortwave Radiation',
  'Platform Chara

In [97]:
kywrd2titles = dict(zip(all_keywords, [None]*len(all_keywords)))
for keyword in kywrd2titles.keys():
    titles = set()
    for title in title2allkw.keys():
        if keyword in title2allkw[title]:
            titles.add(title)
    kywrd2titles[keyword] = list(titles)

In [99]:
kywrd2titles['Atmosphere']

['tavgM_3d_trb_Cp: MERRA 3D IAU Diagnostic, Turbulence, Monthly Mean  1.25 x 1.25 degree V5.2.0 (MATMCPTRB) at GES DISC',
 'CAR INTEXB BRDF Measurements L1 V2 (CAR_INTEXB_BRDF) at GES DISC',
 'GPM MHS on NOAA19 (GPROF) Radiometer Precipitation Profiling L3 1 month 0.25 degree x 0.25 degree V05 (GPM_3GPROFNOAA19MHS) at GES DISC',
 'MERRA-2 tavgU_2d_aer_Nx: 2d,diurnal,Time-averaged,Single-Level,Assimilation,Aerosol Diagnostics 0.625 x 0.5 degree V5.12.4 (M2TUNXAER) at GES DISC',
 'GPM SSMIS on F18 (GPROF) Climate-based Radiometer Precipitation Profiling L3 1 day 0.25 degree x 0.25 degree V05 (GPM_3GPROFF18SSMIS_DAY_CLIM) at GES DISC',
 'tavgU_3d_mst_Cp: MERRA 3D IAU Diagnostic, Moist Physics, Diurnal  1.25 x 1.25 degree V5.2.0 (MATUCPMST) at GES DISC',
 'OMI/Aura Formaldehyde (HCHO) Total Column 1-orbit L2 Swath 13x24 km V003 (OMHCHO) at GES DISC',
 'Sentinel-5P TROPOMI SNPP cloud product band 7 (SWIR detector) 1-Orbit L2 5.5km x 7km V1 (S5P_L2__NP_BD7_HiR) at GES DISC',
 'TOMS Nimbus-7 

In [None]:
kywrd2titles['12.1 Micron Extinction']

In [122]:
kywrd2ndftr = {}
for keyword in kywrd2titles.keys():
    sum = np.zeros(df_tfidf.shape[1])
    titles = kywrd2titles[keyword]
    for title in titles:
        tfidf_score = df_tfidf.iloc[title2index[title]].tolist()
        sum += tfidf_score
    keyword_node_features = sum / len(titles)
    kywrd2ndftr[keyword] = list(keyword_node_features)

In [142]:
keyword_node_features = np.array(list(kywrd2ndftr.values()))
keyword_tfidf_tensor = torch.tensor(keyword_node_features, dtype=torch.float)

In [143]:
keyword_tfidf_tensor

tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0016, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0060, 0.0000, 0.0000]])

In [144]:
x = torch.cat((tfidf_tensor, keyword_tfidf_tensor))

In [145]:
data = Data(x=x, edge_index=edge_index, y=y, train_mask=train_mask, val_mask=val_mask, test_mask=test_mask)