## Data Wrangling

In [59]:
import pandas as pd
import json
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
import re

In [60]:
# Browser-like User-Agent header attached to every request sent to api.lib.harvard.edu below.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3'}

def get_first_set(subject, limit=50, next_cursor='*'):
    '''
    Fetch one page of text records for a subject from api.lib.harvard.edu.

    Input: URL-encoded subject, page size and pagination cursor
    Output: list of the records found under items -> mods on that page
            (empty list when the page holds no items)
    '''
    regular_url = f"https://api.lib.harvard.edu/v2/items.json?resourceType=text&subject={subject}&limit={limit}&cursor={next_cursor}"
    request = Request(url=regular_url, headers=headers)
    # Close the HTTP response as soon as the body has been read.
    with urlopen(request) as page:
        html = page.read().decode("utf-8")
    data = json.loads(html)

    page_items = data.get('items') or {}
    new_items = page_items.get('mods') or []
    # Presumably a page with exactly one record can deliver it as a bare dict
    # rather than a list, as other fields handled in this file do -- TODO confirm.
    if type(new_items) != list:
        new_items = [new_items]
    return list(new_items)

In [61]:
def get_data(subject, limit, next_cursor):
    '''
    Request one page of text records from api.lib.harvard.edu.

    Input: URL-encoded subject, page size and pagination cursor
    Output: the decoded JSON response as a Python object
    '''
    regular_url = f"https://api.lib.harvard.edu/v2/items.json?resourceType=text&subject={subject}&limit={limit}&cursor={next_cursor}"
    request = Request(url=regular_url, headers=headers)
    # The context manager closes the connection even if reading/decoding fails.
    with urlopen(request) as page:
        html = page.read().decode("utf-8")
    return json.loads(html)

def get_all(subject, limit=50, next_cursor='*'):
    '''
    Page through the API with get_data and collect every record.

    Input: URL-encoded subject, page size and starting cursor
    Output: list of all records found under items -> mods across the pages

    Paging stops when a page has no items, when the response carries no
    next cursor, or when the API hands back a cursor that was already used
    (requesting it again would repeat the same page forever).
    '''
    items = []
    used_cursors = {next_cursor}
    data = get_data(subject, limit, next_cursor)
    while data.get('items'):
        new_items = data['items'].get('mods') or []
        # Presumably a single record can arrive as a bare dict; `items += dict`
        # would then append its keys instead of the record -- TODO confirm.
        if type(new_items) != list:
            new_items = [new_items]
        items += new_items

        next_cursor = (data.get('pagination') or {}).get('nextCursor')
        if not next_cursor or next_cursor in used_cursors:
            break
        used_cursors.add(next_cursor)
        data = get_data(subject, limit, next_cursor)
    return items

In [62]:
# The subject is already URL-encoded (%20 is an encoded space) because get_data
# places it into the query string unchanged.
subject = "positive%20psychology"
# Download every matching record; get_all keeps paging until a page has no items.
entries = get_all(subject)

In [63]:
# Inspect which top-level fields a single downloaded record carries.
entries[400].keys()

dict_keys(['titleInfo', 'name', 'typeOfResource', 'genre', 'originInfo', 'language', 'physicalDescription', 'abstract', 'tableOfContents', 'note', 'subject', 'classification', 'relatedItem', 'identifier', 'location', 'extension', 'recordInfo'])

In [64]:
def extract_abstract(entry):
    '''
    Input: a database entry
    Output: string with the text of all abstracts joined by single spaces
            ("" when the entry has no abstract)
    '''
    if 'abstract' not in entry.keys():
        return ""

    abstracts = entry['abstract']
    if type(abstracts) != list:
        abstracts = [abstracts]

    text = []
    for abstract in abstracts:
        # An abstract may be a dict holding its text under '#text' or a plain
        # string; anything without usable text is skipped instead of crashing.
        if isinstance(abstract, dict):
            abstract = abstract.get('#text')
        if isinstance(abstract, str):
            text.append(abstract)

    return ' '.join(text)

In [65]:
# Spot-check the abstract extraction on one record.
extract_abstract(entries[13])

"Based upon in-depth interviews with a broad cross-section of patients, Being Positive gives us the clearest picture we have of what life is like for people who have been diagnosed HIV positive. Most books about HIV and AIDS are filled with statistics, or they present the life of a single individual or the experience of several. But Being Positive analyzes the lives of a wide group of people - male and female, straight and gay, African American, white, and Latino - exploring the contrasts and similarities that emerge. The book is not only a humanizing antidote to statistical studies but an important benchmark in understanding the individual dramas of those who are affected. To gain a full grasp of who they are as people, and how they perceive the issues they confront, is Dr. Klitzman's aim."

In [66]:
def extract_title(entry):
    '''
    Input: a database entry
    Output: string built from the stripped text values of the entry's
            (first) titleInfo joined by single spaces ("" when unavailable)
    '''
    if 'titleInfo' not in entry.keys():
        return ""

    title_info = entry['titleInfo']
    if type(title_info) == list:
        # Only the first titleInfo is used; an empty list has no title at all.
        if not title_info:
            return ""
        title_info = title_info[0]

    if isinstance(title_info, str):
        return title_info.strip()
    if not isinstance(title_info, dict):
        return ""

    # None and other non-string values cannot be stripped or joined, so only
    # string values contribute to the title.
    title_parts = [value.strip() for value in title_info.values() if isinstance(value, str)]
    return ' '.join(title_parts)

In [67]:
def extract_people(entry):
    '''
    Input: a database entry
    Output: tuple of lists of authors and editors

    A name counts as an author when one of its role texts contains 'author'
    or 'creator', and as an editor when it contains 'editor'. Each list holds
    every person once, in order of first appearance. The entry is only read,
    never modified.
    '''
    if 'name' not in entry.keys():
        return [], []

    names = entry['name']
    if type(names) != list:
        names = [names]

    # Dicts serve as insertion-ordered sets: duplicates are dropped while the
    # output order stays deterministic.
    authors = {}
    editors = {}

    for name in names:
        if 'role' not in name.keys():
            continue

        # Wrap a single role in a local list rather than writing it back into
        # the entry, which would silently change the caller's data.
        roles = name['role']
        if type(roles) != list:
            roles = [roles]

        for role in roles:
            role_terms = role.get('roleTerm', [])
            if type(role_terms) != list:
                role_terms = [role_terms]

            # A role may carry several terms; gather the text of all of them.
            term_texts = []
            for term in role_terms:
                if isinstance(term, dict):
                    term = term.get('#text')
                if isinstance(term, str):
                    term_texts.append(term)
            role_text = ' '.join(term_texts)

            if 'author' in role_text or 'creator' in role_text:
                people = authors
            elif 'editor' in role_text:
                people = editors
            else:
                continue

            # When namePart is a list, its first element is taken as the name.
            name_part = name['namePart']
            if type(name_part) == list:
                name_part = name_part[0]
            people[name_part] = None

    return list(authors), list(editors)

In [68]:
# Spot-check the author/editor extraction on one record.
authors_index = 25
extract_people(entries[authors_index])

(['Rojas Salcedo, Roberto Iván.'], [])

In [69]:
def extract_subjects(entry):
    '''
    Input: a database entry
    Output: list of the distinct, non-empty topics of the entry's subjects
            (in no particular order; [] when the entry has no subject)
    '''
    if 'subject' not in entry.keys():
        return []

    raw_subjects = entry['subject']
    subject_items = raw_subjects if type(raw_subjects) == list else [raw_subjects]

    unique_topics = set()
    for item in subject_items:
        # Subjects without a topic contribute nothing.
        if 'topic' not in item.keys():
            continue
        raw_topics = item['topic']
        topic_items = raw_topics if type(raw_topics) == list else [raw_topics]
        unique_topics.update(topic for topic in topic_items if topic)

    return list(unique_topics)

In [70]:
# Spot-check the subject extraction on one record.
subject_index = 4
extract_subjects(entries[subject_index])

['Counseling of',
 'AIDS (Disease)',
 'psychology',
 'Patient Education as Topic',
 'HIV (Viruses)',
 'AIDS Serodiagnosis',
 'Serodiagnosis',
 'Counseling',
 'HIV Infections',
 'HIV-positive persons',
 'HIV Seropositivity']

In [71]:
def create_data_structure(entries):
    '''
    Input: iterable of database entries
    Output: list with one dict per entry holding its 'title', 'authors',
            'editors', 'abstract' and 'subjects'
    '''
    records = []
    for entry in entries:
        record = {'title': extract_title(entry)}
        record['authors'], record['editors'] = extract_people(entry)
        record['abstract'] = extract_abstract(entry)
        record['subjects'] = extract_subjects(entry)
        records.append(record)
    return records

In [72]:
# Flatten every raw record into a dict of title / authors / editors / abstract / subjects.
data = create_data_structure(entries)
# Show one flattened record as a sanity check.
data[10]

{'title': 'Positive carers the rights and responsibilities of HIV positive health care workers',
 'authors': ['Mayho, Paul.'],
 'editors': [],
 'abstract': 'There is not one positively identified case in Britain of an HIV-infected health care worker transmitting HIV to a patient, yet much fear surrounds the HIV positive care worker. Positive Carers examines the responsibilities and rights of the individual, health care management and government agencies. Case studies provide a vivid framework for a discussion of the practicality of continuing employment after diagnosis and explores the dilemmas workers may face. The role of the media is generating unnecessary fear is documented and analysed in relation to public attitudes. There is a useful detailed proposal for local policy on the management of HIV-infected workers, as well as a summary of the current Department of Health guidelines. Positive Carers provides a better understanding of the predicament of HIV-infected health care workers

## Cypher Queries

In [73]:
def extract_all_unique_subjects(data):
    '''
    Input: list of entry dicts, each with a 'subjects' value
    Output: list of the distinct non-empty subjects over all entries
            (in no particular order)
    '''
    unique_subjects = {
        subject
        for entry in data
        for subject in (entry['subjects'] or ())
        if subject
    }
    return list(unique_subjects)

In [74]:
def subject_to_cypher_variable(subject):
    '''
    Input: a subject name or entry title
    Output: lower-case variable name in which every run of characters other
            than ASCII letters and digits is replaced by one underscore

    A leading underscore is added when the result would start with a digit;
    an empty or whitespace-only input yields "_".
    '''
    variable = re.sub('[^0-9a-zA-Z]+', '_',
                      '_'.join(subject.lower().split()))
    # Guard the empty case before looking at variable[0], which would
    # otherwise raise IndexError.
    if not variable or variable[0].isnumeric():
        return f"_{variable}"
    return variable

In [75]:
def replace_quotes(text):
    '''
    Input: a string
    Output: the string with every double and single quote character removed,
            so it can sit inside the double-quoted strings of the queries
    '''
    return text.replace('"', '').replace("'", '')

In [76]:
def print_cypher_query_create_entry_nodes(data):
    '''
    Print one MERGE query per entry that creates its :Entry node.

    Input: list of entry dicts ('title', 'authors', 'editors', 'abstract')
    Output: None; the queries are printed one per line

    Quotes are removed from all text, missing values become 'null', and an
    entry whose variable name was already used is skipped.
    '''
    used_variables = set()
    entry_node_queries = []

    for entry in data:
        title = 'null' if not entry['title'] else replace_quotes(entry['title'])
        cypher_variable = subject_to_cypher_variable(title)
        if cypher_variable in used_variables:
            continue
        used_variables.add(cypher_variable)

        authors = 'null'
        if entry['authors']:
            authors = [replace_quotes(author) for author in entry['authors']]
        editors = 'null'
        if entry['editors']:
            editors = [replace_quotes(editor) for editor in entry['editors']]
        abstract = 'null'
        if entry['abstract']:
            abstract = replace_quotes(entry['abstract'])

        entry_node_queries.append(
            f"MERGE ({cypher_variable}:Entry {{name: \"{title}\", authors: \"{authors}\", editors: \"{editors}\", abstract: \"{abstract}\"}})"
        )

    # Nothing is printed (not even a blank line) when there are no queries.
    if entry_node_queries:
        print('\n'.join(entry_node_queries))

In [77]:
def print_cypher_query_create_subject_nodes(all_subjects):
    '''
    Print one MERGE query per subject that creates its :Subject node.

    Input: iterable of subject names
    Output: None; the queries are printed one per line

    A subject whose variable name was already used is skipped.
    '''
    used_variables = set()
    subject_node_queries = []

    for subject in all_subjects:
        cypher_variable = subject_to_cypher_variable(subject)
        if cypher_variable in used_variables:
            continue
        used_variables.add(cypher_variable)
        subject_node_queries.append(f"MERGE ({cypher_variable}:Subject {{name: \"{subject}\"}})")

    # Nothing is printed (not even a blank line) when there are no queries.
    if subject_node_queries:
        print('\n'.join(subject_node_queries))

In [78]:
def create_subject_entry_pairs(data):
    '''
    Input: list of entry dicts with 'title' and 'subjects'
    Output: list of (entry title, subject) tuples, one for every non-empty
            subject of every entry, in entry order
    '''
    return [
        (entry['title'], subject)
        for entry in data
        for subject in entry['subjects']
        if subject
    ]

In [79]:
def print_cypher_query_create_subject_relationships(subject_entry_pairs, data):
    '''
    Print the queries that link every entry to its subjects.

    Input: list of (entry title, subject) tuples and the list of entry dicts
    Output: None; prints a MATCH line per distinct subject variable, a MATCH
            line per distinct entry variable, then one HAS_SUBJECT MERGE line
            per pair

    Entry titles are normalised exactly as in
    print_cypher_query_create_entry_nodes (quotes removed, 'null' for an empty
    title) both in the MATCH lines and in the relationship lines, so the
    relationships always refer to variables that were matched above.
    '''
    subject_node_queries = []
    all_subjects = extract_all_unique_subjects(data)
    seen_subject_cypher_variables = set()
    for subject in all_subjects:
        cypher_variable = subject_to_cypher_variable(subject)
        if cypher_variable not in seen_subject_cypher_variables:
            query = f"MATCH ({cypher_variable}:Subject {{name: \"{subject}\"}})"
            subject_node_queries.append(query)
            seen_subject_cypher_variables.add(cypher_variable)
    for query in subject_node_queries:
        print(query)

    entry_node_queries = []
    seen_entry_cypher_variables = set()
    for entry in data:
        # Same fallback as at node creation; an empty title no longer crashes.
        title = replace_quotes(entry['title']) if entry['title'] else 'null'
        cypher_variable = subject_to_cypher_variable(title)
        if cypher_variable not in seen_entry_cypher_variables:
            query = f"MATCH ({cypher_variable}:Entry {{name: \"{title}\"}})"
            entry_node_queries.append(query)
            seen_entry_cypher_variables.add(cypher_variable)
    for query in entry_node_queries:
        print(query)

    subject_entry_relationship_queries = []
    # In each pair the first element is the entry title, the second the subject.
    for descendant, ancestor in subject_entry_pairs:
        # The pairs carry raw titles; strip quotes first so the variable equals
        # the one bound in the entry MATCH line (e.g. "Don't" -> dont, not don_t).
        entry_title = replace_quotes(descendant) if descendant else 'null'
        descendant_cypher_variable = subject_to_cypher_variable(entry_title)
        ancestor_cypher_variable = subject_to_cypher_variable(ancestor)
        query = f"MERGE ({ancestor_cypher_variable})<-[:HAS_SUBJECT]-({descendant_cypher_variable})"
        subject_entry_relationship_queries.append(query)
    for query in subject_entry_relationship_queries:
        print(query)

In [80]:
def create_all_cypher_queries(data):
    '''
    Print every query for the graph: subject nodes first, then entry nodes,
    then the entry-to-subject relationships.

    Input: list of entry dicts
    Output: None; all queries are printed
    '''
    print_cypher_query_create_subject_nodes(extract_all_unique_subjects(data))
    print_cypher_query_create_entry_nodes(data)
    print_cypher_query_create_subject_relationships(create_subject_entry_pairs(data), data)

## Relational Database

In [81]:
# Build the (entry title, subject) pairs and save them as a two-column CSV.
# NOTE: `data` must still be the list of entry dicts here; a later cell rebinds
# `data` to a torch_geometric Data object, after which this cell no longer works.
subject_entry_pairs = create_subject_entry_pairs(data)
df_relationships = pd.DataFrame(subject_entry_pairs, columns=['entry', 'subject'])
df_relationships.to_csv('~/thesis/data/relationships.csv', index=False)

In [82]:
def create_relationship_csv(subject, output_dir='~/thesis/data'):
    '''
    Download all records for a subject and save their (entry title, subject)
    pairs as a CSV with the columns 'entry' and 'subject'.

    Input: URL-encoded subject and the directory to write into
    Output: None; writes <output_dir>/<subject>_relationships.csv
    '''
    entries = get_all(subject)
    data = create_data_structure(entries)
    subject_entry_pairs = create_subject_entry_pairs(data)
    df_relationships = pd.DataFrame(subject_entry_pairs, columns=['entry', 'subject'])
    # With the default output_dir this is the same path the function always used.
    df_relationships.to_csv(f'{output_dir}/{subject}_relationships.csv', index=False)

In [83]:
# Re-download the records for the subject and write its relationships CSV.
subject = "positive%20psychology"
create_relationship_csv(subject)

## Graph Machine Learning

### Edge Creation

In [84]:
# Every distinct name (entry title or subject) becomes one graph node; its
# position in flattened_pairs is its node index.
flattened_pairs = list(set(name for pair in create_subject_entry_pairs(data) for name in pair))
name2index = dict(zip(flattened_pairs, range(len(flattened_pairs))))

In [85]:
# Translate each (entry title, subject) pair into a pair of node indices.
subject_entry_pairs = create_subject_entry_pairs(data)
edges = [(name2index[entry], name2index[subject]) for entry, subject in subject_entry_pairs]

In [86]:
# Peek at the first few (entry index, subject index) edges.
edges[:5]

[(698, 53), (698, 419), (698, 232), (698, 825), (698, 919)]

In [87]:
# Distinct subject names; used further down to tell subject nodes from entry nodes.
all_subjects = extract_all_unique_subjects(data)

In [88]:
# Node names in name2index order with quotes removed; this is the text the
# tf–idf vectorizer is fitted on.
all_text = [replace_quotes(key) for key, _ in name2index.items()]
all_text[:5]

['AIDS (Disease) in women',
 'Coping and the Challenge of Resilience',
 'Positive psychology in business ethics and corporate responsibility',
 'New Trends in Psychobiography',
 'Hope']

### tf–idf

In [89]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [90]:
# Calculate tf–idf for all words: one row per node name, one column per token
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(all_text)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back only on versions that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
dense = vectors.todense()
dense_list = dense.tolist()
df_tfidf = pd.DataFrame(dense_list, columns=feature_names)



In [91]:
# Preview the tf–idf matrix (rows follow all_text, columns are the vectorizer's tokens).
df_tfidf.head()

Unnamed: 0,01,02,05,08,100,12,13,14,19,1973,...,ללמוד,למנהיגים,משפיעים,פסיכולוגיה,שבמנהיגות,שינוי,ḥadashim,ḥayim,ḥiyuvit,ṭrigerim
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [92]:
# Display the complete name -> node index mapping (long output).
name2index

{'AIDS (Disease) in women': 0,
 'Coping and the Challenge of Resilience': 1,
 'Positive psychology in business ethics and corporate responsibility': 2,
 'New Trends in Psychobiography': 3,
 'Hope': 4,
 'Health Care Economics and Organizations': 5,
 'diagnosis': 6,
 'language teaching.': 7,
 'Séropositifs': 8,
 'Attitude (Psychology)': 9,
 'Personal Satisfaction': 10,
 'The journal of positive psychology': 11,
 'Public Policy': 12,
 'Experiential research': 13,
 'Influence': 14,
 'Optimal Learning Environments to Promote Student Engagement': 15,
 'A positive view of LGBTQ embracing identity and cultivating well-being': 16,
 'HIV-positive gay men': 17,
 'Cabala': 18,
 'Exito': 19,
 'Reproductive health': 20,
 'Computers and Society': 21,
 'Geographic Locations': 22,
 '01 Maindfulnes ṿeha-mahapekhah ha-sheḳeṭah be-ḥinukh': 23,
 'Medicine': 24,
 'Attitudes': 25,
 'Santé mentale': 26,
 'Positiv psykologi': 27,
 'HIV infections': 28,
 'Respiratory Tract Infections': 29,
 'Legal staus, l

### Model Creation

In [93]:
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch_geometric.transforms as T
import umap.umap_ as umap
from sklearn.linear_model import LogisticRegression
from torch_cluster import random_walk
from torch_geometric.datasets import Planetoid
from torch_geometric.loader import NeighborSampler as RawNeighborSampler
from torch_geometric.nn import SAGEConv

  from .autonotebook import tqdm as notebook_tqdm


In [94]:
from torch_geometric.data import Data

# Transpose the list of (entry index, subject index) pairs into a 2 x num_edges
# long tensor: first row entry indices, second row subject indices.
edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
edge_index

tensor([[ 698,  698,  698,  ..., 1021, 1021, 1021],
        [  53,  419,  232,  ...,   28, 1286,   86]])

In [95]:
# Confirm the edge tensor has two rows and one column per edge.
edge_index.shape

torch.Size([2, 3598])

In [96]:
# Use the tf–idf rows as float node features; row i belongs to the node with index i.
tfidf_tensor = torch.tensor(df_tfidf.to_numpy(), dtype=torch.float)
x = tfidf_tensor

In [97]:
# Node labels in name2index order: 0 for names that are subjects, 1 for all
# other names (entry titles). Membership is tested against a set so that each
# lookup does not scan the whole subject list.
subject_names = set(all_subjects)
y = torch.tensor([0 if name in subject_names else 1 for name in name2index.keys()], dtype=torch.int)

In [98]:
# Split the nodes by index: the first 800 train, the next 200 validate, the
# rest test. Building each mask by comparison keeps all three exactly
# len(name2index) long, even when there are fewer than 1000 nodes.
NUM_TRAIN_NODES = 800
NUM_VAL_NODES = 200
train_mask = [index < NUM_TRAIN_NODES for index in range(len(name2index))]
val_mask = [NUM_TRAIN_NODES <= index < NUM_TRAIN_NODES + NUM_VAL_NODES for index in range(len(name2index))]
test_mask = [index >= NUM_TRAIN_NODES + NUM_VAL_NODES for index in range(len(name2index))]

In [99]:
# Bundle features, labels, edges and split masks into one graph object.
# NOTE: this rebinds `data`, which until now was the list of entry dicts; cells
# above that read `data` cannot be re-run after this one without rebuilding it.
data = Data(x=x, y=y, edge_index=edge_index, train_mask=train_mask, val_mask=val_mask, test_mask=test_mask)

In [100]:
class NeighborSampler(RawNeighborSampler):
    """Sampler that extends each batch with one positive and one negative node per batch node."""

    def sample(self, batch):
        """Append sampled positives and negatives to `batch`, then defer to the parent sampler."""
        anchors = torch.tensor(batch)
        source, target, _ = self.adj_t.coo()

        # Positive example: the node reached by a one-step random walk from
        # each anchor, i.e. a direct neighbour.
        walks = random_walk(source, target, anchors, walk_length=1, coalesced=False)
        positives = walks[:, 1]

        # Negative example: a node index drawn at random for each anchor.
        upper_bound = self.adj_t.size(1)
        negatives = torch.randint(0, upper_bound, (anchors.numel(),), dtype=torch.long)

        # Layout of the extended batch: anchors, then positives, then negatives.
        extended_batch = torch.cat([anchors, positives, negatives], dim=0)
        return super().sample(extended_batch)

In [101]:
# Mini-batch loader over the graph's edges, yielding shuffled batches of 256 nodes.
# Presumably sizes=[10, 10] caps the sampled neighbours at 10 per node for each
# of two hops -- confirm against the torch_geometric NeighborSampler docs.
train_loader = NeighborSampler(
    data.edge_index,
    sizes=[10, 10],
    batch_size=256,
    shuffle=True,
    num_nodes=data.num_nodes,
)

In [102]:
class SAGE(nn.Module):
    """Stack of SAGEConv layers with ReLU and dropout between consecutive layers."""

    def __init__(self, in_channels, hidden_channels, num_layers):
        super().__init__()
        self.num_layers = num_layers
        self.convs = nn.ModuleList()

        # Only the first layer reads the raw features; every later layer reads
        # the hidden representation.
        for layer in range(num_layers):
            layer_in = hidden_channels if layer else in_channels
            self.convs.append(SAGEConv(layer_in, hidden_channels))

    def _between_layers(self, x, layer):
        """Apply ReLU and dropout (p=0.5) unless `layer` is the last one."""
        if layer == self.num_layers - 1:
            return x
        x = x.relu()
        return F.dropout(x, p=0.5, training=self.training)

    def forward(self, x, adjs):
        """Run the layers over sampled `(edge_index, e_id, size)` tuples, one per layer."""
        for layer, (edge_index, _, size) in enumerate(adjs):
            targets = x[: size[1]]  # Target nodes are always placed first.
            x = self.convs[layer]((x, targets), edge_index)
            x = self._between_layers(x, layer)
        return x

    def full_forward(self, x, edge_index):
        """Run the layers over the whole graph without sampling."""
        for layer, conv in enumerate(self.convs):
            x = conv(x, edge_index)
            x = self._between_layers(x, layer)
        return x

In [103]:
# Use the GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Two-layer model producing 64-dimensional node embeddings.
model = SAGE(data.num_node_features, hidden_channels=64, num_layers=2)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
# Move features and edges to the device once; this rebinds the globals `x` and `edge_index`.
x, edge_index = data.x.to(device), data.edge_index.to(device)

In [104]:
def train():
    """Run one epoch over train_loader and return the loss summed over batches divided by data.num_nodes."""
    model.train()

    running_loss = 0
    for _batch_size, node_ids, adjs in train_loader:
        # `adjs` holds a list of `(edge_index, e_id, size)` tuples.
        adjs = [adj.to(device) for adj in adjs]
        optimizer.zero_grad()

        embeddings = model(x[node_ids], adjs)
        # The sampler stacks anchors, positives and negatives in equal thirds.
        anchor, positive, negative = embeddings.split(embeddings.size(0) // 3, dim=0)

        # Push anchor/positive dot products up and anchor/negative ones down.
        positive_score = (anchor * positive).sum(-1)
        negative_score = (anchor * negative).sum(-1)
        loss = -F.logsigmoid(positive_score).mean() - F.logsigmoid(-negative_score).mean()
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * anchor.size(0)

    return running_loss / data.num_nodes


@torch.no_grad()
def test():
    """Fit a logistic regression on the train-mask embeddings and return (validation accuracy, test accuracy)."""
    model.eval()
    embeddings = model.full_forward(x, edge_index).cpu()
    labels = data.y

    classifier = LogisticRegression()
    classifier.fit(embeddings[data.train_mask], labels[data.train_mask])

    def accuracy(mask):
        # Accuracy of the fitted classifier on the nodes selected by `mask`.
        return classifier.score(embeddings[mask], labels[mask])

    return accuracy(data.val_mask), accuracy(data.test_mask)

In [105]:
# Train for 200 epochs; after each one, test() refits a logistic regression on
# the current embeddings to report validation and test accuracy.
for epoch in range(1, 201):
    loss = train()
    val_acc, test_acc = test()
    print(
        f"Epoch: {epoch:03d}, Loss: {loss:.4f}, "
        f"Val: {val_acc:.4f}, Test: {test_acc:.4f}"
    )

Epoch: 001, Loss: 1.3782, Val: 0.9950, Test: 0.9905
Epoch: 002, Loss: 1.3255, Val: 1.0000, Test: 0.9968
Epoch: 003, Loss: 1.2976, Val: 0.9950, Test: 0.9937
Epoch: 004, Loss: 1.2477, Val: 0.9900, Test: 0.9842
Epoch: 005, Loss: 1.1786, Val: 0.9850, Test: 0.9811
Epoch: 006, Loss: 1.1312, Val: 0.9850, Test: 0.9842
Epoch: 007, Loss: 1.1081, Val: 0.9600, Test: 0.9779
Epoch: 008, Loss: 1.1381, Val: 0.9750, Test: 0.9685
Epoch: 009, Loss: 1.0947, Val: 0.9600, Test: 0.9621
Epoch: 010, Loss: 1.1190, Val: 0.9700, Test: 0.9527
Epoch: 011, Loss: 1.0827, Val: 0.9750, Test: 0.9495
Epoch: 012, Loss: 1.0524, Val: 0.9700, Test: 0.9306
Epoch: 013, Loss: 1.0968, Val: 0.9600, Test: 0.9306
Epoch: 014, Loss: 1.0671, Val: 0.9700, Test: 0.9558
Epoch: 015, Loss: 1.0223, Val: 0.9650, Test: 0.9495
Epoch: 016, Loss: 1.0322, Val: 0.9600, Test: 0.9558
Epoch: 017, Loss: 1.0249, Val: 0.9600, Test: 0.9464
Epoch: 018, Loss: 1.0433, Val: 0.9450, Test: 0.9274
Epoch: 019, Loss: 1.0203, Val: 0.9350, Test: 0.9211
Epoch: 020, 

In [106]:
# Compute the embeddings of all nodes with the trained model: evaluation mode,
# no gradient tracking, result moved to the CPU.
with torch.no_grad():
    model.eval()
    out = model.full_forward(x, edge_index).cpu()

In [107]:
# Reduce the node embeddings with UMAP (default settings) for plotting.
# `out` was already moved to the CPU when it was computed, so .cpu() is a no-op here.
embd = umap.UMAP().fit_transform(out.cpu().numpy())

OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


### Visualization

In [108]:
# First two coordinate columns of the UMAP output, plus the node names in
# name2index order (the order of the rows that were embedded).
x_vals, y_vals = embd.T[0], embd.T[1]
labels = name2index.keys()

In [109]:
# Create a DataFrame with one row per node: its x- and y-coordinate in the
# UMAP output and its entry/subject name
df_embeddings_name = pd.DataFrame(list(zip(x_vals, y_vals, labels)), columns=['x', 'y', 'name'])

In [128]:
# Plot the embeddings as a scatter plot; hovering a point shows its entry/subject name
import os  # needed for the images directory below; not imported elsewhere in this notebook

import plotly.express as px

fig = px.scatter(df_embeddings_name, x='x', y='y', hover_name="name", hover_data=["x", "y"])
fig.update_traces(textfont_size=6)

# Settings for the figure's "download image" button.
config = {
  'toImageButtonOptions': {
    'format': 'png', # one of png, svg, jpeg, webp
    'filename': 'custom_image',
    'height': 500,
    'width': 700,
    'scale': 6 # Multiply title/legend/axis/canvas sizes by this factor
  }
}

fig.show(config=config)
fig.write_html('12-19-22_EntryEmbeddings.html')

# Make sure the output directory for static images exists (no error if it already does).
os.makedirs("images", exist_ok=True)

# fig.write_image("images/zoomed.png", scale=6, width=1080, height=1080)