# 6. Topic extraction from NER

## Setup

In [1]:
%run __init__.py

logger.setLevel(logging.INFO)

In [2]:
from bokeh.io import output_notebook

output_notebook()



In [3]:
import pandas as pd

PMC_FILE_PATH = os.path.join(NOTEBOOK_2_RESULTS_DIR, 'pmc_dataframe.pkl')

pmc_df = pd.read_pickle(PMC_FILE_PATH)
publications = pmc_df['text_cleaned'].values

## Loading the NER model

In [4]:
from src.utils import load_object

ner = load_object(os.path.join(NOTEBOOK_4_RESULTS_DIR, 'ner_system.pkl'))

## Testing the system

In [5]:
text = publications[-1]

### Entity linking

In [6]:
import en_core_sci_lg
from collections import Counter

nlp = en_core_sci_lg.load()
entities = ner.transform([text])
entities[0][:10]

['fungi',
 'tree',
 'stored',
 'soil',
 'ectomycorrhizal',
 'trees',
 'implanted',
 'orchards',
 'seedlings',
 'years']

In [7]:
from src.entity_linking import WikidataEntityLinker

linker = WikidataEntityLinker()
linked_entities = linker.fit_transform(entities)
linked_entities[0][:5]

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




[('fungi', 'http://www.wikidata.org/entity/Q764'),
 ('tree', 'http://www.wikidata.org/entity/Q10884'),
 ('stored', 'http://www.wikidata.org/entity/Q28266969'),
 ('soil', 'http://www.wikidata.org/entity/Q36133'),
 ('ectomycorrhizal', 'http://www.wikidata.org/entity/Q28274559')]

### Building the graph

In [8]:
from src.graph import WikidataGraphBuilder

graph_builder = WikidataGraphBuilder(max_hops=2)
entity_graph = graph_builder.build_graph(linked_entities[0])

INFO:src.graph:Started building graph.
INFO:src.graph:Finished building graph.


In [9]:
from bokeh.io import show
from bokeh.layouts import gridplot

from src.graph import build_graph_plot

plot = build_graph_plot(entity_graph, f"Linked entities graph")
show(plot)

In [10]:
from src.graph import get_largest_connected_subgraph

connected_entity_subgraph = get_largest_connected_subgraph(entity_graph)

plot = build_graph_plot(connected_entity_subgraph, f"Linked entities graph")
show(plot)

In [11]:
import networkx.algorithms as nxa

from src.graph import get_centrality_algorithm_results

def try_centrality_algorithms(g, algorithms, stop_uris, top_n=9):
    for (algorithm, name) in algorithms:
        print(f'Algorithm: {name}')
        result = get_centrality_algorithm_results(g, algorithm, stop_uris, top_n)
        print(f"Topics:", result)
        print()
        
algorithms = [
    (nxa.centrality.information_centrality, "Information centrality"),
    (nxa.centrality.eigenvector_centrality_numpy, "Eigenvector centrality"),
    (nxa.centrality.closeness_centrality, "Closeness centrality"),
    (nxa.centrality.betweenness_centrality, "Betweenness centrality"),
    (nxa.centrality.load_centrality, "Load centrality")
]

stop_uris = ['Q4167836', 'Q11862829', 'Q13442814',
             'Q17339814', 'Q24017414', 'Q47154513']
try_centrality_algorithms(connected_entity_subgraph,
                          algorithms,
                          stop_uris)

Algorithm: Information centrality
Topics: [('biological process', 0.0006184279312145398), ('water', 0.000607503053493451), ('process', 0.0006060572552603755), ('statistics', 0.000601608524298357), ('botany', 0.0005861485410144861), ('phylogenetics', 0.0005850691751670728), ('organism', 0.000583906468627325), ('concept', 0.0005768842560445808), ('plant', 0.0005759072362745063)]

Algorithm: Eigenvector centrality
Topics: [('biological process', 0.39878782467757473), ('water', 0.24682246464938298), ('nucleotides', 0.20336005217187708), ('DNA metabolic process', 0.1713350664807118), ('nucleic acid', 0.17024118051792125), ('polynucleotide', 0.16728749530870124), ('DNA transport', 0.15835986970333613), ('DNA catabolic process', 0.1511883628656641), ('DNA biosynthetic process', 0.14773735136081184)]

Algorithm: Closeness centrality
Topics: [('statistics', 0.19688249400479615), ('specialty', 0.19101907864122847), ('botany', 0.19075278810408922), ('forestry science', 0.1902665121668598), ('taxo

### Obtaining the topics

## Setting up the pipeline

In [12]:
from sklearn.pipeline import Pipeline

from src.topic_extraction import TopicLabeller


topic_extractor = TopicLabeller(graph_builder, nxa.centrality.closeness_centrality,
                                num_labels_per_topic=7, stop_uris=stop_uris)
topic_pipe = Pipeline([('ner', ner),
                       ('entity_linker', linker),
                       ('topic_extractor', topic_extractor)])
results = topic_pipe.fit_transform(publications[:5])

HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

INFO:src.graph:Started building graph.
INFO:src.graph:Started building graph.
INFO:src.graph:Started building graph.
INFO:src.graph:Started building graph.
INFO:src.graph:Started building graph.





INFO:src.graph:Finished building graph.
INFO:src.graph:Finished building graph.
INFO:src.graph:Finished building graph.
INFO:src.graph:Finished building graph.
INFO:src.graph:Finished building graph.


### Obtaining the topics

In [13]:
results

[[('chemistry', 0.19925816023738874),
  ('breastfeeding', 0.19548762736535663),
  ('pharmacology', 0.19438413663337675),
  ('academic major', 0.192572411815314),
  ('sociology', 0.19120159453302962),
  ('chemical element', 0.18974286521616276),
  ('forestry science', 0.18926155580608794)],
 [('specialty', 0.20943683409436833),
  ('academic major', 0.2061423220973783),
  ('Area studies', 0.20500595947556616),
  ('statistics', 0.20178911863909665),
  ('regional studies', 0.20125786163522014),
  ('American studies', 0.20125786163522014),
  ('agriculture', 0.20093457943925233)],
 [('biological process', 0.1954924874791319),
  ('protein', 0.19445366987711724),
  ('process', 0.19374586366644606),
  ('science', 0.1924088070982583),
  ('biopolymer', 0.188749194068343),
  ('research', 0.18835451182242238),
  ('group', 0.18738998239718355)],
 [('Area studies', 0.18492481937121655),
  ('physics', 0.18427709671142245),
  ('regional studies', 0.1825014453651956),
  ('occurrence', 0.1776069017254313

### Saving the results

In [15]:
pmc_df['topics_from_ner'] = results
pmc_df.head(n)

ValueError: Length of values does not match length of index

In [None]:
results_df = pd.
results_df['topics_from_ner'] = results
results_df.