# 6. Topic extraction from NER

## Setup

In [1]:
# Execute the project's __init__.py through the IPython %run magic.
# NOTE(review): `os`, `logging`, `logger` and the NOTEBOOK_*_RESULTS_DIR
# constants used in later cells are never imported in this notebook, so they
# presumably come from this script -- confirm.
%run __init__.py

# Set the logger level to INFO.
logger.setLevel(logging.INFO)

In [2]:
from bokeh.io import output_notebook

# Configure Bokeh so that subsequent show() calls render inside the notebook.
output_notebook()



In [3]:
import pandas as pd

# Path of the pickled publications DataFrame inside NOTEBOOK_2_RESULTS_DIR.
PMC_FILE_PATH = os.path.join(NOTEBOOK_2_RESULTS_DIR, 'pmc_dataframe.pkl')

# NOTE(review): unpickling can execute arbitrary code; only load files that
# this project produced itself.
pmc_df = pd.read_pickle(PMC_FILE_PATH)
# Array holding the 'text_cleaned' column, one entry per DataFrame row.
publications = pmc_df['text_cleaned'].values

## Loading the NER model

In [4]:
from src.utils import load_object

# Load the NER system object from 'ner_system.pkl' in NOTEBOOK_4_RESULTS_DIR.
ner = load_object(os.path.join(NOTEBOOK_4_RESULTS_DIR, 'ner_system.pkl'))

## Trying out the system

In [5]:
# Use the last publication as the sample text for the walkthrough cells below.
text = publications[-1]

### Entity linking

In [6]:
import en_core_sci_lg
from collections import Counter

# NOTE(review): `nlp` (and the `Counter` import above) is not referenced in any
# later cell of this notebook -- confirm it is still needed before paying the
# cost of loading the model.
nlp = en_core_sci_lg.load()
# `ner.transform` is given a list of texts; here a single-element list.
entities = ner.transform([text])
# Preview the first ten entities extracted from the sample text.
entities[0][:10]

['fungi',
 'tree',
 'stored',
 'soil',
 'ectomycorrhizal',
 'trees',
 'implanted',
 'orchards',
 'seedlings',
 'years']

In [7]:
from src.entity_linking import WikidataEntityLinker

linker = WikidataEntityLinker()
# Link the extracted entities; per the cell output recorded below, each linked
# entity is an (entity text, Wikidata entity URL) pair.
linked_entities = linker.fit_transform(entities)
# Preview the first five linked entities of the sample document.
linked_entities[0][:5]

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




[('fungi', 'http://www.wikidata.org/entity/Q764'),
 ('tree', 'http://www.wikidata.org/entity/Q10884'),
 ('stored', 'http://www.wikidata.org/entity/Q28266969'),
 ('soil', 'http://www.wikidata.org/entity/Q36133'),
 ('ectomycorrhizal', 'http://www.wikidata.org/entity/Q28274559')]

### Building the graph

In [None]:
from src.graph import WikidataGraphBuilder

# Graph builder configured with max_hops=2.
# NOTE(review): presumably max_hops bounds how far the Wikidata graph is
# expanded from each linked entity -- confirm in src.graph.
graph_builder = WikidataGraphBuilder(max_hops=2)
# Build the graph from the sample document's linked entities only.
entity_graph = graph_builder.build_graph(linked_entities[0])

INFO:src.graph:Started building graph.


In [None]:
from bokeh.io import show
from bokeh.layouts import gridplot

from src.graph import build_graph_plot

# Plot the entity graph built for the sample document and display it.
# The title contains no placeholders, so it is a plain string, not an f-string.
plot = build_graph_plot(entity_graph, "Linked entities graph")
show(plot)

In [None]:
from src.graph import get_largest_connected_subgraph

# Reduce the entity graph with get_largest_connected_subgraph; the result is
# the graph the centrality algorithms below are run on.
connected_entity_subgraph = get_largest_connected_subgraph(entity_graph)

# Plot the reduced graph and display it.
# The title contains no placeholders, so it is a plain string, not an f-string.
plot = build_graph_plot(connected_entity_subgraph, "Linked entities graph")
show(plot)

In [None]:
import networkx.algorithms as nxa

from src.graph import get_centrality_algorithm_results

def try_centrality_algorithms(g, algorithms, stop_uris, top_n=9):
    """Print the topics obtained on ``g`` with each centrality algorithm.

    Args:
        g: Graph handed unchanged to ``get_centrality_algorithm_results``.
        algorithms: Iterable of ``(algorithm, name)`` pairs. ``algorithm`` is
            forwarded to ``get_centrality_algorithm_results``; ``name`` is the
            label printed for it.
        stop_uris: Forwarded to ``get_centrality_algorithm_results``
            (presumably the URIs to leave out of the topics -- confirm in
            ``src.graph``).
        top_n: Forwarded to ``get_centrality_algorithm_results``
            (presumably the number of topics to keep -- confirm).

    Returns:
        None. The results are only printed, one section per algorithm.
    """
    for algorithm, name in algorithms:
        print(f'Algorithm: {name}')
        result = get_centrality_algorithm_results(g, algorithm, stop_uris, top_n)
        # No placeholders in this label, so a plain string is used.
        print("Topics:", result)
        print()
        
# (centrality function, display name) pairs to compare on the sample graph.
algorithms = [
    (nxa.centrality.information_centrality, "Information centrality"),
    (nxa.centrality.eigenvector_centrality_numpy, "Eigenvector centrality"),
    (nxa.centrality.closeness_centrality, "Closeness centrality"),
    (nxa.centrality.betweenness_centrality, "Betweenness centrality"),
    (nxa.centrality.load_centrality, "Load centrality")
]

# NOTE(review): these look like Wikidata item ids (Q...) of nodes that should
# not be reported as topics -- confirm what each id stands for.
# The same list is reused when configuring the topic extractor further down.
stop_uris = ['Q4167836', 'Q11862829', 'Q13442814',
             'Q17339814', 'Q24017414', 'Q4671286',
             'Q47154513']
# Print the topics found by every algorithm (top_n left at its default of 9).
try_centrality_algorithms(connected_entity_subgraph,
                          algorithms,
                          stop_uris)

## Setting up the pipeline

In [None]:
from sklearn.pipeline import Pipeline

from src.topic_extraction import TopicLabeller


# Topic labeller that reuses the graph builder and stop URIs defined above,
# with closeness centrality as the algorithm and 7 labels per topic.
topic_extractor = TopicLabeller(graph_builder, nxa.centrality.closeness_centrality,
                                num_labels_per_topic=7, stop_uris=stop_uris)
# Chain the three stages in the same order as the manual walkthrough:
# NER -> entity linking -> topic extraction.
topic_pipe = Pipeline([('ner', ner),
                       ('entity_linker', linker),
                       ('topic_extractor', topic_extractor)])

### Obtaining the topics

In [None]:
# Run the whole pipeline on every publication (not just the sample text).
results = topic_pipe.fit_transform(publications)
# Preview the results of the first five publications.
results[:5]

### Saving the results

In [None]:
# Name of the DataFrame column that receives the pipeline output.
NEW_COL_NAME = 'topics_from_ner'

# `publications` was taken from pmc_df['text_cleaned'], so `results` is
# assigned back onto the same DataFrame as a new column.
pmc_df[NEW_COL_NAME] = results
pmc_df.head()

In [None]:
# Keep only the 'id' and 'title' columns plus the new topics column for export.
results_df = pmc_df[['id', 'title', NEW_COL_NAME]]
results_df.head()

In [None]:
# File name of the exported CSV, written into NOTEBOOK_6_RESULTS_DIR.
OUTPUT_FILE_NAME = "pmc_df_with_ner_topics.csv"

# NOTE(review): to_csv is called with its defaults, so the DataFrame index is
# written as an extra first column -- confirm downstream readers expect that.
results_df.to_csv(os.path.join(NOTEBOOK_6_RESULTS_DIR, OUTPUT_FILE_NAME))