In [12]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import os
from wikipediaapi import Wikipedia

class NLTKStanfordNER:
    """
    Named Entity Recognizer using NLTK and Stanford NER.

    Tokens are tagged with the Stanford NER tagger (through NLTK), runs of
    consecutive tokens sharing the same tag are grouped into entity mentions,
    and every mention is linked to a Wikipedia page when one exists.

    @param stanford_ner_path: Path to the Stanford NER tool directory.
    @param jar_path: Path to the Stanford NER JAR file.
    @param model_path: Path to the Stanford NER model file.
    """

    # Tag carried by tokens that are not part of any named entity.
    OUTSIDE_TAG = 'O'

    def __init__(self, stanford_ner_path, jar_path, model_path):
        """
        Initialize the Named Entity Recognizer.

        @param stanford_ner_path: Path to the Stanford NER tool directory.
        @param jar_path: Path to the Stanford NER JAR file.
        @param model_path: Path to the Stanford NER model file.
        """
        self.stanford_ner_path = stanford_ner_path
        self.jar_path = jar_path
        self.model_path = model_path
        self.stanford_ner = nltk.tag.StanfordNERTagger(model_filename=model_path, path_to_jar=jar_path)
        self.wikipedia = Wikipedia(user_agent='WDPS 13')

    def process_text(self, text, current_entity="", stemmed=True):
        """
        Tokenize, remove stop words, and apply stemming to the input text.

        Tokens that are English stop words (compared case-insensitively) or
        that are a single character long are discarded.

        @param text: Input text to be processed.
        @param current_entity: The current entity being processed. Currently
            unused; kept so existing callers keep working.
        @param stemmed: Flag indicating whether to apply stemming. Default is True.
        @return: List of processed words.
        """
        words = word_tokenize(text)
        stop_words = set(stopwords.words('english'))

        kept_words = [
            word for word in words
            if word.lower() not in stop_words and len(word) > 1
        ]
        if not stemmed:
            return kept_words

        stemmer = PorterStemmer()
        return [stemmer.stem(word) for word in kept_words]

    def entity_extraction(self, text):
        """
        Extract named entities from the input text.

        @param text: Input text containing named entities.
        @return: List of unique entities with information like name, type, and Wikipedia page URL.
        """
        words = word_tokenize(text)
        tagged_tokens = self.stanford_ner.tag(words)
        return self._link_entities(self._group_entities(tagged_tokens))

    @staticmethod
    def _group_entities(tagged_tokens):
        """
        Merge consecutive tokens that share a tag into entity mentions.

        A mention ends when the tag changes, so two adjacent entities of
        different types are kept apart, and a mention that runs up to the
        last token is still emitted.

        @param tagged_tokens: Iterable of (word, tag) pairs.
        @return: List of (entity name, entity tag) tuples in order of appearance.
        """
        outside = NLTKStanfordNER.OUTSIDE_TAG
        mentions = []
        current_words = []
        current_tag = outside

        for word, tag in tagged_tokens:
            if tag != current_tag:
                # Tag boundary: close the mention collected so far, if any.
                if current_words:
                    mentions.append((" ".join(current_words), current_tag))
                current_words = []
                current_tag = tag
            if tag != outside:
                current_words.append(word)

        # Flush a mention that ends on the very last token.
        if current_words:
            mentions.append((" ".join(current_words), current_tag))

        return mentions

    def _link_entities(self, mentions):
        """
        Resolve entity mentions to Wikipedia pages, one entry per page URL.

        Mentions whose page could not be fetched or does not exist are
        skipped. When several mentions resolve to the same URL, the first
        one wins.

        @param mentions: Iterable of (entity name, entity tag) tuples.
        @return: List of dicts with the keys 'name', 'type' and 'wikipedia_page'.
        """
        entities_by_url = {}
        for name, entity_type in mentions:
            page = self.get_wikipedia_page(name)
            # get_wikipedia_page returns None when the lookup failed.
            if page is None or not page.exists():
                continue
            url = page.fullurl
            if url not in entities_by_url:
                entities_by_url[url] = {
                    'name': name,
                    'type': entity_type,
                    'wikipedia_page': url
                }
        return list(entities_by_url.values())

    def get_wikipedia_page(self, entity):
        """
        Fetch the Wikipedia page for a given entity.

        Lookup is best-effort: any error is reported on stdout and turned
        into a None result instead of being propagated.

        @param entity: The entity for which to fetch the Wikipedia page.
        @return: Wikipedia page object or None if an error occurs.
        """
        try:
            return self.wikipedia.page(entity)
        except Exception as e:
            print(f"Error fetching Wikipedia page for {entity}: {e}")
            return None


# Demo: point the recognizer at a local Stanford NER install and list the
# Wikipedia-linked entities found in a short paragraph.
stanford_ner_path = '/Users/ranjan/Downloads/stanford-ner-2020-11-17'
jar_path = os.path.join(
    stanford_ner_path,
    'stanford-ner.jar',
)
model_path = os.path.join(
    stanford_ner_path,
    'classifiers/english.all.3class.distsim.crf.ser.gz',
)

nltk_stanford_ner = NLTKStanfordNER(
    stanford_ner_path=stanford_ner_path,
    jar_path=jar_path,
    model_path=model_path,
)

# Swap in any text to analyse.
sample_text = "Yes, Managua is the capital city of Nicaragua. It is located in the southwestern part of the country and is home to many important government buildings and institutions, including the President's office and the National Assembly. The city has a population of over one million people and is known for its vibrant cultural scene, historic landmarks, and beautiful natural surroundings."

entities = nltk_stanford_ner.entity_extraction(sample_text)
# One line per entity: name and Wikipedia URL separated by a tab.
for entity in entities:
    print(entity['name'], entity['wikipedia_page'], sep="\t")

Managua	https://en.wikipedia.org/wiki/Managua
Nicaragua	https://en.wikipedia.org/wiki/Nicaragua
National Assembly	https://en.wikipedia.org/wiki/National_Assembly
