In [None]:
!pip install pandas
!pip install spacy

In [None]:
import pandas as pd

## Load dataset with sentences and mountain names

In [None]:
# The input file has no header row and uses ';' as the field separator,
# so the column names are supplied explicitly, in positional order.
columns_names = ['sentence', 'mountain_name']
sentences_df = pd.read_csv(
    'mountains_sentences.txt',
    sep=';',
    header=None,
    names=columns_names,
)

In [None]:
# Preview the first rows to confirm both columns were parsed as expected.
sentences_df.head()

Unnamed: 0,sentence,mountain_name
0,The Weißkugel is one of the highest peaks in t...,Weißkugel
1,"During the summer, the stunning views from the...",Weißkugel
2,Many hikers consider the ascent to Weißkugel a...,Weißkugel
3,Sikjangsan is renowned for its beautiful hikin...,Sikjangsan
4,The summit of Sikjangsan offers a panoramic vi...,Sikjangsan


In [None]:
# Build the list of distinct mountain names used for labelling.
# - dropna() keeps missing values from turning into the literal name "nan".
# - Stripping happens BEFORE unique() so "Foo" and " Foo " collapse into one entry.
# - Empty names are removed; they can never match a token.
# The order of first appearance in the file is preserved.
mountains = (
    sentences_df["mountain_name"]
    .dropna()
    .astype(str)
    .str.strip()
    .loc[lambda names: names != ""]
    .unique()
    .tolist()
)

## Determine the part of speech of each word and assign mountain labels

In [None]:
import spacy

In [None]:
# Load the spaCy pipeline "en_core_web_sm"; it is used below to tokenize each
# sentence and to read the part-of-speech tag of every token.
nlp = spacy.load("en_core_web_sm")

In [None]:
def _build_mountain_index(mountain_names, tokenizer):
    """Group tokenized mountain names by their (lower-cased) first token.

    Args:
        mountain_names: iterable of mountain name strings.
        tokenizer: callable turning a string into an iterable of objects with a
            ``.text`` attribute. The same tokenizer as for the sentences is used
            so that names and sentences are split identically.

    Returns:
        dict mapping the first lower-cased token of a name to a list of
        lower-cased token tuples, longest first, so that the longest name
        wins when several names share a prefix.
    """
    by_first_token = {}
    for name in mountain_names:
        name_tokens = tuple(piece.text.lower() for piece in tokenizer(name))
        if not name_tokens:  # empty / whitespace-only name: nothing to match
            continue
        patterns = by_first_token.setdefault(name_tokens[0], [])
        if name_tokens not in patterns:
            patterns.append(name_tokens)
    for patterns in by_first_token.values():
        patterns.sort(key=len, reverse=True)
    return by_first_token


def _bio_labels(words, mountain_index):
    """Return one BIO label per word ("B-MOUNT", "I-MOUNT" or "O").

    A mountain is labelled only when ALL of its tokens appear contiguously
    (case-insensitively) starting at the current position. Every occurrence
    in the sentence is labelled, not just the first one.
    """
    lowered = [word.lower() for word in words]
    labels = ["O"] * len(words)
    position = 0
    while position < len(words):
        for pattern in mountain_index.get(lowered[position], ()):
            end = position + len(pattern)
            # A slice running past the end is shorter than the pattern and
            # therefore cannot compare equal, so no bounds check is needed.
            if tuple(lowered[position:end]) == pattern:
                labels[position] = "B-MOUNT"  # Begin mountain label
                labels[position + 1:end] = ["I-MOUNT"] * (len(pattern) - 1)  # Inner mountain labels
                position = end
                break
        else:
            position += 1
    return labels


# Tokenize the mountain names once, up front, instead of for every token.
mountain_index = _build_mountain_index(mountains, nlp.make_doc)

token_data = []

for index, row in sentences_df.iterrows():
    doc = nlp(row["sentence"])

    sentence_id = index + 1  # 1-based id derived from the DataFrame index
    labels = _bio_labels([token.text for token in doc], mountain_index)

    # Emit every token exactly once, in sentence order, so repeated words or
    # repeated mountain mentions are never dropped from the dataset.
    for token, label in zip(doc, labels):
        token_data.append({
            "sentence_id": sentence_id,
            "word": token.text,
            "POS": token.pos_,
            "label": label
        })

In [None]:
# Pin the column list so the frame (and the CSV written from it) keeps its
# header and column order even when token_data is empty.
tokens_df = pd.DataFrame(token_data, columns=["sentence_id", "word", "POS", "label"])

In [None]:
# Inspect the first 50 token rows to spot-check the assigned POS tags and labels.
tokens_df.head(50)

Unnamed: 0,sentence_id,word,POS,label
0,1,The,DET,O
1,1,Weißkugel,PROPN,B-MOUNT
2,1,is,AUX,O
3,1,one,NUM,O
4,1,of,ADP,O
5,1,the,DET,O
6,1,highest,ADJ,O
7,1,peaks,NOUN,O
8,1,in,ADP,O
9,1,the,DET,O


## Save the labelled dataset for further training

In [None]:
# Write the token-level dataset as UTF-8 CSV, leaving out the DataFrame index column.
tokens_df.to_csv(
    'test_task_NER_dataset.csv',
    index=False,
    encoding='utf-8',
)