# Dataset generation

## Import necessary libraries

In [1]:
import re

import pandas as pd

## Define the paths to the files containing mountain names and sentences

In [2]:
# Input CSVs, given relative to the notebook's working directory.
# sentences_csv: file read later for the raw sentences to annotate.
sentences_csv = 'data_for_dataset/sentences.csv'
# mountain_csv: file read later for the list of known mountain names.
mountain_csv = 'data_for_dataset/mountain.csv'

## Load the datasets into DataFrames

In [3]:
# Read both source files into memory; each call returns its own DataFrame.
mountain_df = pd.read_csv(filepath_or_buffer=mountain_csv)
sentences_df = pd.read_csv(filepath_or_buffer=sentences_csv)

## Create a list of mountain names

In [4]:
# Pull the 'Mountain' column out as a plain Python list so it can be
# iterated over when tagging sentences.
mountains_list = mountain_df['Mountain'].to_list()

# Show the first ten names as a quick sanity check.
mountains_list[:10]

['Mount Everest',
 'K2',
 'Kangchenjunga',
 'Lhotse',
 'Makalu',
 'Cho Oyu',
 'Dhaulagiri',
 'Manaslu',
 'Nanga Parbat',
 'Annapurna']

## Define a function that tokenizes a sentence and tags mountain names with BIO labels (`B-MOUNTAIN`, `I-MOUNTAIN`, `O`)

In [5]:
def tokenize_sentences(sentence, mountains_list):
    """Split a sentence on whitespace and BIO-tag every known mountain name.

    Parameters
    ----------
    sentence : str
        The raw sentence. It is tokenized with ``str.split()``, so punctuation
        stays attached to its neighbouring word (e.g. ``"K2,"``).
    mountains_list : iterable
        Candidate mountain names. Non-string entries (such as NaN from a
        DataFrame column) and blank strings are ignored.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(words, tags)`` of equal length. The first token of a mountain
        mention is tagged ``'B-MOUNTAIN'``, every following token of the same
        mention ``'I-MOUNTAIN'``, and everything else ``'O'``.

    Notes
    -----
    * Every occurrence of a name is tagged, not only the first one.
    * A name only matches when it is not embedded in a larger word, so
      ``"K2"`` is not found inside ``"K22"``; adjacent punctuation such as
      ``"(Everest)"`` or ``"K2,"`` is fine.
    * Longer names are matched first and already-tagged tokens are never
      re-tagged, so ``"Kilimanjaro"`` cannot overwrite part of
      ``"Mount Kilimanjaro"``.
    """
    words = sentence.split()
    tags = ["O"] * len(words)

    # Record the character span of each token so that a regex match in the
    # raw sentence can be mapped back to token indices exactly. Between the
    # end of one token and the start of the next there is only whitespace,
    # so index() from the cursor always lands on the intended token.
    token_spans = []
    cursor = 0
    for word in words:
        start = sentence.index(word, cursor)
        cursor = start + len(word)
        token_spans.append((start, cursor))

    # De-duplicate while keeping list order, drop unusable entries, and try
    # the longest names first so the most specific mention wins.
    candidates = sorted(
        dict.fromkeys(
            name for name in mountains_list
            if isinstance(name, str) and name.split()
        ),
        key=len,
        reverse=True,
    )

    for mount_name in candidates:
        # Allow any run of whitespace between the parts of a multi-word name
        # and require that the match is not glued to a word character.
        name_body = r"\s+".join(re.escape(part) for part in mount_name.split())
        pattern = r"(?<!\w)" + name_body + r"(?!\w)"

        for match in re.finditer(pattern, sentence):
            # Tokens whose span overlaps the matched character range.
            covered = [
                idx for idx, (start, end) in enumerate(token_spans)
                if start < match.end() and end > match.start()
            ]
            # Skip if any of these tokens already belongs to another mention.
            if not covered or any(tags[idx] != "O" for idx in covered):
                continue

            tags[covered[0]] = 'B-MOUNTAIN'
            for idx in covered[1:]:
                tags[idx] = 'I-MOUNTAIN'

    return words, tags


## Apply the tokenize_sentences function to each sentence in the DataFrame

In [6]:
# Tokenize every sentence exactly once. Building plain Python lists and
# assigning them as columns avoids constructing a pd.Series for each row,
# which is what made the apply(lambda ...: pd.Series(...)) form slow.
tokenized_pairs = [
    tokenize_sentences(text, mountains_list)
    for text in sentences_df['sentence']
]

# Wrap in index-aligned object Series so each cell holds one list.
sentences_df['tokens'] = pd.Series(
    [tokens for tokens, _ in tokenized_pairs],
    index=sentences_df.index,
    dtype=object,
)
sentences_df['tags'] = pd.Series(
    [tags for _, tags in tokenized_pairs],
    index=sentences_df.index,
    dtype=object,
)


## Define the output path for the tokenized sentences dataset and save the DataFrame to a CSV

In [7]:
# Destination for the full frame (original columns plus 'tokens' and 'tags').
sentence_tokenize_dataset = 'data_for_dataset/sentences_tokenizer.csv'

# Write without the row index, encoded as UTF-8.
sentences_df.to_csv(
    path_or_buf=sentence_tokenize_dataset,
    encoding='utf-8',
    index=False,
)


## Extract tokens and tags into a new DataFrame.

In [8]:
# Keep only the two columns that make up the NER dataset.
final_df = sentences_df.loc[:, ['tokens', 'tags']]

## Save the new DataFrame as a CSV file.

In [9]:
# Destination for the tokens/tags-only dataset.
ner_dataset = 'data_for_dataset/ner_dataset.csv'

# Write without the row index, encoded as UTF-8.
final_df.to_csv(
    path_or_buf=ner_dataset,
    encoding='utf-8',
    index=False,
)

## Preview the first 10 rows of the dataset

In [10]:

# Display the first ten token/tag rows as the cell's output.
final_df.head(n=10)

Unnamed: 0,tokens,tags
0,"[Mount, Everest, is, the, highest, mountain, i...","[B-MOUNTAIN, I-MOUNTAIN, O, O, O, O, O, O, O]"
1,"[Kilimanjaro, offers, breathtaking, views, of,...","[B-MOUNTAIN, O, O, O, O, O]"
2,"[The, sunrise, view, from, Mount, Kilimanjaro,...","[O, O, O, O, B-MOUNTAIN, I-MOUNTAIN, O, O]"
3,"[Mount, Fuji, is, an, iconic, symbol, of, Japan.]","[B-MOUNTAIN, I-MOUNTAIN, O, O, O, O, O, O]"
4,"[Many, climbers, dream, of, scaling, Denali, i...","[O, O, O, O, O, B-MOUNTAIN, O, O]"
5,"[K2,, also, known, as, Mount, Godwin-Austen,, ...","[B-MOUNTAIN, O, O, O, O, O, O, O, O, O, O, O, O]"
6,"[Aoraki, /, Mount, Cook, is, the, tallest, mou...","[B-MOUNTAIN, I-MOUNTAIN, I-MOUNTAIN, I-MOUNTAI..."
7,"[Mount, Elbrus, is, the, highest, peak, in, Eu...","[B-MOUNTAIN, I-MOUNTAIN, O, O, O, O, O, O]"
8,"[Climbing, Vinson, in, Antarctica, is, a, rare...","[O, B-MOUNTAIN, O, O, O, O, O, O]"
9,"[Mount, Rainier, towers, over, the, skyline, o...","[B-MOUNTAIN, I-MOUNTAIN, O, O, O, O, O, O]"
