In [9]:
# importing all the necessary libraries
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, wordnet
import pandas as pd
import string


READING THE FIRST 6 ARTICLES FROM THE DATASET 

In [10]:
# reading the data as a DataFrame
df = pd.read_csv("enwiki-20170820.csv", nrows= 90)
df.head()

Unnamed: 0,ARTICLE_ID,SECTION_TEXT
0,0,'' 'Anarchism '' political philosophy advocate...
1,1,'' 'Autism '' neurodevelopmental disorder char...
2,2,Percentage diffusely reflected sunlight relati...
3,3,Writing cursive forms '' '' named plural `` ''...
4,4,'' 'Alabama '' state southeastern region Unite...


In [12]:
drop_columns = ['TITLE','SECTION_TITLE']
df.drop(drop_columns, axis=1, inplace=True)

#the columns have already been dropped 

KeyError: "['TITLE', 'SECTION_TITLE'] not found in axis"

In [None]:
# merged_df = df.groupby('ARTICLE_ID')['SECTION_TEXT'].apply('\n'.join).reset_index()
# print(merged_df)

Preprocessing the Dataset

In [None]:
# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Function to preprocess text
def preprocess_text(text):
    if isinstance(text, str):  # Check if the text is a string
        # Tokenize text
        tokens = word_tokenize(text)
        
        # Remove punctuation, non-alphabetic characters, and single characters
        tokens = [word.lower() for word in tokens if word.isalpha() and len(word) > 1]
        
        # Remove stopwords
        tokens = [word for word in tokens if word not in stop_words]
        
        # Lemmatize English words
        tokens = [lemmatizer.lemmatize(word) if wordnet.synsets(word) else word for word in tokens]
        
        return tokens
    else:
        return []  # Return an empty list if the input is not a string

# Create a dictionary to store words for each article ID
article_words = {}

# Iterate through rows and preprocess text
for index, row in df.iterrows():
    article_id = row['ARTICLE_ID']
    section_text = row['SECTION_TEXT']
    
    # Preprocess text
    cleaned_words = preprocess_text(section_text)
    
    # Add words to the list for the respective article ID
    if article_id in article_words:
        article_words[article_id].extend(cleaned_words)
    else:
        article_words[article_id] = cleaned_words



Making a Vocabulary.txt File and assigning each word a unique ID

In [13]:
# Function to extract unique words and assign IDs after lemmatizing the text
def extract_unique_words(text, language='english'):
    lemmatizer = WordNetLemmatizer()
    words = word_tokenize(text)
    unique_words = set([lemmatizer.lemmatize(word.lower()) for word in words])
    word_id_mapping = {word: str(i) for i, word in enumerate(unique_words)}
    return word_id_mapping

# Apply function to each row in the DataFrame
word_id_mappings = df['SECTION_TEXT'].apply(extract_unique_words)

# Combine word_id_mappings from all rows
combined_word_id_mapping = {}
for word_id_mapping in word_id_mappings:
    combined_word_id_mapping.update(word_id_mapping)

with open('Vocabulary.txt', 'w', encoding='utf-8') as file:
    for word, idx in combined_word_id_mapping.items():
        file.write(f"{word},{idx}\n")