In [1]:
# importing all the necessary libraries
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords, wordnet
import pandas as pd
import string



In [2]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\abbas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

READING THE FIRST 6 ARTICLES FROM THE DATASET 

In [3]:
# reading the data as a DataFrame
df = pd.read_csv("enwiki-20170820.csv", nrows= 90)
df.head()

Unnamed: 0,ARTICLE_ID,SECTION_TEXT
0,0,'' 'Anarchism '' political philosophy advocate...
1,1,'' 'Autism '' neurodevelopmental disorder char...
2,2,Percentage diffusely reflected sunlight relati...
3,3,Writing cursive forms '' '' named plural `` ''...
4,4,'' 'Alabama '' state southeastern region Unite...


In [4]:
# drop_columns = ['TITLE','SECTION_TITLE']
# df.drop(drop_columns, axis=1, inplace=True)

#already dropped


In [5]:
df.isnull().sum()

# Drop rows with missing SECTION_TEXT values
df.dropna(subset=['SECTION_TEXT'], inplace=True)

In [6]:
df.isnull().sum()

ARTICLE_ID      0
SECTION_TEXT    0
dtype: int64

In [7]:
#merging rows with similar article ID
df = df.groupby('ARTICLE_ID')['SECTION_TEXT'].apply('\n'.join).reset_index()
df

Unnamed: 0,ARTICLE_ID,SECTION_TEXT
0,0,'' 'Anarchism '' political philosophy advocate...
1,1,'' 'Autism '' neurodevelopmental disorder char...
2,2,Percentage diffusely reflected sunlight relati...
3,3,Writing cursive forms '' '' named plural `` ''...
4,4,'' 'Alabama '' state southeastern region Unite...
5,5,Achilles Nereid Cymothoe Attic red-figure kant...
6,6,'' 'Abraham Lincoln '' February 12 1809 – Apri...


In [8]:
article_id = 0
text_for_article_id_0 = df.loc[df['ARTICLE_ID'] == article_id, 'SECTION_TEXT'].iloc[0]

print(text_for_article_id_0)

'' 'Anarchism '' political philosophy advocates self-governed societies based voluntary institutions often described stateless societies although several authors defined specifically institutions based non-hierarchical free associations Anarchism holds state undesirable unnecessary harmful anti-statism central anarchism specifically entails opposing authority hierarchical organisation conduct human relations including limited state system Anarchism usually considered extreme left-wing ideology much anarchist economics anarchist legal philosophy reflects anti-authoritarian interpretations communism collectivism syndicalism mutualism participatory economics Anarchism offer fixed body doctrine single particular world view instead fluxing flowing philosophy Many types traditions anarchism exist mutually exclusive Anarchist schools thought differ fundamentally supporting anything extreme individualism complete collectivism Strains anarchism often divided categories social individualist anar

Preprocessing the Dataset

In [9]:
# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


def preprocess_text(text):
    if isinstance(text, str):  # Check if the text is a string
        # Tokenize text
        tokens = word_tokenize(text)
        
        # Remove punctuation, non-alphabetic characters, and single characters
        tokens = [word.lower() for word in tokens if word.isalpha() and len(word) > 1]
        
        # Remove stopwords
        tokens = [word for word in tokens if word not in stop_words]
        
        # Lemmatize English words with POS tagging
        tokens = [lemmatizer.lemmatize(word, pos=get_wordnet_pos(word)) for word in tokens]
        
        return tokens
    else:
        return []  # Return an empty list if the input is not a string

# Function to map POS tags to WordNet POS tags
def get_wordnet_pos(word):
    tag = nltk.pos_tag([word])[0][1][0].upper()  # Get the POS tag
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)  # Default to noun if no match found

df['SECTION_TEXT'] = df['SECTION_TEXT'].apply(preprocess_text)

df['SECTION_TEXT'] = df['SECTION_TEXT'].apply(lambda x: ' '.join(x))

df



Unnamed: 0,ARTICLE_ID,SECTION_TEXT
0,0,political philosophy advocate society base vol...
1,1,neurodevelopmental disorder characterize impai...
2,2,percentage diffusely reflect sunlight relation...
3,3,write cursive form name plural aes first lette...
4,4,state southeastern region united state border ...
5,5,achilles nereid cymothoe attic kantharos volci...
6,6,lincoln february april american politician law...


In [10]:
article_id = 0
text_for_article_id_0 = df.loc[df['ARTICLE_ID'] == article_id, 'SECTION_TEXT'].iloc[0]

print(text_for_article_id_0)

political philosophy advocate society base voluntary institution often described stateless society although several author define specifically institution base free association anarchism hold state undesirable unnecessary harmful central anarchism specifically entail oppose authority hierarchical organisation conduct human relation include limited state system anarchism usually consider extreme ideology much anarchist economics anarchist legal philosophy reflect interpretation communism collectivism syndicalism mutualism participatory economics anarchism offer fix body doctrine single particular world view instead flux flow philosophy many type tradition anarchism exist mutually exclusive anarchist school thought differ fundamentally support anything extreme individualism complete collectivism strain anarchism often divide category social individualist anarchism similar dual classification term anarchism compound word compose word anarchy suffix derive respectively greek anarchy anarch

Making a Vocabulary.txt File and assigning each word a unique ID

In [12]:
# Function to process text and extract unique words
def process_text(text):
    return set(text.split())

# Extract unique words from each section
unique_words = set()
for text in df['SECTION_TEXT']:
    unique_words.update(process_text(text))

# Sort unique words alphabetically
sorted_words = sorted(list(unique_words))

# Create vocabulary.txt file
with open('vocabulary.txt', 'w') as f:
    # Write words and their unique IDs to the file in alphabetical order
    for i, word in enumerate(sorted_words):
        f.write(f"{word} {i}\n")