In [3]:
import gdown
import pandas as pd
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
nltk.download('stopwords')
import re
from tqdm import tqdm

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Download a file from Google Drive by its file ID using the gdown command-line tool
# (presumably the parquet shard that the next cell reads -- TODO confirm).
!gdown --id 1BhQwXvj4BjEXyTEk7m63HHiaGX8aqQ-Q

In [4]:
# Read the parquet file into a DataFrame, report its row count, and preview the first rows.
df = pd.read_parquet("/content/train-00000-of-00041.parquet")
print("Total Number of wikipedia page data obtained : ",len(df))
df.head()

Total Number of wikipedia page data obtained :  157529


Unnamed: 0,id,url,title,text
0,12,https://en.wikipedia.org/wiki/Anarchism,Anarchism,Anarchism is a political philosophy and moveme...
1,25,https://en.wikipedia.org/wiki/Autism,Autism,Autism is a neurodevelopmental disorder charac...
2,39,https://en.wikipedia.org/wiki/Albedo,Albedo,Albedo (; ) is the measure of the diffuse refl...
3,290,https://en.wikipedia.org/wiki/A,A,"A, or a, is the first letter and the first vow..."
4,303,https://en.wikipedia.org/wiki/Alabama,Alabama,Alabama () is a state in the Southeastern regi...


In [5]:
def decontracted(text):
    '''Expand common English contractions in text into their long forms.

    Both the ASCII apostrophe (') and the typographic apostrophe (U+2019)
    are recognised. The irregular forms "won't" and "can't" are handled
    first, keeping the case of their first letter, so that "Won't" becomes
    "Will not" instead of falling through to the generic "n't" rule.

    Note that every "'s" is rewritten to " is", so possessives such as
    "John's" are expanded too.

    Parameters:
        text: the string to expand.

    Returns:
        The string with the recognised contractions expanded.
    '''
    # Character class matching either apostrophe variant.
    apos = "['\u2019]"

    # Irregular contractions must run before the generic "n't" rule below.
    text = re.sub(r"([Ww])on" + apos + "t", r"\1ill not", text)
    text = re.sub(r"([Cc])an" + apos + "t", r"\1an not", text)

    # Generic suffix rules; the order matches the original implementation.
    text = re.sub("n" + apos + "t", " not", text)
    text = re.sub(apos + "re", " are", text)
    text = re.sub(apos + "s", " is", text)
    text = re.sub(apos + "d", " would", text)
    text = re.sub(apos + "ll", " will", text)
    text = re.sub(apos + "t", " not", text)
    text = re.sub(apos + "ve", " have", text)
    text = re.sub(apos + "m", " am", text)
    return text

def remove_special_chars(text):
  '''Replace each run of characters other than ASCII letters and digits with one space, then lowercase the result.'''
  # Substitution happens before lowercasing; the order is kept on purpose.
  non_alnum_run = re.compile('[^A-Za-z0-9]+')
  return non_alnum_run.sub(' ', text).lower()

# Shared English stemmer and stop-word list used by the helper functions in this cell.
stemmer = SnowballStemmer('english')
stopWords = stopwords.words('english')

# Take the negation words out of the stop-word list so that
# remove_stopwords leaves them in the text.
for negation in ('not', 'no', 'nor'):
  stopWords.remove(negation)


def remove_stopwords(text):
  '''Return text with every whitespace-separated word found in stopWords dropped; the kept words are joined by single spaces.'''
  kept = []
  for token in text.split():
    if token in stopWords:
      continue
    kept.append(token)
  return ' '.join(kept)


def stemming(text):
  '''Stem each whitespace-separated word of text with the module-level stemmer and join the stems with single spaces.'''
  stems = map(stemmer.stem, text.split())
  return ' '.join(stems)

def clean_text(text):
  '''Expand contractions, replace special characters with spaces, lowercase, and trim surrounding whitespace.

  Stop-word removal (remove_stopwords) and stemming (stemming) are not applied here.
  '''
  return remove_special_chars(decontracted(text)).strip()

In [6]:
def get_sentences(wiki_text,minLen=5,maxLen=15):
  '''Split wiki_text into sentences, clean each one, and keep those of moderate length.

  A cleaned sentence is kept only when its whitespace-separated word count
  is strictly greater than minLen and strictly less than maxLen.

  Returns:
      A list of the cleaned sentences that pass the length filter, in document order.
  '''
  kept = []
  for raw_sentence in sent_tokenize(wiki_text):
    cleaned = clean_text(raw_sentence)
    if minLen < len(cleaned.split()) < maxLen:
      kept.append(cleaned)
  return kept

In [7]:
# Collect one (title, sentence) pair for every sentence that get_sentences keeps.
data = []

# Walk the two needed columns directly instead of df.iterrows(), which
# constructs a pandas Series for every row just to read two fields.
for title, text in tqdm(zip(df['title'], df['text']), total=df.shape[0]):
  data.extend((title, s) for s in get_sentences(text))

df_sentences = pd.DataFrame(data,columns=["Title","Text"])
df_sentences.head()

100%|██████████| 157529/157529 [13:42<00:00, 191.56it/s]


Unnamed: 0,Title,Text
0,Anarchism,with the rise of organised hierarchical bodies...
1,Anarchism,various anarchist schools of thought formed du...
2,Anarchism,criticism of anarchism include claims that it ...
3,Anarchism,the suffix ism denotes the ideological current...
4,Anarchism,anarchism is broadly used to describe the anti...


In [8]:
# Report how many cleaned sentences were collected in df_sentences.
print("Number of sentences after cleaning all the wikipedia pages data : ",len(df_sentences))

Number of sentences after cleaning all the wikipedia pages data :  3209719


In [9]:
# Write the page index (url + title) and the cleaned sentences to CSV files.
# The column subset goes into a new name instead of rebinding df, so df keeps
# its 'text' column and the sentence-building cell can still be re-run afterwards.
df_pages = df[["url","title"]]
df_pages.to_csv("WikipediaData.csv",index=False)
df_sentences.to_csv("Cleaned_sentences.csv",index=False)

In [11]:
#!mv /content/WikipediaData.csv "/content/drive/MyDrive/CS521 Project"
#!mv /content/Cleaned_sentences.csv "/content/drive/MyDrive/CS521 Project"