## Install the required packages
# %pip (rather than !pip) installs into the environment of the running kernel,
# so the packages are importable from this notebook.
%pip install nltk

%pip install datasets

%pip install ipywidgets

In [2]:
import os, re, json

import pandas as pd

from datetime import datetime

## Download and extract the Wikipedia dump, then create a JSON file from it
- wget https://dumps.wikimedia.org/ptwiki/latest/ptwiki-latest-pages-articles.xml.bz2
- python -m WikiExtractor data/ptwiki-latest-pages-articles.xml.bz2 --json

In [3]:
# start_time = datetime.now()

# !wget https://dumps.wikimedia.org/ptwiki/latest/ptwiki-latest-pages-articles.xml.bz2 -O data/wiki.bz2
# !python -m WikiExtractor data/wiki.bz2 --json -o data/

# print('Duration: {}'.format(datetime.now() - start_time))

In [4]:
# Root folder for every file this notebook reads or writes.
DATA_DIR = 'data/'

# Create the folder on the first run; nothing happens when it is already there.
os.makedirs(DATA_DIR, mode=0o777, exist_ok=True)

In [14]:
# Punctuation list
# NOTE(review): `punctuations` is not referenced by clean_text; it is kept in
# case another cell relies on it.
punctuations = re.escape('!"#%\'()*+,./:;<=>?@[\\]^_`{|}~')

# ##### #
# Regex #
# ##### #
# One innermost `{...}` pair that does not span a line break. The previous
# greedy form (`\{.*\}`) ran from the first `{` to the last `}` on a line and
# so deleted ordinary text sitting between two separate brace groups.
# clean_text applies this pattern repeatedly to peel nested pairs.
re_remove_brackets = re.compile(r'\{[^{}\n]*\}')
# Tag-like span: '<', optional '/' or '\', at least one character, '>'.
re_remove_html = re.compile(r'<(\/|\\)?.+?>', re.UNICODE)
# Any single digit.
re_transform_numbers = re.compile(r'\d', re.UNICODE)
# A run of non-space characters containing '@'.
re_transform_emails = re.compile(r'[^\s]+@[^\s]+', re.UNICODE)
# 'http://' or 'https://' followed by non-space characters.
re_transform_url = re.compile(r'(http|https)://[^\s]+', re.UNICODE)
# Different quotes are used.
# Single-quote variant at the start of the text or right after a non-word char.
re_quotes_1 = re.compile(r"(?u)(^|\W)[‘’′`']", re.UNICODE)
# Single-quote variant right before a non-word char or the end of the text.
re_quotes_2 = re.compile(r"(?u)[‘’`′'](\W|$)", re.UNICODE)
# Remaining curly quotes, primes and backticks.
re_quotes_3 = re.compile(r'(?u)[‘’`′“”]', re.UNICODE)
# Exactly two consecutive dots.
re_dots = re.compile(r'(?<!\.)\.\.(?!\.)', re.UNICODE)
# Not applied by clean_text; kept so the name stays defined.
re_punctuation = re.compile(r'([,";:]){2},', re.UNICODE)
# ' -' glued to a following letter. No longer applied by clean_text (see there).
re_hiphen = re.compile(r' -(?=[^\W\d_])', re.UNICODE)
# Horizontal ellipsis character.
re_tree_dots = re.compile(u'…', re.UNICODE)
# En dash.
re_changehyphen = re.compile(u'–')
# Doubled straight quotes. No longer applied by clean_text (see there).
re_doublequotes_1 = re.compile(r'(\"\")')
re_doublequotes_2 = re.compile(r'(\'\')')
# Run of plain spaces.
re_trim = re.compile(r' +', re.UNICODE)


def _remove_braced_spans(text):
    """Delete balanced single-line `{...}` spans, innermost pair first."""
    removed = 1
    while removed:
        text, removed = re_remove_brackets.subn('', text)
    return text


def clean_text(text):
    """Normalise one document into a single line of space-separated words.

    Steps, in order: drop ellipses, balanced `{...}` spans and tag-like
    `<...>` spans; replace URLs with ``URL`` and e-mail-like tokens with
    ``EMAIL``; strip every character that is neither a word character nor
    whitespace; collapse all whitespace (including line breaks) to single
    spaces; drop isolated ASCII letters and all digits.

    Args:
        text: The raw document as a ``str``.

    Returns:
        The cleaned text with no leading or trailing whitespace.
    """
    # Lower-casing is currently disabled:
    # text = text.lower()
    text = text.replace('\xa0', ' ')
    text = re_tree_dots.sub('...', text)
    # Raw string: '\.' in a plain literal is an invalid escape sequence.
    text = re.sub(r'\.\.\.', '', text)
    text = _remove_braced_spans(text)
    text = re_changehyphen.sub('-', text)
    text = re_remove_html.sub(' ', text)
    # Digits become '0' here (covers non-ASCII digits too) and are removed
    # further down, after the URL / e-mail placeholders are in place.
    text = re_transform_numbers.sub('0', text)
    text = re_transform_url.sub('URL', text)
    text = re_transform_emails.sub('EMAIL', text)
    text = re_quotes_1.sub(r'\1"', text)
    text = re_quotes_2.sub(r'"\1', text)
    text = re_quotes_3.sub('"', text)
    text = re.sub('"', '', text)
    text = re.sub(r'\n+', '\n', text)
    text = re_dots.sub('.', text)
    # From here on the text holds only word characters and whitespace, so the
    # former hyphen, doubled-quote and parenthesis steps could never match
    # and have been removed.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)  # Removing multiple spaces
    # Single character removal.
    # NOTE(review): this also drops one-letter words such as "a", "e" and "o"
    # when they stand between spaces - confirm that is wanted for this corpus.
    text = re.sub(r"\s+[a-zA-Z]\s+", ' ', text)
    text = re.sub(r'[0-9]', '', text)
    # Removing digits can leave two spaces side by side.
    text = re_trim.sub(' ', text)
    return text.strip()


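# Before building the pre-training dataset, we should make sure the corpus has the following format:

# each line is a sentence
# a blank line separates two documents
# NOTE(review): clean_text collapses every line break into a space and save_txt
# writes one newline per document, so the file produced here has one whole
# document per line and no blank separator lines - confirm this matches the
# format expected downstream.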


def save_txt(text, file):
    """Append ``text`` followed by a newline to the file at path ``file``.

    The file is created when it does not exist. It is written as UTF-8 so the
    result does not depend on the platform's default encoding, and the handle
    is closed even if the write fails.

    Args:
        text: The line to append, as a ``str``.
        file: Path of the file to append to.
    """
    with open(file, 'a', encoding='utf-8') as f:
        f.write(text + '\n')
    

def save_full(dataframe, fraction, base_dir,  filename, txt=True, csv=False,
              random_state=None):
    """Sample rows, clean their text and write the result to disk.

    Args:
        dataframe: DataFrame with at least the columns ``id`` and ``text``.
        fraction: Fraction of rows to sample (passed to ``DataFrame.sample``
            as ``frac``); ``1`` keeps every row in shuffled order.
        base_dir: Folder under which the ``csv`` / ``txt`` sub-folders are
            created. A trailing slash is optional.
        filename: Output file name without extension.
        txt: When true, write ``<base_dir>/txt/<filename>.txt`` with one
            cleaned text per line, replacing any existing file.
        csv: When true, write ``<base_dir>/csv/<filename>.csv`` with ``|`` as
            separator and no index column.
        random_state: Optional seed forwarded to ``DataFrame.sample`` to make
            the sampling reproducible. ``None`` keeps the previous behaviour.

    Returns:
        The sampled DataFrame (columns ``id`` and ``text``) with cleaned text.
    """
    df_temp = dataframe[['id', 'text']].sample(frac=fraction, random_state=random_state)
    df_temp['text'] = df_temp['text'].apply(clean_text)

    if csv:
        output = os.path.join(base_dir, 'csv')
        os.makedirs(output, mode=0o777, exist_ok=True)

        df_temp.to_csv(os.path.join(output, filename + '.csv'), sep='|', index=False)

    if txt:
        output = os.path.join(base_dir, 'txt')
        os.makedirs(output, mode=0o777, exist_ok=True)

        txt_path = os.path.join(output, filename + '.txt')

        # Mode 'w' replaces any previous file. A single handle is used for
        # every row instead of reopening the file once per document.
        with open(txt_path, 'w', encoding='utf-8') as handle:
            handle.writelines(line + '\n' for line in df_temp['text'])

    return df_temp

In [6]:
# The dump is read as UTF-8, one JSON object per line. Lines are parsed as they
# are streamed from the file, so the raw strings are never all held in memory
# next to the parsed objects; blank lines are skipped instead of failing.
with open(DATA_DIR + 'wiki.json', encoding='utf-8') as json_file:
    # this step may take at least 8-10 minutes of processing for 4-5 million rows. It converts every line into an actual json object.
    data = [json.loads(line) for line in json_file if line.strip()]

In [7]:
# Build the article table: discard the 'url' and 'title' columns and store the
# ids as numbers, then print a summary of the frame.
df_wiki = pd.DataFrame(data).drop(columns=['url', 'title'])
df_wiki = df_wiki.assign(id=pd.to_numeric(df_wiki['id']))
df_wiki.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1058219 entries, 0 to 1058218
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   id      1058219 non-null  int64 
 1   text    1058219 non-null  object
dtypes: int64(1), object(1)
memory usage: 16.1+ MB


In [21]:
# Clean every article (fraction=1) and write the text file under DATA_DIR,
# reporting how long the run took.
start_time = datetime.now()

df = save_full(
    dataframe=df_wiki,
    fraction=1,
    base_dir=DATA_DIR,
    filename='text',
)

print('Duration: {}'.format(datetime.now() - start_time))

Duration: 0:09:56.725202
