# Create a corpus file from the LA wikipedia dump
In this notebook we'll:
1. Process a wikipedia dump that has been transformed into a series of JSONL files
1. Select text section that have contiguous group of sentences, so as to yield a higher quality embedding later on
1. Tokenize the senteces and words 
1. Format the output into the Gensim `cor` file format for better stream processing

## Note: 
To run this notebook on a fresh install, you should first run the scripts:
* `install_latin_wikipedia_data.sh`
* `preprocess_latin_wikipedia_files`

as described in this folder's README.

In [1]:
import json
import glob
import os
import pathlib
import re
import pickle

## Enumerate the JSONL files

In [2]:
current_dir = pathlib.Path.cwd()
parent_dir = os.path.dirname(current_dir)

json_files = glob.glob(os.path.join (parent_dir, 'data', 'latin_wikipedia', 'jsonl' ,'**') , recursive=True )
print(f"{len(json_files)} candidate JSONL files")

71 candidate JSONL files


## Load a sentence tokenizer from CLTK, and demonstrate its use

In [3]:
with open(os.path.join(os.path.expanduser('~/cltk_data/latin/model/latin_models_cltk/tokenizers/sentence'),'latin_punkt.pickle'), 'rb') as loader:
    sentence_tokenizer = pickle.load(loader)

sentence_tokenizer.tokenize("arma virumque cano. odi et amo. Et tu, Brute?")

['arma virumque cano.', 'odi et amo.', 'Et tu, Brute?']

## Load a Word Tokenizer and demonstrate its use

In [4]:
from cltk.tokenize.word import WordTokenizer
word_tokenizer = WordTokenizer(language='latin')
word_tokenizer.tokenize('Et tu, Brute?')

['Et', 'tu', ',', 'Brute', '?']

## Compile Regex pattern to swallow tag text and parenthetical explanations

In [5]:
gtlt_stripper = re.compile(r'<.*>')
paren_stripper = re.compile(r'\(.*\)')

In [6]:
# Testing/verifcation lines commented out 
#text_data =[]
with open ('wikimedia.la.cor', 'wt')as writer:
    for filename in json_files:
        if os.path.isdir(filename): # skip directories
            continue
        with open (filename, 'rt') as loader:
            for line in loader:
                obj = json.loads(line)
                if 'text' in obj:
                    text = obj['text']                    
                    # skip headings, one line sentence pages
                    if len(sentence_tokenizer.tokenize(text)) > 1: 
                        # drop parenthetical and angle bracket info
                        text = paren_stripper.sub('', gtlt_stripper.sub('', text)).strip()
                        parts = text.split('\n')
                        for part in parts:
                            # skip heading sections, and one line sentence sections
                            if len(sentence_tokenizer.tokenize(part)) > 1: 
                                for sent in sentence_tokenizer.tokenize(part):                                
                                    if sent:
                                        words = word_tokenizer.tokenize(sent)
                                        # Testing/verifcation lines commented out
                                        #text_data.append(sent)
                                        writer.write(' '.join(words))
                                        writer.write('\n')
                                        
# Testing/verifcation lines commented out                                     
# for line in text_data[:5]:                
#     print(line)

In [7]:
! wc -w wikimedia.la.cor
# text_data[0] 

 5874245 wikimedia.la.cor


### Now the Latin wikipedia corpus is reading for pipeline processing.