In [17]:
import re
from contextlib import ExitStack
from zipfile import ZipFile

from nltk.tokenize import word_tokenize
from tqdm.notebook import tqdm

In [18]:
# Passed to tqdm as the progress-bar `total` in process_data.
# NOTE(review): presumably the number of lines in each OpenSubtitles en-it
# file — confirm against the archive if the data set is updated.
line_count_total = 35_216_229

In [19]:
def clean_sentence(sentence):
    """Normalise a tokenised sentence for corpus output.

    Each token is lower-cased and stripped of every character that is neither
    a regex word character nor whitespace (i.e. punctuation and symbols).
    Tokens that end up empty are discarded.

    Args:
        sentence: Iterable of string tokens.

    Returns:
        A new list of the surviving normalised tokens, in their original order.
    """
    # Lazily normalise each token; filtering happens in the comprehension below.
    normalised = (re.sub(r'[^\w\s]', '', token.lower()) for token in sentence)
    return [token for token in normalised if token]

In [20]:
def process_data(languages):
    """Tokenise, clean and write out the parallel OpenSubtitles corpus.

    For every language code in ``languages`` the matching member of
    ``en-it.txt.zip`` is opened for reading and ``<lang>_corpus_processed.txt``
    is opened for writing (UTF-8, truncating any existing file). The English
    and Italian members are then read in lockstep: each line is tokenised with
    NLTK, cleaned with ``clean_sentence`` and written as space-separated
    tokens. A pair is written only when both sides still contain at least one
    token, so the two output files stay line-aligned.

    Args:
        languages: Language codes to open. Must include both ``'en'`` and
            ``'it'``, because the processing loop reads exactly those two
            streams.

    Raises:
        KeyError: If ``'en'`` or ``'it'`` is missing from ``languages``.
    """
    # One ExitStack owns every handle. Whatever was successfully opened is
    # closed in reverse order (output files and archive members before the
    # archive itself), even if opening a later file or the processing loop
    # raises, and the original exception is not masked by cleanup code.
    with ExitStack() as stack:
        zf = stack.enter_context(ZipFile('en-it.txt.zip'))

        in_files = {}
        out_files = {}
        for lang in languages:
            in_files[lang] = stack.enter_context(
                zf.open('OpenSubtitles.en-it.' + lang, 'r')
            )
            out_files[lang] = stack.enter_context(
                open(f'{lang}_corpus_processed.txt', 'w', encoding='utf-8')
            )

        # zip() stops at the shorter stream, so any unpaired trailing lines
        # are ignored.
        line_pairs = zip(in_files['en'], in_files['it'])
        for en_sent, it_sent in tqdm(line_pairs, total=line_count_total, desc='Processing lines'):
            # Archive members yield bytes; decode and drop the newline before
            # tokenising.
            tok_en_sent = clean_sentence(word_tokenize(en_sent.decode("utf-8").replace('\n', ''), language='english'))
            tok_it_sent = clean_sentence(word_tokenize(it_sent.decode("utf-8").replace('\n', ''), language='italian'))

            # Skip the pair entirely if either side became empty after cleaning.
            if len(tok_en_sent) > 0 and len(tok_it_sent) > 0:
                out_files['en'].write(' '.join(tok_en_sent) + '\n')
                out_files['it'].write(' '.join(tok_it_sent) + '\n')

In [21]:
# Run the preprocessing pass; writes en_corpus_processed.txt and
# it_corpus_processed.txt to the working directory.
process_data(['en', 'it'])

Processing lines:   0%|          | 0/35216229 [00:00<?, ?it/s]

Resulting line count: 35_188_410