# Convert Matrix files to Corpus file format
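The Gensim library encourages developers to stream a corpus from disk whenever possible, rather than holding it all in memory.
A streamed corpus is easy to work with and is more memory-friendly. In this notebook, we'll show how to convert pickled corpus matrix files into a single line-sentence file (one sentence per line, tokens separated by spaces) that can be streamed.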

In [21]:
from gensim.utils import save_as_line_sentence
import pickle
import random
import glob

In [24]:
# glob.glob returns matches in arbitrary, filesystem-dependent order. Sort the
# paths so the pickled files are always processed in the same order, which keeps
# positional lookups into the combined corpus reproducible between runs.
datafiles = sorted(glob.glob('latin_library.corpus*.pkl'))

def load_corpus_matrix(filename):
    """Unpickle the file at ``filename`` and return the object stored in it.

    NOTE: unpickling can execute arbitrary code, so only point this at
    trusted files.
    """
    with open(filename, 'rb') as pickle_file:
        matrix = pickle.load(pickle_file)
    return matrix

# Flatten the contents of every pickled file into one in-memory list.
corpus = [
    sentence
    for filename in datafiles
    for sentence in load_corpus_matrix(filename)
]

print(f'total corpus size: {len(corpus):,}')

# Persist the combined corpus with gensim's line-sentence writer.
save_as_line_sentence(corpus, 'latin_library.preprocessed.cor')

total corpus size: 593,594


## QA the Corpus Data
You should always perform some random inspections of the corpus data, so that you can catch any obvious surprises.

In [25]:
# Spot-check three randomly chosen corpus entries.
random.sample(corpus, k=3)

[['illis',
  'tandem',
  'frustra',
  'circa',
  'nouam',
  'munitionem',
  'laborantibus',
  'insidiae',
  'comitis',
  'surrexerunt',
  'in',
  'equis',
  'uelocissimis',
  'ad',
  'auxilium',
  'sociorum',
  'qui',
  'erant',
  'in',
  'praesidio',
  'contendentes',
  'et',
  'iam',
  'diem',
  'ultimum',
  'metuentes',
  'et',
  'regredi',
  'sursum',
  'ad',
  'portam',
  'praeparantes',
  'uehementi',
  'insecutione',
  'oppresserunt',
  'solum',
  'que',
  'iuuenem',
  'de',
  'nobili',
  'parentela',
  'procreatum',
  'retinuerunt',
  'caeteri',
  'fuga',
  'elapsi',
  'sunt'],
 ['at',
  'qui',
  'formas',
  'nouit',
  'is',
  'naturae',
  'unitatem',
  'in',
  'materiis',
  'dissimillimis',
  'complectitur',
  'itaque',
  'quae',
  'adhuc',
  'facta',
  'non',
  'sunt',
  'qualia',
  'nec',
  'naturae',
  'uicissitudines',
  'neque',
  'experimentales',
  'industriae',
  'neque',
  'casus',
  'ipse',
  'in',
  'actum',
  'unquam',
  'perduxissent',
  'neque',
  'cogitationem',

In [26]:
# Peek at the first five corpus entries, in load order.
corpus[0:5]

[['conflato',
  'atque',
  'ex',
  'que',
  'omnem',
  'classem',
  'arcessit',
  'sagittarios',
  'equites',
  'ab',
  'rege',
  'euocat',
  'tormenta',
  'undique',
  'conquiri',
  'et',
  'frumentum',
  'mitti',
  'auxilia',
  'adduci',
  'iubet'],
 ['interim',
  'munitiones',
  'cotidie',
  'operibus',
  'augentur',
  'atque',
  'omnes',
  'oppidi',
  'partes',
  'quae',
  'minus',
  'esse',
  'firmae',
  'uidentur',
  'testudinibus',
  'ac',
  'musculis',
  'aptantur',
  'ex',
  'aedificiis',
  'autem',
  'per',
  'foramina',
  'in',
  'proxima',
  'aedificia',
  'arietes',
  'immittuntur',
  'quantum',
  'que',
  'aut',
  'ruinis',
  'deicitur',
  'aut',
  'per',
  'uim',
  'recipitur',
  'loci',
  'in',
  'tantum',
  'munitiones',
  'proferuntur'],
 ['nam',
  'incendio',
  'fere',
  'tuta',
  'est',
  'quod',
  'sine',
  'contignatione',
  'ac',
  'materia',
  'sunt',
  'aedificia',
  'et',
  'structuris',
  'ac',
  'fornicibus',
  'continentur',
  'tecta',
  'que',
  'sunt',
  

In [27]:
# Number of items in the third corpus entry (index 2).
len(corpus[2])

24