In [1]:
# Starting points 
import csv
import wikipedia
import spacy
import textacy
import textacy.keyterms
import datetime

wikipedia.set_rate_limiting(True, min_wait=datetime.timedelta(0, 0, 5000000))

filepath = 'articles.csv'
corpus_savepath = 'wikicorpus'

In [2]:
def list_from_csv(filepath):
    csvlist = []
    with open(filepath, newline='\n') as csvfile:
        article_reader = csv.reader(csvfile, dialect='excel')
        for row in article_reader:
            csvlist.append(row)
            
    del csvlist[0]
    return csvlist

In [3]:
articles = list_from_csv(filepath)
# print(articles[:10])

In [4]:
corpus = textacy.Corpus('en')

In [5]:
corpus

Corpus(0 docs; 0 tokens)

In [6]:
def batch_load_texts(corpus, text_list, batch_size=5):
    iterable = text_list[:]
    length = len(iterable)
    batches = length // batch_size
    
    for i in range(batches):
        min_slice = i * (batch_size + 1)
        max_slice = min_slice + batch_size
        
        for article in iterable[min_slice: max_slice]:
            try:
                wikipage = wikipedia.WikipediaPage(article)
                corpus.add_text(wikipage.content, metadata = {"name": article})
                print(corpus)
                print(article)
            except:
                continue
        
        '''
        batch_size = 5
        batch 0:      min = 0,   max = 5    
        batch 1:      min = 6,   max = 11   min = i + batch_size,        max = 2 * batch_size + 1
        batch 2:      min = 12,  max = 17   min = i * (batch_size + 1)
        
        batch_size = 10
        batch 0:      min = 0,   max = 10
        batch 1:      min = 11,  max = 21
        batch 2:      min = 22,  max = 32
        
        batch_size = n
        batch i:        min = i,             max = min + n
        batch i + 1:    min = i * (n + 1)    max = min + n
        batch i + 2:
        '''

In [7]:
batch_load_texts(corpus, articles[:10])
corpus.save(corpus_savepath)

Corpus(1 docs; 923 tokens)
['Outline of mathematics']
Corpus(2 docs; 7181 tokens)
['Mathematics']
Corpus(3 docs; 7181 tokens)
['Portal:Contents/Mathematics and logic']
Corpus(4 docs; 8035 tokens)
['Legendre moment']
Corpus(5 docs; 8247 tokens)
['User:SwathiSreeLuke7']
Corpus(6 docs; 13338 tokens)
['Data-driven control system']
Corpus(7 docs; 15453 tokens)
['Pure mathematics']
Corpus(8 docs; 15654 tokens)
['Physical mathematics']
Corpus(9 docs; 17506 tokens)
['Mathematics Subject Classification']


In [9]:
for doc in corpus:
    kt = textacy.keyterms.sgrank(doc, ngrams=(1,2,3), window_width=100, n_keyterms=5)
    print(doc)
    print(kt)
    print('\n')

Doc(923 tokens; "Mathematics is a field of study that investigat...")
[('number', 0.06908142214698697), ('science', 0.0596557577621979), ('theory', 0.054275944527853205), ('discrete mathematic', 0.05337183932328035), ('mathematical', 0.048935627621927776)]


Doc(6258 tokens; "Mathematics (from Greek μάθημα máthēma, "knowle...")
[('mathematical', 0.09112864559046843), ('mathematic', 0.06741666897680332), ('number theory', 0.06190268140297412), ('study', 0.059156191397581714), ('mathematician', 0.04346600543849723)]


Doc(0 tokens; "")
[]


Doc(854 tokens; "In mathematics, Legendre moments are a type of ...")
[('legendre moment', 0.18767484102084847), ('legendre polynomial', 0.17157093699326562), ('intensity function f(x', 0.06745579327176407), ('moment calculation complexity', 0.05361521846829209), ('image moment', 0.03757423091453614)]


Doc(212 tokens; "Jobin Geoge [ aka JGM ] is an Indian Math magic...")
[('indian math magician', 0.12379554539024905), ('jgm', 0.09101772454563008), ('