# Requirements before you start
```
conda install lxml
pip install pystemmer
conda install requests
```

# Data

In [None]:
import requests
import gzip
from dataclasses import dataclass
from collections import Counter
from lxml import etree

URL = 'https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-abstract.xml.gz'

# Stream the dump to disk in 1 MiB pieces so the whole archive never has to
# sit in memory; raise_for_status() aborts before writing on an HTTP error.
with requests.get(URL, stream=True) as response:
    response.raise_for_status()
    with open('./enwiki-latest-abstract.xml.gz', 'wb') as archive:
        pieces = response.iter_content(chunk_size=1024*1024)
        for count, piece in enumerate(pieces):
            archive.write(piece)
            if count % 10:
                continue
            # Progress line, rewritten in place every tenth chunk.
            print('Downloaded {} mbs'.format(count), end='\r')
                    
               
@dataclass
class Abstract:
    """One abstract entry: a numeric ID plus title, abstract text and URL."""

    ID: int
    title: str
    abstract: str
    url: str

    @property
    def fulltext(self):
        """Return the title and the abstract joined by a single space."""
        parts = (self.title, self.abstract)
        return ' '.join(parts)

# Parse the downloaded dump into a list of Abstract records.
abstract_result = []
# The filename and the mode must be separate arguments: without the comma the
# two adjacent string literals are concatenated into a single (wrong) path.
with gzip.open('./enwiki-latest-abstract.xml.gz', 'rb') as f:
    doc_id = 1

    # iterparse streams the XML; with events=('end',) and tag='doc' each
    # <doc> element is handed over once it has been parsed completely.
    for _, element in etree.iterparse(f, events=('end',), tag='doc'):
        title = element.findtext('./title')
        url = element.findtext('./url')
        abstract = element.findtext('./abstract')
        abstract_result.append(Abstract(ID=doc_id, title=title, url=url, abstract=abstract))
        doc_id += 1
        # Drop the element's content now that the fields have been copied out.
        element.clear()

## Indexing

In [None]:
import re
import Stemmer
import string

# English stemmer instance used by the text-analysis step.
stemmer = Stemmer.Stemmer('english')
# Words that are filtered out of the token stream. Every entry is lowercase
# because tokens are lowercased before they are compared against this set
# (an uppercase 'I' could never match).
stopwords = {
    'the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have',
    'i', 'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you',
    'do', 'at', 'this', 'but', 'his', 'by', 'from', 'wikipedia',
}
# Character class matching any single ASCII punctuation character.
punctuation = re.compile('[%s]' % re.escape(string.punctuation))

def analyze(text):
    """Turn raw text into a flat list of normalized, stemmed tokens.

    Pipeline: split on whitespace, lowercase, strip punctuation, drop
    stopwords and empty strings, then stem.

    Args:
        text: The string to tokenize.

    Returns:
        A list of stemmed token strings (empty if nothing survives filtering).
    """
    tokens = text.split()
    tokens = [token.lower() for token in tokens]
    tokens = [punctuation.sub('', token) for token in tokens]
    # A token made only of punctuation is '' after stripping; drop it along
    # with the stopwords so it never becomes an index key.
    tokens = [token for token in tokens if token and token not in stopwords]
    # stemWords takes a list and returns a list, so no extra wrapping is needed.
    return stemmer.stemWords(tokens)

class Index:
    def __init__(self):
        self.index = {}
        self.documents = {}
        
    def index_document(self, document):
        """Store a document and record its ID under each of its tokens.

        Args:
            document: An object with an ``ID`` attribute and a ``fulltext``
                attribute holding the text to analyze.
        """
        # Keep the first document seen for a given ID; never overwrite it.
        if document.ID not in self.documents:
            self.documents[document.ID] = document
        # analyze() needs the text to tokenize; calling it with no argument
        # raised a TypeError.
        for token in analyze(document.fulltext):
            self.index.setdefault(token, set()).add(document.ID)