# Setup

In [84]:
import os,re,time
from nltk import word_tokenize,sent_tokenize
from nltk.stem import PorterStemmer,LancasterStemmer

"""please uncomment and run these if your machine does not have needed nltk data"""
# nltk.download('punkt') 
# nltk.download('stopwords')

'please uncomment and run these if your machine does not have needed nltk data'

### Basic info retrieval about the corpus

In [8]:
# total document files in our corpus
len(os.listdir('english-corpora'))

8635

In [15]:
# all of them are .txt files: check the file-name suffix, not just that
# '.txt' occurs somewhere in the name (e.g. 'a.txt.bak' must not count)
all(t.endswith('.txt') for t in os.listdir('english-corpora'))

True

In [16]:
os.listdir('english-corpora')[:5]

['D00521.txt', 'P_1709.txt', 'D00920.txt', 'T00143.txt', 'P03064.txt']

In [58]:
# keep the corpus directory listing (entry names as returned by os.listdir)
english_corpora_files=os.listdir('english-corpora')

### Tasks to do

- [x] TODO: how to remove HTML info from these pages
- [x] TODO: how to use porter's stemmer
- [x] TODO: how to do tokenization
- [x] TODO: how to clean text files: remove non-ASCII text, and what else? When removing non-ASCII text, should I drop just the offending character or the whole word?
- [x] TODO: don't remove stopwords; at least for tf-idf it does not matter; for binary and BM25??

In [85]:
# directory name of the test corpus
test_corpora_dir='test-corpora'

In [98]:
def create_folder(folder_name):
    '''
    create folder with name given

    Input: folder name / path; missing parent folders are created as well

    Output: none; does nothing when the folder already exists.
    Raises FileExistsError when the path exists but is not a folder.
    '''
    # exist_ok=True makes repeated calls a no-op for an existing folder
    os.makedirs(folder_name,exist_ok=True)

def corpora_processor(corpora_files_dir,processed_corpora_dir,stemmer=None):
    '''
    Clean, tokenize and stem every file of a corpus folder.

    Input:
        corpora_files_dir: folder holding the raw corpus files
        processed_corpora_dir: folder the processed files are written to
            (created through create_folder)
        stemmer: optional object with a stem(token) method, e.g.
            LancasterStemmer(); a PorterStemmer() is used when omitted

    Output: none; for each input file a processed file with the same name is
        written to processed_corpora_dir. It holds the stem of every
        ASCII-only token, each stem followed by a single space.

    Files are opened with the platform default text encoding.
    '''
    create_folder(processed_corpora_dir)

    if stemmer is None:
        stemmer=PorterStemmer()

    # the patterns are identical for every file, so compile them only once
    # css code lines: any char followed by 'mw', up to the last '}' on the line
    css_regex=re.compile(r'.mw.*}')
    # html tag lines: from the first '<' to the last '>' on the line
    html_regex=re.compile(r'<.*>')

    for file in os.listdir(corpora_files_dir):
        source_path=os.path.join(corpora_files_dir,file)
        # os.listdir also returns sub-folders, which open() cannot read
        if not os.path.isfile(source_path):
            continue

        with open(source_path,'r') as f:
            corpora_file_str=f.read() # text str

        # substitute both regex expressions by ''
        corpora_file_str=css_regex.sub('',corpora_file_str)
        corpora_file_str=html_regex.sub('',corpora_file_str)

        # here each doc could be processed line by line; whole doc str is used
        tokens=word_tokenize(corpora_file_str)

        # stem the tokens, skipping non-ascii ones as suggested in the question;
        # the trailing ' ' after each stem separates the processed tokens
        processed_corpora_file_str=''.join(
            stemmer.stem(token)+' ' for token in tokens if token.isascii()
        )

        # store processed file with same name
        with open(os.path.join(processed_corpora_dir,file),'w') as output_file:
            output_file.write(processed_corpora_file_str)

- Processing the whole corpus takes too long: around 1600 sec, or roughly 25 min.
    - One possible solution is to stem the file string directly, without tokenizing first: stem the whole string at once rather than token by token, then tokenize and store the result. This could save about half the time.
    - NOTE: the above solution cannot be implemented with the Porter stemmer, since it stems token-wise, i.e. word by word.

In [99]:
# testing processor on test-corpora of 100 files
corpora_processor(test_corpora_dir,'processed-test-corpora')

---

In [107]:
# tokenization of a file without using word_tokenize from nltk:
# newlines become spaces, tabs are dropped, then split on single spaces
with open('test-corpora/C00001.txt','r') as doc:
    file=doc.read()

regex_exp3=re.compile(r'\n') # to sub \n by ' '
regex_exp4=re.compile(r'\t') # to sub \t by ''
clean_text=regex_exp4.sub('',regex_exp3.sub(' ',file))
tokens=clean_text.split(' ')
print(len(tokens))

10640


---
---

In [4]:
def get_unique_word_freq(word_list):
    '''
    takes word list and returns unique-words with their freq in dict

    Input: any iterable of hashable words (list, tuple, generator, ...)

    Output: dict mapping each unique word to the number of times it occurs;
        keys are in order of first occurrence. Empty input gives {}.
    '''
    words_freq={}
    # single pass: dict lookups replace the list membership test and the
    # per-word list.count scan, which together were quadratic in input size
    for word in word_list:
        words_freq[word]=words_freq.get(word,0)+1

    return words_freq

In [37]:
# implement pre-processing using nested dicts rather than linked lists:
# DICT maps word -> {doc id: freq of word in that doc}
doc1='from pyIR.utils.cache import Cache from pyIR.utils.collections import TweakedCounter from pyIR.utils.inverted_index import InvertedIndex'
doc2='from collections import Counter from math import log from typing import TweakedCounter TweakedCounter'
doc3='math from TweakedCounter TweakedCounter math import Cache'
words_global=list() # to store words from all docs
words_freq_global_dict={} # to store word:freq from all docs
indexed_files={} # doc id -> doc name
DICT={}
for doc_id,doc in enumerate([doc1,doc2,doc3],start=1):
    indexed_files[doc_id]=f'doc{doc_id}'
    tokens=doc.split(' ')
    doc_word_freq=get_unique_word_freq(tokens)
    for word,freq in doc_word_freq.items():
        # setdefault creates the inner dict the first time a word is seen
        DICT.setdefault(word,{})[doc_id]=freq

In [40]:
DICT

{'from': {1: 3, 2: 3, 3: 1},
 'pyIR.utils.cache': {1: 1},
 'import': {1: 3, 2: 3, 3: 1},
 'Cache': {1: 1, 3: 1},
 'pyIR.utils.collections': {1: 1},
 'TweakedCounter': {1: 1, 2: 2, 3: 2},
 'pyIR.utils.inverted_index': {1: 1},
 'InvertedIndex': {1: 1},
 'collections': {2: 1},
 'Counter': {2: 1},
 'math': {2: 1, 3: 2},
 'log': {2: 1},
 'typing': {2: 1}}

In [39]:
indexed_files


{1: 'doc1', 2: 'doc2', 3: 'doc3'}

In [34]:
# rough work: check that nested dicts can be extended in place
b={'d':{'c':3}}
print(b)
b['d'].update({'b':2}) # add a key to an existing inner dict
b['f']={'g':1} # add a new inner dict under a new key
print(b.get('d'))
print(b)

{'d': {'c': 3}}
{'c': 3, 'b': 2}
{'d': {'c': 3, 'b': 2}, 'f': {'g': 1}}
