In [1]:
import re
from bs4 import BeautifulSoup

### Split warc file to docs

In [2]:
# ref : https://stackoverflow.com/questions/60269904/split-text-file-after-specific-line-in-python
SECTION_START = re.compile(r'<!DOCTYPE html')
SECTION_END = re.compile(r'</html>')

def split_docs_iter(stream):
    def inner(stream):
        # Yields each line until an end marker is found (or EOF)
        for line in stream:
            if line and not SECTION_END.match(line):
                yield line
                continue
            break

    # Find a start marker, then break off into a nested iterator
    for line in stream:
        if line:
            if SECTION_START.match(line):
                yield inner(stream)
            continue
        break

In [3]:
filename = "03.warc"

# split docs
with open(filename, 'r', encoding="ISO-8859-1") as fh_in:
    for (i, nested_iter) in enumerate(split_docs_iter(fh_in)):
        with open('./docs/docID_{:05d}'.format(i), 'w', encoding='UTF-8') as fh_out:
            for line in nested_iter:
                fh_out.write(line)

### Parse html and get text in <body> tag 

In [9]:
from os import listdir
from os.path import isfile, join

DOCS_PATH = 'docs'
PROCESSED_DOCS_PATH = 'processed_docs'

files = [f for f in listdir(DOCS_PATH) if isfile(join(DOCS_PATH, f))]

for file in files:
    try:
        with open(f"{DOCS_PATH}/{file}", 'r', encoding="ISO-8859-1") as f:
            soup = BeautifulSoup(f, "html.parser")

            # get text in <body>
            body_text = soup.find('body').getText()
            # Remove newline characters, Home\nHi -> Home Hi
            concatenated_body_text = " ".join(body_text.split())
            # Case folding, A -> a, additional character -> ""
            processed_concatenated_body_text = re.sub(r"[^A-Za-z0-9]+", ' ', concatenated_body_text).lower()

            with open(f"{PROCESSED_DOCS_PATH}/{file}_processed", mode="w", encoding="utf-8", errors='strict', buffering=1) as f1:
                f1.write(processed_concatenated_body_text)
    # skip the docs which not have <body>
    except (OSError, AttributeError) as e:
        print(f"{file} don't have body")


docID_01864 don't have body
docID_01865 don't have body
docID_01866 don't have body
docID_01867 don't have body
docID_01868 don't have body
docID_01869 don't have body
docID_01870 don't have body
docID_01871 don't have body
docID_01872 don't have body
docID_02492 don't have body
docID_05721 don't have body
docID_05722 don't have body
docID_05723 don't have body
docID_05724 don't have body
docID_05725 don't have body
docID_05726 don't have body
docID_05727 don't have body
docID_05728 don't have body
docID_05729 don't have body
docID_05792 don't have body
docID_05842 don't have body
docID_10104 don't have body


In [15]:
import nltk
from nltk.corpus import stopwords
from collections import Counter

# download stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\T160\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [21]:
# initialize nltk tokenizer
nltk.download('punkt')
sent_segmenter = nltk.data.load('tokenizers/punkt/english.pickle')

word_tokenizer = nltk.tokenize.regexp.WordPunctTokenizer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\T160\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [27]:
stemmer = nltk.stem.porter.PorterStemmer()

In [92]:
# all docs with all tokens dict
all_docs_all_tokens_dict = {}
# final key name
each_term_in_each_doc_freq_dict = {}

# processed_files = [f for f in listdir(PROCESSED_DOCS_PATH) if isfile(join(PROCESSED_DOCS_PATH, f))]

processed_files = ['docID_00000_processed', 'docID_00001_processed']


for p_file in processed_files:
    with open(f"{PROCESSED_DOCS_PATH}/{p_file}", 'r', encoding="ISO-8859-1") as f:
        contents = f.read()

        doc_num = re.findall(r"[0-9]+", p_file)[0]
        single_doc_tokens_dict = {}
        doc_freq = 0

        # tokenized = word_tokenizer.tokenize(contents)
        tokenized = ["usernam", "member", "usernam"]

        tokenized_and_rm_stopwords_and_stemmed = [stemmer.stem(word) for word in tokenized if word not in stopwords.words('english') and not stemmer.stem(word).isnumeric()]
        tokens_dict = Counter(tokenized_and_rm_stopwords_and_stemmed)

        distinct_tokens = tokens_dict.keys()

        for term in distinct_tokens:
            pos_list = [i for i, x in enumerate(tokenized_and_rm_stopwords_and_stemmed) if x == term]
            term_freq = len(pos_list)
            inner_key_format = f"{doc_num},{term_freq}"

            # doc level
            # output format
            # d1 = {
            #  'salaka': [
            #     {'00000,2': [4, 35]}
            #     ],
            #  'time': [
            #     {'00000,2': [9, 98]}
            #     ]
            # }
            if term not in single_doc_tokens_dict:
                single_doc_tokens_dict[term] = []

            single_doc_tokens_dict[term].append({inner_key_format: pos_list})

            # all docs level
            # output format
            # d1 = {
            #  'salaka': [
            #     {'00000,2': [4, 35]}, {'00001,1': [36]}
            #     ],
            #  'time': [
            #     {'00000,2': [7, 10]}, {'00001,3': [9, 98, 100]}
            #     ]
            # }
            if term not in all_docs_all_tokens_dict:
                all_docs_all_tokens_dict[term] = []
                all_docs_all_tokens_dict.update(single_doc_tokens_dict)
            else:
                all_docs_all_tokens_dict[term].append(single_doc_tokens_dict[term])

all_docs_all_tokens_dict

{'usernam': [{'00000,2': [0, 2]}, [{'00001,2': [0, 2]}]],
 'member': [{'00000,1': [1]}, [{'00001,1': [1]}]]}

In [93]:
# in multi doc
# total_term_doc_freq_dict = {'alprazolam': 2, 'onlin': 1}
# term_in_each_doc_freq_dict = {'alprazolam': "alprazolam,5", 'onlin': "onlin,4"}

# dict((term_in_each_doc_freq_dict[key], value) for (key, value) in total_term_doc_freq_dict.items())

total_term_doc_freq_dict = {'alprazolam': 2, 'onlin': 1}
term_in_each_doc_freq_dict = {'alprazolam': "alprazolam,5", 'onlin': "onlin,4"}

# 每次 iteration 做的是:
# 找 term 並 append {'2, 3': [4, 7, 10]} 進去，沒有的話新增一個
# 更新外層 dict 的 key 中的 count
# d1 = {
#     'alprazolam, 5':{
#         {'1, 2': [0, 4]}, {'2, 3': [4, 7, 10]}
#     },
#     'onlin, 4':{
#         {'1, 3': [7, 10, 11]}, {'2, 1': [4]}
#     }
# }

# # final key name
each_term_in_each_doc_freq_dict = {}

# d1 = {
#     'alprazolam':[
#         {'1,2': [0, 4]}, {'2,3': [4, 7, 10]}
#     ],
#     'onlin':[
#         {'1,3': [7, 10, 11]}, {'2,1': [4]}
#     ]
# }


for term, doc_and_doc_freq_pos_list in all_docs_all_tokens_dict.items():
    term_in_doc_freq = 0
    print(term)
    print(doc_and_doc_freq_pos_list)

    for doc_and_doc_freq in doc_and_doc_freq_pos_list:
        each_doc_key_freq = int(doc_and_doc_freq.split(',')[1])
        term_in_doc_freq += each_doc_key_freq
    
# TODO : 要準備 total_term_doc_freq_dict 跟 term_in_each_doc_freq_dict

# # 最後再一次跟total_term_doc_freq_dict換key name
# each_term_in_each_doc_freq_dict.update({term: f"{term},{term_in_doc_freq}"})

# each_term_in_each_doc_freq_dict
# d1.keys()
# 
# print(total_count_dict)

usernam
[{'00000,2': [0, 2]}, [{'00001,2': [0, 2]}]]
member
[{'00000,1': [1]}, [{'00001,1': [1]}]]


In [78]:
# with open(f"{PROCESSED_DOCS_PATH}/docID_00001_processed", 'r', encoding="ISO-8859-1") as f:
#     contents = f.read()
#     doc_num = re.findall(r"[0-9]+", "docID_00001_processed")[0]
#     tokenized = word_tokenizer.tokenize(contents)
#     tokenized_and_rm_stopwords_and_stemmed = [stemmer.stem(word) for word in tokenized if word not in stopwords.words('english') and not stemmer.stem(word).isnumeric()]
#     tokens_dict = Counter(tokenized_and_rm_stopwords_and_stemmed)
#     print(tokens_dict)

In [None]:
"""
Format: to, 993427: 
    <1, 6: <7, 18, 33, 72, 86, 231>;   
    2, 5: <1, 17, 74, 222, 255>; … >
"""

In [74]:
with open(f"{PROCESSED_DOCS_PATH}/docID_00001_processed", 'r', encoding="ISO-8859-1") as f:
    contents = f.read()

    doc_num = re.findall(r"[0-9]+", "docID_00001_processed")[0]
    all_tokens_dict = {}
    doc_freq = 0

    tokenized = word_tokenizer.tokenize(contents)
    tokenized_and_rm_stopwords_and_stemmed = [stemmer.stem(word) for word in tokenized if word not in stopwords.words('english') and not stemmer.stem(word).isnumeric()]
    tokens_dict = Counter(tokenized_and_rm_stopwords_and_stemmed)

    distinct_tokens = tokens_dict.keys()

    # output format
    # d1 = {
    #  'salaka': [
    #     {'00001,2': [4, 35]}
    #     ],
    #  'time': [
    #     {'00001,2': [9, 98]}
    #     ]
    # }
    for term in distinct_tokens:
        pos_list = [i for i, x in enumerate(tokenized_and_rm_stopwords_and_stemmed) if x == term]
        term_freq = len(pos_list)
        inner_key_format = f"{doc_num},{term_freq}"

        if term not in all_tokens_dict:
            all_tokens_dict[term] = []

        all_tokens_dict[term].append({inner_key_format: pos_list})

print(all_tokens_dict)

{'25mg': [{'00001,14': [0, 2, 22, 34, 39, 44, 50, 53, 74, 100, 130, 140, 164, 200]}], 'alprazolam': [{'00001,26': [1, 3, 23, 33, 38, 40, 46, 51, 61, 69, 73, 81, 84, 101, 109, 119, 120, 121, 125, 129, 132, 141, 159, 172, 175, 199]}], 'salaka': [{'00001,2': [4, 35]}], 'babolaz': [{'00001,2': [5, 36]}], 'com': [{'00001,3': [6, 37, 134]}], 'date': [{'00001,1': [7]}], 'aug': [{'00001,1': [8]}], 'time': [{'00001,2': [9, 98]}], 'remot': [{'00001,1': [10]}], 'name': [{'00001,2': [11, 178]}], 'comment': [{'00001,1': [12]}], 'tri': [{'00001,1': [13]}], 'http': [{'00001,1': [14]}], 'acnet': [{'00001,1': [15]}], 'pratt': [{'00001,1': [16]}], 'edu': [{'00001,1': [17]}], 'mcolumbi': [{'00001,1': [18]}], 'order': [{'00001,1': [19]}], 'onlin': [{'00001,12': [20, 111, 176, 186, 202, 212, 216, 229, 238, 244, 249, 298]}], 'ativan': [{'00001,4': [21, 190, 266, 292]}], 'level': [{'00001,1': [24]}], 'discuss': [{'00001,2': [25, 92]}], 'home': [{'00001,1': [26]}], 'content': [{'00001,1': [27]}], 'search': [{