In [36]:
from collections import Counter

In [37]:
import string
import os
import pickle as pkl
from os import listdir
from os.path import isfile, join
import sys
import time

import nltk
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.tokenize.treebank import TreebankWordTokenizer
from nltk.tokenize import word_tokenize

from multiprocessing import Pool


nltk.download("stopwords")
nltk.download('punkt')
eng_stopwords = set(stopwords.words('english')).union(set(string.punctuation))

tokenizer = TreebankWordTokenizer()


def stem_token(token):
    """
        Stem the given token, using any stemmer available from the nltk library
        Input: a single token
        Output: the stem of the token
    """
    from nltk.stem.porter import PorterStemmer

    return PorterStemmer().stem(token)


def tokenize(text):
    """
        Tokenize the text.
        Input: text - a string
        Output: a list of tokens
    """
    tokens = word_tokenize(text)
    return tokens


def process_text(text):
    tokens = []
    for token in tokenize(text):
        if token.lower() in eng_stopwords:
            continue
        token = stem_token(token)
        token = token.lower()
        tokens.append(token)

    return tokens


def read_ap_docs(root_folder="./datasets/"):
    dirs = [join(root_folder, "ap", "docs", 'ap-88'),
            join(root_folder, "ap", "docs", 'ap-89')]
    doc_ids = []
    docs = []

    apfiles = []
    for dir in dirs:
        apfiles.extend([join(dir, f) for f in listdir(dir) if isfile(
            join(dir, f)) and 'ap' in f])

    print("Reading in documents")
    for apfile in tqdm(apfiles):
        with open(apfile, 'r', errors='replace') as reader:
            lines = reader.readlines()
        line_counter = 0
        doc_id = ''
        doc = ''
        while line_counter < len(lines):
            line = lines[line_counter]
            if '<DOCNO>' in line:
                doc_id = line.split('<DOCNO>')[1].strip().split(
                    '</DOCNO>')[0].strip()
                doc = ''
                doc_ids.append(doc_id)
            if '<TEXT>' in line and '</TEXT>' not in line:
                line_counter += 1
                line = lines[line_counter]
                while '</TEXT>' not in line:
                    doc += line.strip() + " "
                    line_counter += 1
                    line = lines[line_counter]
                if len(docs) == len(doc_ids):
                    docs[-1] = doc
                else:
                    docs.append(doc)
                continue
            line_counter += 1

    return docs, doc_ids


def get_processed_docs(doc_set_name="processed_docs"):

    path = f"./{doc_set_name}.pkl"

    if not os.path.exists(path):
        docs, doc_ids = read_ap_docs()

        print("Processing documents now")
        doc_repr = {}
        p = Pool()
        out_p = []
        step_size = 1000
        start_time = time.time()
        for i in range(0, len(docs), step_size):
            out_p_local = p.map(
                process_text, docs[i:min(len(docs), i+step_size)])
            out_p += out_p_local
            print("Processed %i of %i docs" % (i+step_size, len(docs)))
            time_passed = time.time() - start_time
            time_to_go = time_passed * (len(docs)-i-step_size) / (i+step_size)
            print("Estimated remaining time: %imin %isec" %
                  (int(time_to_go/60.0), int(time_to_go) % 60))

        for i in range(len(out_p)):
            if len(out_p[i]) > 0:
                doc_repr[doc_ids[i]] = out_p[i]

        with open(path, "wb") as writer:
            pkl.dump(doc_repr, writer)

        print(f"all docs processed. saved to {path}")

        return doc_repr
    else:
        print("Docs already processed. Loading from disk")

        with open(path, "rb") as reader:
            return pkl.load(reader)


def read_qrels(root_folder="./datasets/"):

    qrels = {}
    queries = {}

    with open(os.path.join(root_folder, "ap", "qrels.tsv")) as reader:
        for line in reader:
            qid, _, doc_id, _ = line.split("\t")
            if qid not in qrels:
                qrels[qid] = {}
            qrels[qid][doc_id] = 1

    with open(os.path.join(root_folder, "ap", "queries.tsv")) as reader:
        for line in reader:
            qid, query = line.split("\t")
            if qid in qrels:
                queries[qid] = query

    return qrels, queries




[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/xiaojingu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/xiaojingu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [38]:
import read_ap
from gensim.test.utils import common_dictionary, common_corpus
from gensim.models import LsiModel

model = LsiModel(common_corpus, id2word=common_dictionary)
vectorized_corpus = model[common_corpus]

docs = read_ap.get_processed_docs()

print(len(docs))

pass
"""
use gensim to get a vocabulary from the processed docs => Dictionary

pass to gensim's bow or tfidf functions => corpus

pass corpus to gensim models
""""""

Docs already processed. Loading from disk
164557


In [39]:

keys = list(docs.keys())#[:100]
count = 0
for key in keys:
    count += len(docs[key])
#     print(key, docs[key])
#     print()
count

46270240

In [41]:
# model = LsiModel(docs)
# vectorized_corpus = model[common_corpus]

# docs = read_ap.get_processed_docs()

print(len(docs))

164557


In [62]:
def get_frequent_tokens(docs, doc_ids, min_threshold=50):
    corpus_token_count = Counter()

    doc_ids = doc_ids[:500]
    
    for doc_id in tqdm(doc_ids):
        corpus_token_count = corpus_token_count + Counter(docs[doc_id])

    # filter out unfrequent words
    thresholded_token_counts = Counter(el for el in corpus_token_count.elements() if corpus_token_count[el] >= min_threshold)

    return set(thresholded_token_counts.keys())

In [63]:
freq_docs = get_frequent_tokens(docs, keys)


  0%|          | 0/500 [00:00<?, ?it/s][A
 12%|█▏        | 61/500 [00:00<00:00, 599.09it/s][A
 18%|█▊        | 90/500 [00:00<00:00, 452.00it/s][A
 24%|██▍       | 122/500 [00:00<00:00, 401.93it/s][A
 30%|███       | 152/500 [00:00<00:00, 360.71it/s][A
 37%|███▋      | 183/500 [00:00<00:00, 341.42it/s][A
 42%|████▏     | 211/500 [00:00<00:00, 301.31it/s][A
 48%|████▊     | 238/500 [00:00<00:00, 263.66it/s][A
 53%|█████▎    | 263/500 [00:00<00:00, 248.49it/s][A
 57%|█████▋    | 287/500 [00:00<00:00, 245.37it/s][A
 62%|██████▏   | 311/500 [00:01<00:00, 236.86it/s][A
 67%|██████▋   | 335/500 [00:01<00:00, 229.23it/s][A
 72%|███████▏  | 358/500 [00:01<00:00, 214.96it/s][A
 76%|███████▌  | 380/500 [00:01<00:00, 213.38it/s][A
 80%|████████  | 402/500 [00:01<00:00, 208.50it/s][A
 85%|████████▍ | 423/500 [00:01<00:00, 202.27it/s][A
 89%|████████▉ | 444/500 [00:01<00:00, 195.09it/s][A
 93%|█████████▎| 464/500 [00:01<00:00, 190.55it/s][A
100%|██████████| 500/500 [00:02<00:00, 2

In [61]:
len( freq_docs)

s = set()

type(s)

set

In [68]:
counter = 0
# create docs corpus matrix
docs_matrix = []

for key in keys:
    
    docs_matrix.append(list(set(docs[key]) - freq_docs))
    
    
    
docs_matrix

[['17', 'lotto', '19', 'lotteri', 'pick', 'weekli', '35', '22', '33'],
 ['1-0-5-2-5',
  'bonu',
  'lotto',
  '19',
  '24',
  '13',
  '3-7-5-9',
  '34',
  '35',
  'supplementari',
  'grand',
  '05',
  '5-5-7',
  '54',
  'game',
  '11',
  '22',
  '9-6-5-1',
  'lotteri',
  '48',
  '8-4-8-4',
  'big',
  'megabuck',
  '18',
  '2-9-6-6-8-8',
  '33',
  '17',
  '64064',
  '04',
  'pick',
  '28',
  'weekli',
  '06',
  '4',
  '6'],
 ['dalla',
  'tenn.',
  '12',
  'boston',
  'worth',
  'louisvil',
  '11',
  'ky.',
  'atlanta',
  'nashvil',
  'san',
  'chatanooga',
  'colorado',
  'd.c.',
  'fort',
  'tent',
  'cincinnati',
  'indianapoli',
  '17',
  'franicisco',
  '16',
  '14',
  '13',
  'chicago'],
 ['rest',
  'herbicid',
  'orang',
  'previou',
  '1-800-225-4712',
  '240',
  'insur',
  'distribut',
  'disconnect',
  'co.',
  'aetna',
  'vietnam',
  'settlement',
  'veteran',
  'consolid',
  'agent',
  'phone',
  'suit',
  'manufactur',
  'toll-fre'],
 ['incid',
  'rome',
  'dump',
  'american

In [None]:
# Create gensim dictionaries
from gensim.corpora import Dictionary

dictionary = Dictionary()

In [67]:
l = []

for i in range(5):
    l.append([i] * 10)
    
l

[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
 [2, 2, 2, 2, 2, 2, 2, 2, 2, 2],
 [3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
 [4, 4, 4, 4, 4, 4, 4, 4, 4, 4]]