## Overview

This notebook explores the article features, focusing on articles for which no LDA topic-model values are available.

### Imports + Variables

In [1]:
import bz2
import json
import os
import xml.etree.ElementTree as et

import pandas as pd

In [None]:
# Wikipedia language edition under analysis; every input path below is derived from it.
lang = "ru"
# NOTE(review): there is no leading slash, so this resolves relative to the
# current working directory -- confirm that is intended.
base_dir = "home/flemmerich/wikimotifs2/data"

# Inputs: raw XML dump, extracted article text (JSON lines), pickled feature table.
wiki_dump_fn = os.path.join(
    base_dir, "text", "raw", lang + "wiki-20170720-pages-articles.xml.bz2"
)
article_text_fn = os.path.join(
    base_dir, "text", "json", lang + "wiki-20170720.json"
)
article_features_fn = os.path.join(
    base_dir, "article_features", "article_features_" + lang + ".p"
)

print("Analyzing {0}".format(lang))

# Report (without aborting) every input file that is absent on this machine.
absent_inputs = [
    candidate
    for candidate in (wiki_dump_fn, article_text_fn, article_features_fn)
    if not os.path.exists(candidate)
]
for candidate in absent_inputs:
    print("{0} does not exist.".format(candidate))

### Load in Article Features

In [None]:
print("Loading in features.")
# NOTE: unpickling can execute arbitrary code -- only load feature files from a trusted source.
df = pd.read_pickle(article_features_fn)
# Plain-text preview of the schema and the first rows (label, newline, space, value).
print("Columns:\n " + str(df.columns))
print("Head:\n " + str(df.head()))

In [None]:
print("Articles missing LDA topics:")
# Filter the feature table `df` directly: the previously referenced
# `df_with_topics` is never defined in this notebook, so the cell raised a
# NameError on a fresh kernel. A null `topic_0` marks a row without topic values.
df[df.topic_0.isnull()]

In [None]:
# Collect the `gensim_title` of every row whose `topic_0` is null; a frozenset
# gives fast membership tests in the cells that follow.
has_no_topic = df["topic_0"].isnull()
missing_titles = frozenset(df.loc[has_no_topic, "gensim_title"])
print("\n=== Missing Titles ({0}) ===\n{1}".format(len(missing_titles), missing_titles))

### Load in Article Text (to investigate missing articles)

In [None]:
# Map article title -> full text, read from the JSON-lines file (one article per line).
article2text = {}
print("opening article texts")
# Decode explicitly as UTF-8: without `encoding`, open() uses the platform
# default, which can fail on (or garble) non-ASCII titles and text.
with open(article_text_fn, "r", encoding="utf-8") as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline) instead of crashing in json.loads.
        if not line.strip():
            continue
        js = json.loads(line)
        title = js["title"]
        # Join all sections into one string and drop apostrophes.
        text = "\n".join(js["section_texts"]).replace("'", "")
        article2text[title] = text

articles_ids = list(article2text.keys())
articles_text = list(article2text.values())

# Split the titles that lack topic values by whether their text was extracted at all.
json_ids = frozenset(articles_ids)
missing_but_in_json = [x for x in missing_titles if x in json_ids]
missing_and_not_in_json = [x for x in missing_titles if x not in json_ids]

print("Count missing (in JSON):", len(missing_but_in_json))
print("Count missing (not in JSON):", len(missing_and_not_in_json))

In [None]:
# Spot check: show the feature row(s) whose page title matches exactly.
df.loc[df["page_title"] == "B-25 Mitchell"]

In [None]:
# Spot check: show the feature row(s) with this exact gensim title.
# NOTE(review): the title is Bengali while `lang` is "ru" -- presumably left over
# from a run on another language edition; confirm.
df.loc[df["gensim_title"] == "তালমুদ"]

### Get example missing pages

In [None]:
# Maximum number of raw <page> entries to collect before the scan stops.
examples_to_keep = 10

# title -> raw XML of the page, for titles that lack topic values.
entries_for_missing = {}
with bz2.BZ2File(wiki_dump_fn, "r") as f:
    mapping = {}  # page id -> title for every page parsed so far
    current_page = ""
    page_lines = []  # stripped lines of the page being read
    page_count = 0
    for line in f:
        # Strip at the byte level first, then decode (same order as before, so
        # only ASCII whitespace is removed).
        line = line.strip().decode("utf-8")
        if line.startswith("<page>"):
            # A new page begins: discard whatever was buffered before it.
            page_lines = []
            page_count += 1
        page_lines.append(line)
        if line.startswith("</page>"):
            # Join once per page instead of growing a string line by line.
            # NOTE(review): lines are joined without separators, so line breaks
            # inside the page text are lost -- confirm this is acceptable.
            current_page = "".join(page_lines)
            page_elem = et.fromstring(current_page)
            title = page_elem.find("title").text
            pid = page_elem.find("id").text
            mapping[pid] = title
            if page_count % 100000 == 0:
                print("extracted page titles:", page_count)
            if title in missing_titles:
                entries_for_missing[title] = current_page
                if len(entries_for_missing) == examples_to_keep:
                    print("Reached {0} titles.".format(examples_to_keep))
                    break

In [None]:
# Show the raw dump entry for one example title. `entries_for_missing` holds at
# most `examples_to_keep` titles from the scanned language edition, so a
# hard-coded title may be absent; use .get so the cell reports that instead of
# raising a KeyError.
example_title = "Norden (Ostfriesland)"
entries_for_missing.get(
    example_title,
    "No extracted entry for {0!r}".format(example_title),
)