# Analysis of Wikipedia Pages
**An Implementation of `map_reduce` for Parallel Processing**

This example project illustrates applying a `map_reduce` function for parallel processing for various searches for a target substring within numerous files with numerous lines.  The results of the final search are written to a csv file containing the original file name, line number, and character indices to locate the substring, and abbreviated surrounding string context for the targeted substring.

In [1]:
# To structure code automatically
%load_ext nb_black

<IPython.core.display.Javascript object>

### Checking Files in the Directory

In [2]:
import os

file_names = os.listdir("wiki")
print(f'There are {len(file_names)} files in the "wiki" folder:')
file_names

There are 999 files in the "wiki" folder:


['Bay_of_ConcepciC3B3n.html',
 'Bye_My_Boy.html',
 'Valentin_Yanin.html',
 'Kings_XI_Punjab_in_2014.html',
 'William_Harvey_Lillard.html',
 'Radial_Road_3.html',
 'George_Weldrick.html',
 'Zgornji_Otok.html',
 'Blue_Heelers_(season_8).html',
 'Taggen_Nunatak.html',
 'Henri_BraqueniC3A9.html',
 'Vrila.html',
 'William_Henry_Porter.html',
 'Clive_Brown_(footballer).html',
 'Blick_nach_Rechts.html',
 'Central_District_(Rezvanshahr_County).html',
 'Alexios_Aspietes.html',
 'Mei_Lanfang.html',
 'Wangeroogeclass_tug.html',
 'Dowell_Philip_O27Reilly.html',
 'Coalville_Town_railway_station.html',
 'Gennady_Lesun.html',
 'Bartrum_Glacier.html',
 'Victor_S._Mamatey.html',
 'Gottfried_Keller.html',
 'Table_Point_Formation.html',
 'Nobuhiko_Ushiba.html',
 'Master_of_Space_and_Time.html',
 'Early_medieval_states_in_Kazakhstan.html',
 'Eressa_aperiens.html',
 'Myrtle_(sternwheeler).html',
 'Abanycha_bicolor.html',
 'JeecyVea.html',
 'Aubrey_Fair.html',
 'Ingrid_GuimarC3A3es.html',
 'Urban_chicken.ht

<IPython.core.display.Javascript object>

### Examining Lines of the First File

In [3]:
folder_name = "wiki"
file_name = file_names[0]
with open(os.path.join(folder_name, file_name)) as f:
    lines = [line for line in f.readlines()]
print(lines)

['<!DOCTYPE html>\n', '<html class="client-nojs" lang="en" dir="ltr">\n', '<head>\n', '<meta charset="UTF-8"/>\n', '<title>Bay of Concepción - Wikipedia</title>\n', '<script>document.documentElement.className = document.documentElement.className.replace( /(^|\\s)client-nojs(\\s|$)/, "$1client-js$2" );</script>\n', '<script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Bay_of_Concepción","wgTitle":"Bay of Concepción","wgCurRevisionId":647460156,"wgRevisionId":647460156,"wgArticleId":16044270,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Coordinates on Wikidata","All stub articles","Landforms of Bío Bío Region","Bays of Chile","Bío Bío Region geography stubs"],"wgBreakFrames":false,"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgD

<IPython.core.display.Javascript object>

### Defining `map_reduce` Function

In [4]:
import math
import functools
from multiprocessing import Pool


def make_chunks(file_names, num_chunks):
    """
    Takes input array-like file_names and int num_chunks and
    return list of sliced file_names of length num chunks

    file_names: array_like list of path file names containing file_names
    num_chunks: integer
    """
    chunk_size = math.ceil(len(file_names) / num_chunks)
    return [
        file_names[i : i + chunk_size] for i in range(0, len(file_names), chunk_size)
    ]


def map_reduce(file_names, num_processes, mapper, reducer):
    """
    Parallel processing after using function
    make_chunks to divide file_names into chunks

    file_names: array-like list of path file names containing file_names
    num_processes: integer of number of processes
    mapper: function for individual chunks
    reducer: function to aggregate results of mapper
    """
    chunks = make_chunks(file_names, num_processes)
    with Pool(num_processes) as pool:
        chunk_results = pool.map(mapper, chunks)
    return functools.reduce(reducer, chunk_results)

<IPython.core.display.Javascript object>

### Counting the Total Number of Lines in All Files

In [5]:
def map_line_count(file_names_chunk):
    chunk_line_cnt = 0
    for file_name in file_names_chunk:
        with open(os.path.join(folder_name, file_name)) as f:
            chunk_line_cnt += len([line for line in f.readlines()])
    return chunk_line_cnt


def reduce_line_count(res1, res2):
    return res1 + res2


total_lines = map_reduce(file_names, 4, map_line_count, reduce_line_count)
print(f"Combined the files have {total_lines} lines.")

Combined the files have 499797 lines.


<IPython.core.display.Javascript object>

### Locating All Lines with a Given String in All Files with a grep Function

In [6]:
# Example target = "data"
target = "data"


def map_grep_find_string(file_names_chunk):
    chunk_lines = {}
    for file_name in file_names_chunk:
        chunk_lines[file_name] = []
        with open(os.path.join(folder_name, file_name)) as f:
            for i, line in enumerate(f.readlines()):
                if target in line:
                    chunk_lines[file_name].append(i)
    return chunk_lines


def reduce_grep_find_string(res1, res2):
    merged = {}
    merged.update(res1)
    merged.update(res2)
    return merged


target_found = map_reduce(file_names, 4, map_grep_find_string, reduce_grep_find_string)

# Checking a sample file
target_found[file_names[42]]

[43, 46, 104, 187, 204]

<IPython.core.display.Javascript object>

### Updating grep Function to Case Insensitive

In [7]:
# Example target = "data"
target = "data"


def map_grep_case_insensitive(file_names_chunk):
    chunk_lines = {}
    for file_name in file_names_chunk:
        chunk_lines[file_name] = []
        with open(os.path.join(folder_name, file_name)) as f:
            for i, line in enumerate(f.readlines()):
                if target.lower() in line.lower():
                    chunk_lines[file_name].append(i)
    return chunk_lines


target_found2 = map_reduce(
    file_names, 4, map_grep_case_insensitive, reduce_grep_find_string
)

# Checking a sample of indexes for rows containing target for one file
target_found2[file_names[42]]

[43, 46, 104, 187, 204]

<IPython.core.display.Javascript object>

#### Identifying New Lines Found with Case Insensitivity

In [8]:
for key, value in target_found2.items():
    no_match = [
        key for key, value in target_found2.items() if value != target_found[key]
    ]
new_lines = {}
for file in no_match:
    new_lines[file] = [
        line for line in target_found2[file] if line not in target_found[file]
    ]

print(
    f"There are {sum([len(value) for key, value in new_lines.items()])} new lines identified:"
)
new_lines

There are 165 new lines identified:


{'Table_Point_Formation.html': [80],
 'Ingrid_GuimarC3A3es.html': [173],
 'Jules_Verne_ATV.html': [918, 1169],
 'Pictogram.html': [397],
 'Claire_Danes.html': [818, 820],
 'PTPRS.html': [58],
 'A_Beautiful_Valley.html': [177],
 'Mudramothiram.html': [196],
 'Gordon_Bau.html': [131, 148],
 'Embraer_Unidade_GaviC3A3o_Peixoto_Airport.html': [131],
 'Code_page_1023.html': [142, 533, 1315],
 'Cryptographic_primitive.html': [81],
 'Alex_Kurtzman.html': [338],
 'Filip_Pyrochta.html': [88],
 'Morgana_King.html': [587],
 'Don_Parsons_(ice_hockey).html': [168],
 'Bias.html': [971],
 'Tomohiko_ItC58D_(director).html': [107, 108],
 'Imperial_Venus_(film).html': [130],
 'Camp_Nelson_Confederate_Cemetery.html': [145],
 'Benny_Lee.html': [91],
 'Kul_Gul.html': [104],
 'Medicago_murex.html': [107],
 'Oldfield_Baby_Great_Lakes.html': [117],
 'Wilson_Global_Explorer.html': [120],
 'Craig_Chester.html': [278],
 'Derek_Acorah.html': [141],
 'Jack_Goes_Home.html': [166],
 'Morning_Glory_(2010_film).html': 

<IPython.core.display.Javascript object>

### Finding All Instances of Given String (i.e., Tuples of Row Index and Starting Character Index)

#### Define Function to Find Starting Indices of Substring within String

In [9]:
import re


def find_substring_starts(string, substring):
    pattern = re.compile(r"" + substring)
    matches = pattern.finditer(string)
    return [mat.span(0)[0] for mat in matches]


# Test function
test = "Data, data all day long!".lower()
find_substring_starts(test, target)

[0, 6]

<IPython.core.display.Javascript object>

#### Proceeding to Find All Instances with `map_reduce`

In [10]:
def map_find_all_instances(file_names_chunk):
    chunk_tuples = {}
    for file_name in file_names_chunk:
        chunk_tuples[file_name] = []
        with open(os.path.join(folder_name, file_name)) as f:
            for line_i, line in enumerate(f.readlines()):
                if target.lower() in line.lower():
                    for char_i in find_substring_starts(line.lower(), target.lower()):
                        chunk_tuples[file_name].append((line_i, char_i))
    return chunk_tuples


def reduce_find_all_instances(res1, res2):
    merged = {}
    merged.update(res1)
    merged.update(res2)
    return merged


targets_found = map_reduce(
    file_names, 4, map_find_all_instances, reduce_find_all_instances
)

# Checking a sample of line with character index for one file
targets_found[file_names[42]]

[(43, 102),
 (46, 441),
 (46, 463),
 (104, 40),
 (187, 1115),
 (187, 1165),
 (187, 1209),
 (204, 124)]

<IPython.core.display.Javascript object>

### Writing Results to csv File

In [26]:
import csv

with open("match_results.csv", "w") as f:
    writer = csv.writer(f)
    rows = [["File", "Line", "Char_index", "Context"]]

    for file_name in targets_found:
        for found in targets_found[file_name]:
            with open(os.path.join(folder_name, file_name)) as f2:
                line_index = found[0]
                char_index = found[1]
                line = f2.readlines()[line_index]
                start = 0
                if char_index >= 15:
                    start = char_index - 15
                end = len(line)
                if end - char_index > 0:
                    end = char_index + 15
                context = line[start:end]
                row = [file_name, line_index, char_index, context]
                rows.append(row)
    writer.writerows(rows)

<IPython.core.display.Javascript object>

#### Reading `match_results.csv` into pandas and Checking a Sample

In [29]:
import pandas as pd

df = pd.read_csv("match_results.csv")
df.sample(10)

Unnamed: 0,File,Line,Char_index,Context
1313,Tom_Buffington.html,242,1100,"esskey=""g"">Wikidata item</a></"
20485,Imperial_amazon.html,203,365,w.birdlife.org/datazone/specie
16855,Central_Luzon.html,59,655,"s_City.JPG 2x"" data-file-width"
3248,Plains_Kansas.html,6,469,"same as on Wikidata"",""Cities i"
8125,Slade_School_of_Fine_Art.html,370,443,"tos, and other data for this l"
17365,2008_Fed_Cup_World_Group_II.html,308,461,"ia.svg.png 2x"" data-file-width"
9400,WintersWimberley_House.html,437,124,ttps://www.wikidata.org/wiki/Q
1758,Patrick_F._Philbin.html,81,1055,"_2.svg.png 2x"" data-file-width"
578,John_Reid_(British_Army_officer).html,147,40,"ass=""catlinks"" data-mw=""interf"
4908,Shpolskii_matrix.html,45,546,"e-width=""1890"" data-file-heigh"


<IPython.core.display.Javascript object>