# Word ladder

Word ladder is a game invented by Lewis Carroll where the goal is to transform one word into another by changing one letter at a time. Each intermediate word must be a valid word (i.e. in the dictionary). You should try to find the shortest possible path.

We have a file, `words.txt`, that has a series of three-, four- and five-letter words from the Merriam-Webster dictionary.

**Goal**: Find the shortest path between any two words.

**Plan**:

1. Construct a graph from all words in the corpus
2. Perform a BFS to find the shortest path between two nodes


In [None]:
from typing import Dict, List, Iterator
from collections import defaultdict, deque
import itertools

from tqdm.notebook import tqdm

import networkx as nx

from concurrent.futures import ThreadPoolExecutor
from multiprocessing import Pool, cpu_count
import ray

In [None]:
def edit_distance(a: str, b: str) -> bool:
    n = len(a)
    m = len(b)

    diffs = 0
    for i in range(min(n, m)):
        if a[i] != b[i]:
            diffs += 1

    # Handle strings being different lengths
    diffs += abs(n - m)

    return diffs

In [None]:
# TODO: Optimise further! This is still O(n^2) word comparisons (the original
# version, which compared every ordered pair, took about 3 mins on my M1 Air)
def load_graph(filename: str) -> Dict[str, List[str]]:
    """Build an adjacency-list graph of the words listed in `filename`.

    The file is read as one word per line. Two words are neighbours when
    `edit_distance` between them is at most 1.

    Args:
        filename: Path to the word list.

    Returns:
        A mapping from every word in the file to the list of its
        neighbours, ordered as they appear in the file. Words without any
        neighbour are present too, mapped to an empty list.
    """
    with open(filename) as f:
        # dict.fromkeys drops repeated lines while keeping file order, so a
        # duplicated word cannot show up twice in a neighbour list.
        words = list(dict.fromkeys(f.read().splitlines()))

    graph = defaultdict(list)

    for position, word in enumerate(tqdm(words)):
        # Touch the key so that words with no neighbours still become
        # nodes of the graph instead of being silently dropped.
        neighbours = graph[word]

        # edit_distance is symmetric, so each unordered pair only needs to
        # be checked once; both directions are recorded together.
        for other in words[position + 1:]:
            if edit_distance(word, other) <= 1:
                neighbours.append(other)
                graph[other].append(word)

    return graph

In [None]:
%%time
# Build the adjacency-list graph once; the cells below reuse `graph`.
graph = load_graph("words.txt")

In [None]:
def word_ladder(graph: Dict[str, List[str]], start: str, end: str) -> List[List[str]]:
    """Return every shortest path from `start` to `end`.

    Runs a breadth-first search over `graph`. Each queue entry is a partial
    path; a word is only extended into when it is reached at the depth at
    which it was first discovered, so the queue holds shortest paths only.

    Args:
        graph: Adjacency mapping from each word to its neighbouring words.
        start: Word the ladder begins with.
        end: Word the ladder finishes on.

    Returns:
        A list of paths, each a list of words beginning with `start` and
        finishing with `end`. All returned paths have the same (minimal)
        length. The list is empty if `end` cannot be reached, and is
        `[[start]]` when `start == end`.

    Raises:
        ValueError: If `start` or `end` is not a key of `graph`.
    """
    if start not in graph:
        raise ValueError(f"invalid start: {start}")
    if end not in graph:
        raise ValueError(f"invalid end: {end}")

    queue = deque([[start]])
    # Depth (number of steps from `start`) at which each word was first seen.
    depth = {start: 0}
    paths: List[List[str]] = []

    while queue:
        path = queue.popleft()
        word = path[-1]

        if word == end:
            # Only shortest paths are ever enqueued, so every path that
            # arrives here is a result. Extending beyond `end` is pointless.
            paths.append(path)
            continue

        if paths:
            # The shortest length is already known; whatever is left in the
            # queue and does not end on `end` cannot produce another result.
            continue

        next_depth = len(path)
        for neighbour in graph[word]:
            # Accept a neighbour when it is new, or when this is another
            # equally short way of reaching it. A word already seen at a
            # shallower depth would only yield a longer path.
            if depth.setdefault(neighbour, next_depth) == next_depth:
                queue.append(path + [neighbour])

    return paths

In [None]:
%%time
# Ladder between words of different lengths (three letters to five letters).
word_ladder(graph, "aba", "abaca")

In [None]:
%%time
# Ladder between two four-letter words.
word_ladder(graph, "head", "tail")

In [None]:
%%time
# Ladder between two three-letter words.
word_ladder(graph, "pig", "sty")

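I considered an extension of the game where you can move between words of different lengths (the original version is only for words of the same length). In this implementation each letter of length difference counts as one edit, so adding or removing a single trailing letter is a valid step (e.g. moving from a three-letter word to a four-letter word that starts with it).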


Knuth studied this game and remarked that three- and four-letter words are not that interesting and six-letter words are too hard. Five-letter words are the sweet spot. In the dictionary he used, he found that there were 517 'aloof' words (words with no neighbours at all), including 'aloof' itself!

> In our dictionary, 'aloof' is related to 'cloof' which has many neighbours.


In [None]:
# Show the neighbour lists of both words.
# NOTE: `graph` is a defaultdict, so looking up a word that is not a key
# inserts it with an empty list rather than raising KeyError.
graph["aloof"], graph["cloof"]

Our aloof words are:


In [None]:
# Collect the aloof words: dictionary words that have no neighbours in `graph`.
with open("words.txt") as f:
    # Use .get() rather than graph[word]: `graph` is a defaultdict, so
    # indexing it would insert every aloof word into the graph as a side
    # effect of merely counting them.
    aloof_words = [word for word in f.read().splitlines() if not graph.get(word)]

# Cell output: how many aloof words there are, plus a small sample.
len(aloof_words), aloof_words[:10]

# Using networkx


In [None]:
# Construct graph
# Still O(n^2) word comparisons; the original version, which compared every
# ordered pair, took about three minutes (on my M1 Air)
def load_graph(filename: str) -> nx.Graph:
    """Build an undirected NetworkX graph of the words listed in `filename`.

    The file is read as one word per line. An edge joins two distinct words
    when `edit_distance` between them is at most 1.

    Args:
        filename: Path to the word list.

    Returns:
        A graph with one node per word in the file, including words that
        have no neighbours, and one edge per neighbouring pair.
    """
    # Read everything up front so the file is not held open during the
    # (long) pairwise comparison below.
    with open(filename) as f:
        words = f.read().splitlines()

    G = nx.Graph()
    # Register every word, otherwise words without neighbours would never
    # become nodes (add_edge only creates the nodes it connects).
    G.add_nodes_from(words)

    for position, word in enumerate(tqdm(words)):
        # The graph is undirected and edit_distance is symmetric, so each
        # unordered pair only needs to be examined once.
        for other in words[position + 1:]:
            if word != other and edit_distance(word, other) <= 1:
                G.add_edge(word, other)

    return G


In [None]:
%%time
# Build the NetworkX graph once; the cells below reuse `G`.
G = load_graph("words.txt")

In [None]:
def word_ladder(G: nx.Graph, start: str, end: str) -> Iterator[List[str]]:
    """Return all shortest paths from `start` to `end` in `G`.

    Thin wrapper that delegates the search to `nx.all_shortest_paths`.

    Args:
        G: Word graph in which neighbouring words are joined by an edge.
        start: Word the ladder begins with.
        end: Word the ladder finishes on.

    Returns:
        Whatever `nx.all_shortest_paths` returns: an iterator over paths,
        each a list of words. Wrap it in `list()` to materialise the paths.
        NOTE(review): errors such as a missing node or an unreachable target
        are presumably raised only once the iterator is consumed - confirm
        against the NetworkX documentation.
    """
    shortest_paths = nx.all_shortest_paths(G, source=start, target=end)
    return shortest_paths

In [None]:
%%time
# word_ladder returns an iterator here, so wrap it in list() to make sure
# the paths are actually computed inside the timed cell.
list(word_ladder(G, "pig", "sty"))

At a rough benchmark, it seems like NetworkX is ~25x faster than my own code.


# Parallelising data loading

1. Using threads
2. Using processes
3. Using Ray


In [None]:
def load_graph(filename) -> nx.Graph:
    """Build the word graph, scanning for each word's neighbours on a thread pool.

    The file is read as one word per line. Every word is submitted to a
    `ThreadPoolExecutor`, which compares it against the full word list; the
    resulting edges are added to the graph as the results come back.

    Args:
        filename: Path to the word list.

    Returns:
        An undirected graph with an edge between every pair of distinct
        words whose `edit_distance` is at most 1.
    """
    with open(filename) as f:
        words = f.read().splitlines()

    def edges_for(word):
        # All (word, other) pairs where `other` is a neighbour of `word`.
        return [
            (word, other)
            for other in words
            if word != other and edit_distance(word, other) <= 1
        ]

    G = nx.Graph()

    with ThreadPoolExecutor() as executor:
        for edges in executor.map(edges_for, words):
            if not edges:
                # Word has no neighbours: nothing to add.
                continue
            G.add_edges_from(edges)

    return G


In [None]:
%%time
# This wasn't really any faster - presumably because the comparison is
# pure-Python, CPU-bound work, so the threads take turns on the GIL
# instead of running in parallel.
G = load_graph("words.txt")

In [None]:
def process_word_neighbours(words, word):
    """Return the edges linking `word` to each of its neighbours in `words`.

    Args:
        words: All candidate words.
        word: The word whose neighbours are wanted.

    Returns:
        A list of `(word, neighbour)` tuples, one for every entry of `words`
        that differs from `word` and is within an `edit_distance` of 1.
    """
    return [
        (word, candidate)
        for candidate in words
        if candidate != word and edit_distance(word, candidate) <= 1
    ]


def load_graph(filename) -> nx.Graph:
    """Build the word graph, scanning for each word's neighbours on a process pool.

    The file is read as one word per line. One `process_word_neighbours`
    call per word is distributed over a `multiprocessing.Pool` with one
    worker per CPU; the returned edge lists are then merged into a graph.

    Args:
        filename: Path to the word list.

    Returns:
        An undirected graph containing every edge reported by
        `process_word_neighbours`.
    """
    with open(filename) as f:
        words = f.read().splitlines()

    with Pool(cpu_count()) as pool:
        # starmap unpacks each (words, word) tuple into the worker's two
        # positional parameters. Plain map() would hand the whole tuple over
        # as a single argument and fail with a TypeError.
        results = pool.starmap(
            process_word_neighbours, [(words, word) for word in words]
        )

    G = nx.Graph()
    for result in results:
        if result:
            G.add_edges_from(result)

    return G


In [None]:
%%time
# Build the graph with the multiprocessing-based loader defined above.
G = load_graph("words.txt")

In [None]:
# Initialise Ray before any remote function is called.
context = ray.init()
# Print the dashboard address reported by Ray so the tasks can be monitored.
print(context.dashboard_url)

In [None]:
@ray.remote
def process_word_neighbours(words, start):
    """Return the edges linking `start` to each of its neighbours in `words`.

    Runs as a Ray remote task.

    Args:
        words: All candidate words.
        start: The word whose neighbours are wanted.

    Returns:
        A list of `(start, neighbour)` tuples, one for every entry of
        `words` that differs from `start` and is within an `edit_distance`
        of 1.
    """
    return [
        (start, word)
        for word in words
        # Skip the word itself: its distance to itself is 0, which would
        # otherwise add a self-loop edge for every word in the graph.
        if word != start and edit_distance(start, word) <= 1
    ]


def load_graph(filename) -> nx.Graph:
    """Build the word graph, scanning for each word's neighbours with Ray tasks.

    The file is read as one word per line. One `process_word_neighbours`
    remote task is launched per word; once all tasks have finished, the
    returned edge lists are merged into a graph.

    Args:
        filename: Path to the word list.

    Returns:
        An undirected graph containing every edge reported by the tasks.
    """
    with open(filename) as f:
        words = f.read().splitlines()

    # Put the word list into Ray's object store once and give each task a
    # reference to it, instead of passing the same large list by value with
    # every single task submission.
    words_ref = ray.put(words)
    results = ray.get(
        [process_word_neighbours.remote(words_ref, word) for word in words]
    )

    G = nx.Graph()
    for result in results:
        if result:
            G.add_edges_from(result)

    return G

In [None]:
%%time
# Takes about a minute, roughly 3x faster than the unparallelised version
# (which took about three minutes)
G = load_graph("words.txt")