# Sentence alignment with the Gale-Church algorithm

## Load NLTK and a sample of the Europarl parallel corpus

In [1]:
import os
import nltk
from nltk.translate import gale_church
# Download the Europarl sample only when NLTK cannot locate it.
# nltk.data.find raises LookupError for a missing resource, which also covers
# a fresh install where no "corpora" directory exists on any NLTK data path
# (the previous os.listdir(nltk.data.find("corpora")) check crashed there).
try:
    nltk.data.find('corpora/europarl_raw')
except LookupError:
    nltk.download('europarl_raw')
from nltk.corpus.europarl_raw import german, english
from nltk.corpus import comtrans
# Make sure both corpus readers are loaded before any cell below uses them.
for corpus in (english, german):
    corpus.ensure_loaded()

## Each monolingual corpus is divided into chapters

In [2]:
# Keep each chapter list in a named variable and reuse it for every count below.
english_chapters = english.chapters()
german_chapters = german.chapters()

print('Number of chapters:')
print((len(english_chapters), len(german_chapters)))
print('Number of sentences per chapter:')
for chapter_index in range(4):
    sentence_counts = (
        len(english_chapters[chapter_index]),
        len(german_chapters[chapter_index]),
    )
    print(sentence_counts)

Number of chapters:
(81, 90)
Number of sentences per chapter:
(65, 71)
(111, 116)
(89, 95)
(84, 86)


## Look at the first sentences of a few chapters

In [3]:
def show_chap_sent(lang, chap, sent):
    """Print the leading sentences of one chapter, one per line.

    lang -- object whose ``chapters()`` returns a sequence of chapters,
            each chapter being a sequence of sentences (sequences of strings)
    chap -- index of the chapter to show
    sent -- number of sentences to print, counted from the chapter start

    Each output line is the sentence index, a tab, and the sentence's
    strings joined by single spaces.
    """
    chapter = lang.chapters()[chap]
    for index, tokens in enumerate(chapter[:sent]):
        line = '{}\t{}'.format(index, ' '.join(tokens))
        print(line)

In [4]:
print('First two sentences of each of the first three chapters:')
for chapter_index in range(3):
    print("\nChapter {}".format(chapter_index))
    # Same chapter, English first and then German, two sentences each.
    for label, corpus in (('English', english), ('German', german)):
        print(label)
        show_chap_sent(corpus, chapter_index, 2)

First two sentences of each of the first three chapters:

Chapter 0
English
0	Resumption of the session I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999 , and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period .
1	Although , as you will have seen , the dreaded ' millennium bug ' failed to materialise , still the people in a number of countries suffered a series of natural disasters that truly were dreadful .
German
0	Wiederaufnahme der Sitzungsperiode Ich erkläre die am Freitag , dem 17. Dezember unterbrochene Sitzungsperiode des Europäischen Parlaments für wiederaufgenommen , wünsche Ihnen nochmals alles Gute zum Jahreswechsel und hoffe , daß Sie schöne Ferien hatten .
1	Wie Sie feststellen konnten , ist der gefürchtete " Millenium-Bug " nicht eingetreten .

Chapter 1
English
0	Agenda The next item is the verification of the final version of the draft agenda as drawn up by the Confere

## The Gale-Church algorithm takes as input the lengths of the input sentences

In [5]:
def csl(lang, chap, sent=None):
    """Compute the lengths of the first SENT sentences in chapter CHAP of language LANG.

    lang -- object whose ``chapters()`` returns a sequence of chapters,
            each chapter being a sequence of sentences
    chap -- index of the chapter to measure
    sent -- number of leading sentences to measure; ``None`` (the default)
            means every sentence in the chapter

    Returns a list with ``len()`` of each selected sentence (the token count
    when sentences are token lists).
    """
    # Slicing with None keeps the whole chapter. The earlier default of
    # len(lang.chapters()) used the number of *chapters* as the sentence limit
    # and silently truncated chapters longer than that.
    return [len(s) for s in lang.chapters()[chap][:sent]]

# Sentence lengths for the start of chapter 0: 10 English vs 12 German sentences.
for corpus, sentence_count in ((english, 10), (german, 12)):
    print(csl(corpus, 0, sentence_count))

[44, 37, 21, 47, 12, 21, 24, 27, 53, 21]
[36, 14, 12, 17, 37, 11, 10, 6, 22, 35, 60, 16]


In [6]:
def csl_show(lang, chap, sent=None, c="x"):
    """Show the lengths of the first SENT sentences in chapter CHAP of language LANG.

    lang -- object whose ``chapters()`` returns a sequence of chapters,
            each chapter being a sequence of sentences
    chap -- index of the chapter to draw
    sent -- number of leading sentences to draw; ``None`` (the default)
            means every sentence in the chapter
    c    -- string repeated once per unit of sentence length

    Returns one string: a bar of ``c`` repeated ``len(sentence)`` times for
    each selected sentence, with bars separated by single spaces.
    """
    # Slicing with None keeps the whole chapter. The earlier default of
    # len(lang.chapters()) used the number of *chapters* as the sentence limit
    # and silently truncated chapters longer than that.
    return " ".join(c * len(s) for s in lang.chapters()[chap][:sent])

# Draw English ("-") and German ("+") length bars for chapters 0 and 1,
# with a blank line between the two chapters.
for chapter_index in (0, 1):
    if chapter_index > 0:
        print()
    print(csl_show(english, chapter_index, 10, c="-"))
    print(csl_show(german, chapter_index, 12, c="+"))

-------------------------------------------- ------------------------------------- --------------------- ----------------------------------------------- ------------ --------------------- ------------------------ --------------------------- ----------------------------------------------------- ---------------------
++++++++++++++++++++++++++++++++++++ ++++++++++++++ ++++++++++++ +++++++++++++++++ +++++++++++++++++++++++++++++++++++++ +++++++++++ ++++++++++ ++++++ ++++++++++++++++++++++ +++++++++++++++++++++++++++++++++++ ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ ++++++++++++++++

--------------------------------------- ----------- ------------------------------------- ------------------ ----------- ----------------- ------------------------------------------------------------ ---------------------------------------- --------------------------- ---------------------------
++++++++++++++++++++++++++++++++++ +++++++++ ++++++++++++++++++++++++++++++++ +++++++++++++++++++

## Run and show the alignment on the first N English sentences vs the first M German sentences

In [7]:
def align_and_show(chap, s1, s2):
    """Align the start of chapter CHAP in English and German and print the result.

    chap -- index of the chapter, used for both corpora
    s1   -- number of leading English sentences to align
    s2   -- number of leading German sentences to align

    Prints three sections: the raw list of (English, German) index pairs
    returned by ``gale_church.align_blocks``, the German indices grouped per
    English index, and the text of each English sentence followed by the
    German sentences mapped to it. Only English indices that occur in the
    alignment pairs are listed. Returns None.
    """
    # Read each chapter once instead of calling chapters() inside the print loops.
    english_chapter = english.chapters()[chap]
    german_chapter = german.chapters()[chap]

    alignment = gale_church.align_blocks(csl(english, chap, s1), csl(german, chap, s2))
    print("Alignment of sentences English[{}][0:{}] to sentences German[{}][0:{}]:\n{}".format(chap, s1, chap, s2, alignment))

    # Group German indices under their English index, keeping pair order.
    german_by_english = {}
    for english_index, german_index in alignment:
        german_by_english.setdefault(english_index, []).append(german_index)
    # Keys are unique, so sorting the items orders the groups by English index.
    # (Named "mappings" rather than "all" to avoid shadowing the builtin.)
    mappings = sorted(german_by_english.items())

    print("Mappings from English sentences")
    for english_index, german_indices in mappings:
        print("{} -> {}".format(english_index, german_indices))

    print("\nText of the aligned sentences:")
    for english_index, german_indices in mappings:
        print("{} {}\n---".format(english_index, " ".join(english_chapter[english_index])))
        for german_index in german_indices:
            print("{} {}".format(german_index, " ".join(german_chapter[german_index])))
        print()

### Run the alignment on the first 10 English sentences vs the first 12 German sentences of Chapter 0

In [8]:
# Chapter 0: first 10 English sentences against the first 12 German sentences.
align_and_show(chap=0, s1=10, s2=12)

Alignment of sentences English[0][0:10] to sentences German[0][0:12]:
[(0, 0), (1, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (5, 7), (6, 8), (7, 9), (8, 10), (9, 11)]
Mappings from English sentences
0 -> [0]
1 -> [1, 2]
2 -> [3]
3 -> [4]
4 -> [5]
5 -> [6, 7]
6 -> [8]
7 -> [9]
8 -> [10]
9 -> [11]

Text of the aligned sentences:
0 Resumption of the session I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999 , and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period .
---
0 Wiederaufnahme der Sitzungsperiode Ich erkläre die am Freitag , dem 17. Dezember unterbrochene Sitzungsperiode des Europäischen Parlaments für wiederaufgenommen , wünsche Ihnen nochmals alles Gute zum Jahreswechsel und hoffe , daß Sie schöne Ferien hatten .

1 Although , as you will have seen , the dreaded ' millennium bug ' failed to materialise , still the people in a number of countries suffered a series of natural dis

### Run the alignment on the first 10 English sentences vs the first 9 German sentences of Chapter 1

In [191]:
# Chapter 1: first 10 English sentences against the first 9 German sentences.
align_and_show(chap=1, s1=10, s2=9)

Alignment of sentences English[1][0:10] to sentences German[1][0:9]:
[(0, 0), (1, 1), (2, 2), (3, 3), (4, 4), (5, 5), (6, 6), (7, 7), (8, 8), (9, 8)]
Mappings from English sentences
0 -> [0]
1 -> [1]
2 -> [2]
3 -> [3]
4 -> [4]
5 -> [5]
6 -> [6]
7 -> [7]
8 -> [8]
9 -> [8]

Text of the aligned sentences:
0 Agenda The next item is the verification of the final version of the draft agenda as drawn up by the Conference of Presidents at its meeting of 13 January pursuant to Rule 110 of the Rules of Procedure .
---
0 Arbeitsplan Nach der Tagesordnung folgt die Prüfung des endgültigen Entwurfs der Tagesordnung , wie er nach Artikel 110 der Geschäftsordnung am Donnerstag , dem 13. Januar von der Konferenz der Präsidenten festgelegt wurde .

1 No amendments have been proposed relating to Monday and Tuesday .
---
1 Zu Montag und Dienstag liegen keine Änderungen vor .

2 Relating to Wednesday : The Group of the Party of European Socialists requests that a Commission statement be included on its st

### Run the alignment on the first 7 English sentences vs the first 8 German sentences of Chapter 2

In [193]:
# Chapter 2: first 7 English sentences against the first 8 German sentences.
align_and_show(chap=2, s1=7, s2=8)

Alignment of sentences English[2][0:7] to sentences German[2][0:8]:
[(0, 0), (1, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7)]
Mappings from English sentences
0 -> [0]
1 -> [1, 2]
2 -> [3]
3 -> [4]
4 -> [5]
5 -> [6]
6 -> [7]

Text of the aligned sentences:
0 Safety advisers for the transport of dangerous goods The next item is the report ( A5-0105 / 1999 ) by Mr Koch , on behalf of the Committee on Regional Policy , Transport and Tourism , on the common position adopted by the Council with a view to adopting a European Parliament and Council directive on the harmonisation of examination requirements for safety advisers for the transport of dangerous goods by road , rail or inland waterways ( C5-0208 / 1999 - 1998 / 0106 ( COD ) ) .
---
0 Sicherheitsberater für den Gefahrguttransport Nach der Tagesordnung folgt der Bericht ( A5-0105 / 1999 ) von Herrn Koch im Namen des Ausschusses für Regionalpolitik , Verkehr und Fremdenverkehr über den Gemeinsamen Standpunkt des Rates im Hinblic