In [1]:
import re
from difflib import SequenceMatcher

In [2]:
# Characters that are emitted as stand-alone tokens: Ethiopic punctuation
# first, then ASCII / typographic marks.
# The multi-character entry '...' can never equal a single character, so an
# ellipsis in the text comes out as three separate '.' tokens.
punkset = [ '፧', '፥', '።', '፥', '፤', '፣', '፦', '፡', '፨', '፠', 
                     '?', ',', '.', '!', '(', ')', '[', ']', ';', ':', '“', '”', 
                     '"', '’', "'", '¶', '/', '...', '>', '<', '{', '}', '—', '-' ]

def tokenize(doc):
    """
    Split a document into word and punctuation tokens.

    Runs of whitespace separate tokens and are dropped. Every character
    listed in ``punkset`` becomes a token of its own, even when it is
    attached to a word (``"it's"`` -> ``["it", "'", "s"]``).

    :param doc: the text to tokenize
    :return: the list of token strings, in document order
    """
    # Built on every call so that later edits to ``punkset`` are honoured;
    # a set gives constant-time membership tests per character.
    punctuation = set(punkset)

    tokens = []
    pending = []  # characters of the word currently being read

    # str.split() with no argument breaks on any whitespace run, so each
    # chunk is whitespace-free and only punctuation can split it further.
    for chunk in doc.split():
        for c in chunk:
            if c in punctuation:
                if pending:
                    tokens.append(''.join(pending))
                    pending = []
                tokens.append(c)
            else:
                pending.append(c)
        # End of a whitespace-delimited chunk closes the current word
        if pending:
            tokens.append(''.join(pending))
            pending = []

    return tokens

In [119]:
# Tokenization test: one parallel Tigrinya / English sentence pair
doc_en = 'What did you do on your 16th birthday?'
doc_ti = 'ኣብ መበል 16 ዓመተ-ልደትካ እንታይ ጌርካ፧'
# Show both sides with an empty line between them
print(doc_ti, doc_en, sep='\n\n')

ኣብ መበል 16 ዓመተ-ልደትካ እንታይ ጌርካ፧

What did you do on your 16th birthday?


In [3]:
#Tokenize document
# NOTE(review): absolute, machine-specific paths; consider a configurable data directory.
doc_in = "/Users/alp/Documents/TWB/play/tigrinya/data/twb.ti"
doc_out = "/Users/alp/Documents/TWB/play/tigrinya/data/twb.tok.ti"
# The corpus is Ethiopic text, so the encoding is pinned to UTF-8 rather than
# left to the platform's locale default.
with open(doc_in, 'r', encoding='utf-8') as f_in, open(doc_out, 'w', encoding='utf-8') as f_out:
    lines_ti = f_in.readlines()
    
    for line_ti in lines_ti:
        line_tokens = tokenize(line_ti)
        # One output line per input line, tokens separated by single spaces
        f_out.write(' '.join(line_tokens) + '\n')

In [None]:
def split_to_verses(string1, string2):
    """
    Cut two strings at the stretches of text they have in common.

    Both inputs are whitespace-normalized first. Every matching block found
    by ``SequenceMatcher`` whose stripped text is longer than one character
    acts as a separator ("mark"): the text before it on each side becomes a
    verse, and scanning resumes right after the block.

    :param string1: first string
    :param string2: second string
    :return: ``(verses_1, verses_2, marks)``, three lists of equal length.
             ``marks[i]`` is the stripped shared text that ended verse ``i``;
             the last verse is the remaining tail and its mark is ``""``.
    """
    str1 = ' '.join(string1.split())
    str2 = ' '.join(string2.split())

    verses_1, verses_2, marks = [], [], []
    cursor_1 = cursor_2 = 0  # where the next verse starts on each side

    matcher = SequenceMatcher(None, str1, str2)
    for start_1, start_2, size in matcher.get_matching_blocks():
        end_1 = start_1 + size
        shared = str1[start_1:end_1].strip()
        # Skip empty or single-character overlaps, including the zero-size
        # block that terminates the list.
        if len(shared) <= 1:
            continue

        verses_1.append(str1[cursor_1:start_1])
        verses_2.append(str2[cursor_2:start_2])
        marks.append(shared)

        cursor_1 = end_1
        cursor_2 = start_2 + size

    # Whatever follows the last mark is the final verse
    verses_1.append(str1[cursor_1:])
    verses_2.append(str2[cursor_2:])
    marks.append("")

    return verses_1, verses_2, marks

In [132]:
verses_ti, verses_en, marks = split_to_verses(doc_ti, doc_en)

# Each verse pair is followed by the shared text that ended it, shown between bars
for vt, ve, m in zip(verses_ti, verses_en, marks):
    print(vt, ve, "", "|" + m + "|", "", sep="\n")

ኣብ መበል
What did you do on your

|16|

 ዓመተ-ልደትካ እንታይ ጌርካ፧
th birthday?

||



In [146]:
# NOTE(review): absolute, machine-specific paths; consider a configurable data directory.
file_en = "/Users/alp/Documents/TWB/play/tigrinya/Tigrinya-Parallel-Corpus/v1/EN_all"
file_ti = "/Users/alp/Documents/TWB/play/tigrinya/Tigrinya-Parallel-Corpus/v1/TI_all"

# An English line with more than this many whitespace-separated words is "long"
LONGLINE_NOTOKENS = 100

count = 0  # number of long lines seen so far
# Encoding is pinned to UTF-8 so the Ethiopic side does not depend on the
# platform's locale default.
with open(file_en, 'r', encoding='utf-8') as f_en, open(file_ti, 'r', encoding='utf-8') as f_ti:
    lines_en = f_en.readlines()
    lines_ti = f_ti.readlines()
    
    # zip() stops at the shorter file; lines are paired by position
    for line_no, (line_en, line_ti) in enumerate(zip(lines_en, lines_ti)):
        #normalize lines
        str_en = ' '.join(line_en.split())
        str_ti = ' '.join(line_ti.split())
        
        nontokens_en = str_en.split()
        
        # Only the English side decides whether a pair is "long"
        if len(nontokens_en) <= LONGLINE_NOTOKENS:
            continue

        count += 1

        print(line_no + 1)
        print(str_en)
        print(str_ti)

        # split_to_verses returns the verses of its first argument first,
        # so the English verses come first here.
        verses_en, verses_ti, marks = split_to_verses(str_en, str_ti)

        if len(verses_ti) > 1:
            print("%i verses split from %s"%(len(verses_ti), marks))
        else:
            print("///////////Long and no match//////////////")

        print()
        # Pause after every 10 long lines; 'q' or 'Q' stops the scan
        if count % 10 == 0:
            print("Continue?")
            a = input("...")
            if a == 'q' or a == 'Q':
                break
            

193
Allergies are among the most common chronic conditions worldwide . Allergy symptoms range from making you miserable to putting you at risk for life - threatening reactions . According to the leading experts in allergy , an allergic reaction begins in the immune system . Our immune system protects us from invading organisms that can cause illness . If you have an allergy , your immune system mistakes an otherwise harmless substance as an invader . This substance is called an allergen . The immune system overreacts to the allergen by producing Immunoglobulin E antibodies . These antibodies travel to cells that release histamine and other chemicals , causing an allergic reaction .
ቑጥዐታት ገለ ካብ እቶም ዓለምለኻዊ ልሙዳት ዝኾኑ ሕዱር ኩነታት እዮም ። ምልክታት ቁጥዐ ኣብ ሽግር ካብ ምእታው ክሳብ ንሂወት ፈታኒ ሓደጋ ይዝርግሑ ። ብመሰረት ቀንዲ ሙኩራት ናይ ቁጥዐ ፡ ቁጠዓዊ ግብረመልሲ ኣብ ስርዓተ ምክልኻል እዩ ዝጅምር ። ስርዓተ ምክልኻልና ሕማም ከስዕቡ ካብ ዝኽእሉ ወረርቲ ታህዋስያን ይከላኸል ። ቁጥዐ እንተደኣ ኣልዩካ ፡ ስርዓተ ምክልኻልካ ንሓደ ዘይተጓዳኢ ነገር ብጌጋ ከም ወራሪ ይወስዶ ። እዚ ነገር ከኣ ኣቖታዒ እዩ ዝበሃል ። እቲ ስርዓተ ምክልኻል ኢሚ

...
622
Diplomacy might also play a role. Even when the US and the Soviet Union were bitter ideological enemies during the Cold War, they were able to negotiate agreements. Given the authoritarian nature of the Russian political system, it could be meaningless to agree not to interfere in Russian elections. Nonetheless, it might be possible to establish rules that limit the intensity and frequency of information attacks. During the Cold War, the two sides did not kill each other’s spies, and the Incidents at Sea Agreement limited the level of harassment involved in close naval surveillance. Today, such agreements seem unlikely, but they are worth exploring in the future.
ዲፕሎማሲ ውን ተራ ክህልዎ ይኽእል ኢዩ። ወላ ኣብቲ ሕቡራት መንግስታት ኣመሪካን ሕብረት ሶቬትን ብናይ ኣታሓሳስባ ኣይዲዮሎጂ ጽልኢ ኣብ ዝነበራሉ፡ተዛትየን ክስማማዓ ይኽእላ ኢየን። ምልካዊ ዓይነት ናይ ሩስያ ፖለቲካዊ ስርዓት መንግስቲ ኣብ ግምት ይእቲኻ፡ ኣብ ምርጫ ናይ ሩስያ ምትእትታው ንዘይምግባር ምስምማዕ ዋጋ ዘለዎ ኣይመስልን። እዚይኹን እምበር ንቕልጣፈን ብዝሕን ናይ ሓበሬታ መጥቃዕቲ ዝቆጻጸር ሕጊን ኣገባብን ግን ይካኣል ኢዩ። ኣብ ግዜ ዝሑል ኩናት እዞም ክልተ ወገናት ነናይሕድሕዶም ሰለይቲ ኣይተ

...
670
According to Strabo's Geographica, before the expansion of the Roman Republic, the name was used by Greeks to indicate the land between the strait of Messina and the line connecting the gulf of Salerno and gulf of Taranto, corresponding roughly to the current region of Calabria. Later the term was extended by Romans to include the Italian Peninsula up to the Rubicon, a river located between Northern and Central Italy. In 49 BC, with the Lex Roscia, Julius Caesar gave Roman citizenship to the people of the Cisalpine Gaul,[34] while in 42 BCE the hitherto existing province was abolished, thus extending Italy to the north up to the southern foot of the Alps.
ብመሰረት ስትራቦ ጂኦግራፊካ ፥ ቅድሚ ምስፍሕፋሕ ናይ ግዝኣተ ሮማ እቲ ስም ነቲ ምስናይ ሕጂ ዞባ ካላብርያ ዝሳነ፥ ኣብ መንጎ መጻብቦ መሲናን ንወሽመጥ ሳለርኖን ወሽመጥ ታራንቶን ዘራኽብ መስመርን ዝርከብ መሬት ይምልከት ነበረ። ጸኒሑ ድማ እቲ ቅጽል ብሮማውያን፥ ን ኣብ ሰሜናውን ማእከላይ ጥልያን ዝረክብ ፈለግ ማለት ወሽመጥ ጥልያን ክሳብ ሩቢኮን ዘሎ ኣጠቃለለ። ኣብ 49 ቅ.ል.ክ፥ምስ ሌክስ ሪዚካ፥ ጁልየስ ሴዛር ን ህዝቢ ሲሳልፒነ ጋውል፥{34} ሮማዊ ዜግነት ሃበ፥ይኹን እምበር ኣብ 42 ቅ.ል.ክ እቲ ዞባ ሂተትሮ 

...
1851
And know that you 're sick , you 're not weak , and it 's an issue , not an identity , because when you get past the fear and the ridicule and the judgment and the stigma of others , you can see depression for what it really is , and that 's just a part of life , just a part of life , and as much as I hate , as much as I hate some of the places , some of the parts of my life depression has dragged me down to , in a lot of ways I 'm grateful for it .
ምሕማምካ እንተፈሊጥካ፡ ኣይደኸምካን ፡ኣዚ ድማ ዛዕባ ዳኣ እምበር መንነት ኣይኮነን፡ መኽንያቱ ብመገዲ ፍርሒ፡ላግጺ፡ፍርድን ናይ ካልኦት ስባት ምስእትሓልፍ ናይ ኣእምሮ ጭንቀት እንታይ ምዃኑ ትርዳእ፡እዚ ድማ ኣካል ሂወትና ኢዩ፡ ከምቲ ብዝሒ ዝጸልኦ ፡ ከምቲ ብዝሒ ንካልኦት ቦታታት ዝጸልኦ ፡ንገለ ክፋል ሂወተይ ጭንቀት ንድሕሪት ጎቲቱኒ ኢዩ፡ ኣብ ብዙሕ ካልእ ነገራት ድማ የመስግኖ ኢየ።
///////////Long and no match//////////////

1853
My pain , more than anything in 19 years on this planet , has given me perspective , and my hurt , my hurt has forced me to have hope , have hope and to have faith , faith in myself , faith in others , faith that it can get better , that we c

...
2858
In this era, one of the biggest global challenges has the mass extinction of languages. Subsequently, the rich values and traditions embedded in them became obsolete. A lot of efforts have been undertaken to save smaller languages from death. These efforts included, recording and documentation of corpora from written and oral sources. The efforts of Travis Foundation to digitize smaller languages, like Tigrinya is no doubt, complementary to these efforts. Various texts written in Tigrinya are being translated in to English or English texts are being translated in to Tigrinya and made available digitally, parallel to each other. I suppose, a lot of parties could benefit from this project. Tourists, researchers, investors and other individuals who need language assistance wherever they go, the Travis products are optimal and efficient to use. The digitized corpses also help refugees with poor English language fluency with translation. To put it in a nut shell, the role of the Tr

...q


In [16]:
# Scratch check: list concatenation keeps duplicates, slicing recovers the tail
a = [1, 2, 3]
b = [3, 4, 5]
c = [*a, *b]
print(c)
c[len(a):]

[1, 2, 3, 3, 4, 5]


[3, 4, 5]

In [36]:
#Duplication checking
def pick_unique_testset(allset_src, allset_tgt, testsetsize, check_duplication_in):
    """
    Pick a test set of parallel items that have no duplicate in the data.

    Items are visited in order. An item is taken when its value on the
    checked side does not occur anywhere else on that side. Selection stops
    once ``testsetsize`` items have been taken or the data is exhausted.
    Progress is printed for every item checked.

    :param allset_src: source-side items
    :param allset_tgt: target-side items, parallel to ``allset_src``
    :param testsetsize: number of items to pick
    :param check_duplication_in: 0 to look for duplicates on the source
                                 side, anything else for the target side
    :return: ``(testset_src, testset_tgt, restset_src, restset_tgt)``; the
             "rest" lists hold every item that was not picked, in order
    """
    # Side on which duplicates are looked for
    pool = allset_src if check_duplication_in == 0 else allset_tgt

    testset_src = []
    testset_tgt = []
    grabbed = set()  # indices already moved into the test set

    for i in range(len(allset_src)):
        s = pool[i]
        print("Checking", s)

        # Compare against every other position without copying the list;
        # equality-based so unhashable items keep working.
        found_duplicate = any(s == x for j, x in enumerate(pool) if j != i)
        if found_duplicate:
            print("Found duplicate of", s)
            continue

        testset_src.append(allset_src[i])
        testset_tgt.append(allset_tgt[i])
        grabbed.add(i)

        if len(grabbed) == testsetsize:
            break

    restset_src = [s for i, s in enumerate(allset_src) if i not in grabbed]
    # Built from the target side so the remainder stays parallel
    restset_tgt = [t for i, t in enumerate(allset_tgt) if i not in grabbed]

    return testset_src, testset_tgt, restset_src, restset_tgt

In [38]:
# Toy parallel data with repeated values on both sides, used to exercise
# the duplicate check.
alls_src = [
    1, 2, 3, 4, 5, 6, 2,
    5, 6, 7, 5, 4, 5, 2,
    6, 7, 8, 12, 13, 14, 15,
]
alls_tgt = list('abbdefthijklmehprshuv')

In [39]:
# Pick 5 pairs whose target-side value is unique (check_duplication_in=1)
t_s, t_t, r_s, r_t = pick_unique_testset(
    alls_src,
    alls_tgt,
    testsetsize=5,
    check_duplication_in=1,
)

Checking a
Checking b
Found duplicate of b
Checking b
Found duplicate of b
Checking d
Checking e
Found duplicate of e
Checking f
Checking t
Checking h
Found duplicate of h
Checking i


In [32]:
# NOTE(review): `grabbed` is not assigned at notebook level in any visible cell
# (inside pick_unique_testset it is only a local), so this cell presumably
# relied on a variable left over from an earlier cell. Confirm before re-running.
[s for i, s in enumerate(alls_src) if i not in grabbed]

[1, 5, 6, 2, 5, 6, 7, 5, 4, 5, 2, 6, 7, 8, 12, 13, 14, 15]