## Measure progress during annotation

In [1]:
from pathlib import Path

# Source poems and their annotated counterparts live in separate directories.
sc = Path("poems/bokmål")
dst = Path("tita")

# Count the directory entries on each side without building the full listings.
tot = sum(1 for _ in sc.iterdir())
done = sum(1 for _ in dst.iterdir())

print(f"Annotated {done} of {tot}, aka {(done/tot)*100}%")

Annotated 508 of 508, aka 100.0%


## Compare source and annotation
I made some mistakes while substituting Danish words; the script below checks the annotations against the source so that the differences can be repaired.

In [29]:
# from pathlib import Path

# sc = Path("poems/bokmål")
# dst = Path("tita")


# sc_list = sorted(sc.iterdir(), key = lambda x: x.name[:-4])
# dst_list = sorted(dst.iterdir(), key = lambda x: x.name[:-14])


# for sc_file, dst_file in zip(sc_list, dst_list):
#     sc_stanzas = sc_file.read_text().split("\n\n")
#     dst_stanzas = dst_file.read_text().split("\n\n")
#     for sc_s, dst_s in zip(sc_stanzas, dst_stanzas):
#         dst_lines = dst_s.split("\n")
#         rhyme_code = dst_lines[0]
#         dst_lines = dst_lines[1:]
#         sc_lines = sc_s.split("\n")[:-1]

#         for i, (l1, l2) in enumerate(zip(sc_lines, dst_lines)):
#             if l1 != l2:
#                 print(f"{l1}\n{l2}\n\n")


## Count stanzas and lines

In [18]:
from pathlib import Path

dst = Path("tita")

# Every entry in the annotation directory counts as one poem.
poems = len(list(dst.iterdir()))

stanzas = 0
lines = 0
for poem_file in dst.iterdir():
    # Chunks are separated by blank lines; the final chunk is discarded.
    poem_stanzas = poem_file.read_text().split("\n\n")[:-1]
    stanzas += len(poem_stanzas)
    # The first line of each stanza is the rhyme code, so it is not counted.
    lines += sum(len(stanza.split("\n")) - 1 for stanza in poem_stanzas)


In [7]:
# Display the corpus totals as a (stanzas, lines, poems) tuple.
stanzas, lines, poems

(5158, 26198, 508)

In [19]:
import pandas as pd

dst = Path("tita")

# The rhyme table is a tab-separated file named after the annotation directory.
rhymes_tsv = f"tsvs/{dst.name}_rhymes_poems.tsv"
poem_df = pd.read_csv(rhymes_tsv, sep="\t")
poem_df

Unnamed: 0,rhyme scheme,stanza
0,AABCCBI,"Gud, la oss i din kunnskap fremmes,\nså det vi..."
1,TAABCCB,ANNEN SANG\nFra Landego seiles mot Skrova i no...
2,AABCCB,Og er det en høstkveld når mørket står på\nså ...
3,AABCCB,"Snart ligger og kaien der dyngvåt av regn,\nog..."
4,AABCCB,Og rett bort i øst ligger Svinøya gård\nhvor B...
...,...,...
5153,AABB,Lukket! Lukket! Øde veier! Gledens vinger brut...
5154,AABB,Og nå stod han her som fremmed i et fremmed la...
5155,AABB,"Opp av havets blanke bølger, som en vårfrisk m..."
5156,AABB,"Til Europa! Tanken slår ham plutselig som lyn,..."


## Count rhyme schemes

In [24]:
from pathlib import Path
from collections import Counter
import pandas as pd

# One rhyme-scheme string per row of the stanza table.
rhyme_schemes = list(poem_df["rhyme scheme"])

# Frequency of each distinct scheme, most frequent first.
c = Counter(rhyme_schemes)
l = c.most_common()

# Schemes carrying the title (T), info (I) and noise (N) codes.
title_schemes = [scheme for scheme in rhyme_schemes if "T" in scheme]
info_schemes = [scheme for scheme in rhyme_schemes if "I" in scheme]
noise_schemes = [scheme for scheme in rhyme_schemes if "N" in scheme]

print(f"Schemes with title: {len(title_schemes)}\nschemes with info: {len(info_schemes)}\nschemes with noise: {len(noise_schemes)}")
df = pd.DataFrame(l, columns = ["scheme", "count"])
df

Schemes with title: 349
schemes with info: 81
schemes with noise: 2


Unnamed: 0,scheme,count
0,ABAB,1059
1,ABCB,699
2,AABB,571
3,AABCCB,336
4,ABBA,170
...,...,...
593,ABCDBEC,1
594,ABAAC,1
595,TABBACC,1
596,TIABAC,1


## Remove title, info and noise code for rhyme scheme statistics

In [25]:
# Delete the title (T), info (I) and noise (N) codes from every scheme so that
# only the rhyme letters remain.
# NOTE(review): this removes every T, I and N character, so a scheme that used
# one of them as an ordinary rhyme letter would lose it too - confirm that the
# annotation never does.
marker_codes = str.maketrans("", "", "TIN")
new_rhyme_schemes = [scheme.translate(marker_codes) for scheme in rhyme_schemes]

# Frequency table of the cleaned schemes, most frequent first.
c2 = Counter(new_rhyme_schemes)
l2 = c2.most_common()
df2 = pd.DataFrame(l2, columns = ["scheme", "count"])
df2

Unnamed: 0,scheme,count
0,ABAB,1142
1,ABCB,749
2,AABB,611
3,AABCCB,360
4,ABBA,181
...,...,...
485,ABCDEFGHG,1
486,ABCDCEA,1
487,ABCDBEC,1
488,ABAAC,1


In [26]:
# Cleaned schemes that appear exactly once.
one_occurence = df2.loc[df2["count"].eq(1)]
one_occurence

Unnamed: 0,scheme,count
187,AABCDEFD,1
188,ABACBDCD,1
189,ABABBB,1
190,ABBBA,1
191,AABCCDDA,1
...,...,...
485,ABCDEFGHG,1
486,ABCDCEA,1
487,ABCDBEC,1
488,ABAAC,1


In [27]:
# Cleaned schemes that appear more than once.
not_one_occurence = df2.loc[df2["count"].gt(1)]
not_one_occurence

Unnamed: 0,scheme,count
0,ABAB,1142
1,ABCB,749
2,AABB,611
3,AABCCB,360
4,ABBA,181
...,...,...
182,ABCBCDD,2
183,ABCDABCD,2
184,ABAACCBBDDB,2
185,ABBCDE,2


##  Number of possible unique rhyme pairs

In [28]:
def no_rhyme(scheme):
    """Return True when no symbol in ``scheme`` occurs more than once."""
    distinct_symbols = set(scheme)
    return len(distinct_symbols) == len(scheme)

def get_edges(vertices):
    """Return the number of unordered pairs among ``vertices`` items.

    This is the edge count of a complete graph, n * (n - 1) / 2.

    Args:
        vertices: Number of items (a non-negative integer).

    Returns:
        The pair count as an ``int`` when ``vertices`` is an ``int``.
    """
    # n * (n - 1) is always even, so floor division is exact and keeps the
    # count an integer instead of turning it into a float.
    return vertices * (vertices - 1) // 2

# Sum the possible rhyme pairs over all stanzas: within one scheme, the lines
# sharing a letter can be paired in get_edges(n) ways, and each scheme
# contributes that many pairs once per occurrence.
tot_word_pairs = 0
for scheme, count in l2:
    # A dedicated name is used here so the loop does not rebind `c`, the
    # rhyme-scheme Counter created in an earlier cell.
    letter_counts = Counter(scheme)
    pairs_per_stanza = sum(get_edges(v) for v in letter_counts.values())
    tot_word_pairs += pairs_per_stanza * count

tot_word_pairs

12631.0

## Count unique line ending words

In [13]:
def get_line_ending_words(pathlib_file):
    """Collect the last whitespace-separated token of every verse line in a file.

    The file is split into stanzas on blank lines. The first line of each
    stanza is treated as the rhyme code and skipped; for every remaining line
    the final token is collected. Punctuation attached to a token is kept.

    Args:
        pathlib_file: A ``pathlib.Path``-like object providing ``read_text()``.

    Returns:
        A list of line-ending tokens in file order.
    """
    # TODO: USE TOKENIZER
    line_ending_words = []
    for stanza in pathlib_file.read_text().split("\n\n"):
        # Empty chunks (e.g. after a trailing blank line) contribute nothing,
        # so a file that does not end in a blank line keeps its last stanza.
        stanza_lines = stanza.strip("\n").split("\n")
        for line in stanza_lines[1:]:
            # split() ignores trailing whitespace, which would otherwise
            # produce an empty string as the "last word".
            tokens = line.split()
            if tokens:
                line_ending_words.append(tokens[-1])
    return line_ending_words

# Gather the line-ending words of every file in the annotation directory.
line_ending_words = []

for poem_file in dst.iterdir():
    line_ending_words.extend(get_line_ending_words(poem_file))

In [204]:
# Total number of line-ending words, followed by the number of distinct ones.
len(line_ending_words), len(set(line_ending_words))

(26198, 12524)