# Separate stanzas for annotation
## Step 1: gather random stanzas from the dataset

In [1]:
import string

import pandas as pd
import nltk

In [2]:
# ASCII punctuation plus guillemets, the em dash and a literal "..." run.
# Tokens are filtered with a substring test against this string.
punctuation = string.punctuation + "«»—..."


def tokenize_line(line):
    """Return the lowercased word tokens of ``line`` without punctuation tokens.

    The line is tokenized with NLTK's ``word_tokenize`` (language
    ``'norwegian'``). A token is dropped when it occurs as a substring of
    ``punctuation``; every remaining token is lowercased.
    """
    kept = []
    for token in nltk.tokenize.word_tokenize(line, language='norwegian'):
        if token in punctuation:
            continue
        kept.append(token.lower())
    return kept

In [3]:
# Draw a reproducible random sample of 20*4 stanzas from the corpus TSV.
n = 20*4
df = pd.read_csv(
    "../../norwegian_rhyme_scheme_corpus/tsvs/tita_rhymes_poems.tsv",
    sep="\t",
)
df = df.sample(n=n, random_state=1337)
schemes = list(df["rhyme scheme"])

# Tokenize every line of every sampled stanza and re-join the tokens with
# single spaces, keeping the stanza's line breaks.
tokenized = (
    "\n".join(" ".join(tokenize_line(verse)) for verse in raw.split("\n"))
    for raw in df.stanza
)

# Each entry is a (stanza, scheme) tuple.
stanzas = list(zip(tokenized, schemes))

# Consecutive groups of 20 stanzas.
group_size = 20
split_stanzas = [
    stanzas[start:start + group_size]
    for start in range(0, len(stanzas), group_size)
]
len(split_stanzas), len(split_stanzas[0])

(4, 20)

## Step 2: gather baseline generations (without rhyme)

In [4]:
# Baseline generations: every blank-line-separated chunk is used whole as a stanza.
# NOTE(review): the file is read with the platform default encoding — confirm
# it matches the encoding the file was written with.
with open("baseline_poetry_no_rhyme_stanza.txt") as f:
    text = f.read().split("\n\n")

# Only the first 40 chunks are kept.
baseline_stanzas = text[:40]

# Consecutive groups of 10 stanzas.
baseline_group_size = 10
split_baseline_stanzas = [
    baseline_stanzas[start:start + baseline_group_size]
    for start in range(0, len(baseline_stanzas), baseline_group_size)
]
len(split_baseline_stanzas), len(split_baseline_stanzas[0])

(4, 10)

## Step 3: gather line-based verse generations

In [5]:
# Line-based generations: chunks are separated by an empty line, and the first
# line of each chunk is taken as its scheme.
with open("generated_rhyming_poetry_rhyme_gen_norsc_big_9_buckets_line.txt") as f:
    text = f.read().split("\n\n")[:40]

# Split every chunk once into (first line, "\n", remaining lines).
line_parts = [chunk.partition("\n") for chunk in text]
line_schemes = [header for header, _, _ in line_parts]

# Each entry is a (stanza, scheme) tuple.
line_stanzas = [(body, header) for header, _, body in line_parts]

# Consecutive groups of 10 stanzas.
line_group_size = 10
split_line_stanzas = [
    line_stanzas[start:start + line_group_size]
    for start in range(0, len(line_stanzas), line_group_size)
]
len(split_line_stanzas), len(split_line_stanzas[0])

(4, 10)

## Step 4: gather stanza-based verse generations

In [6]:
# Stanza-based generations: chunks are separated by a line holding a single
# space ("\n \n"), and the first line of each chunk is taken as its scheme.
with open("generated_rhyming_poetry_rhyme_gen_norsc_big_9_buckets_stanza.txt") as f:
    text = f.read().split("\n \n")[:40]

# Split every chunk once into (first line, "\n", remaining lines).
stanza_parts = [chunk.partition("\n") for chunk in text]
stanza_schemes = [header for header, _, _ in stanza_parts]

# Each entry is a (stanza, scheme) tuple.
stanza_stanzas = [(body, header) for header, _, body in stanza_parts]

# Consecutive groups of 10 stanzas.
stanza_group_size = 10
split_stanza_stanzas = [
    stanza_stanzas[start:start + stanza_group_size]
    for start in range(0, len(stanza_stanzas), stanza_group_size)
]
len(split_stanza_stanzas), len(split_stanza_stanzas[0])

(4, 10)

# Split into four annotation schemas

In [7]:
import random

# Seed once so the random position of the corpus stanza inside each pair is
# reproducible across runs.
random.seed(1337)

# Build one annotation form per group j. Each form consists of
#   * side_by_side_form_{j+1}.tsv : 15 corpus stanzas paired with 15 generated ones,
#   * rest_{j+1}.tsv              : 20 remaining stanzas shown one at a time,
#   * schema_{j+1}                : a plain-text rendering of both tables.
for j in range(4):
    orig = split_stanzas[j]
    base = split_baseline_stanzas[j]
    line = split_line_stanzas[j]
    stanz = split_stanza_stanzas[j]

    # ---- side-by-side part -------------------------------------------------
    side_by_side_orig = orig[:15]
    stanza_o, scheme_o = zip(*side_by_side_orig)

    side_by_side_gen = line[:5] + stanz[:5]
    stanza_g, scheme_g = zip(*side_by_side_gen)
    # Baseline entries are plain strings without a scheme, so "-" is used as
    # their scheme placeholder.
    stanza_g = list(stanza_g) + base[:5]
    scheme_g = list(scheme_g) + ["-"]*5

    df = pd.DataFrame({"type": ["orig"]*15 + ["line"]*5 + ["stanza"]*5 + ["base"]*5,
                       "scheme": list(scheme_o) + scheme_g,
                       "stanza": list(stanza_o) + stanza_g})

    # shuffle the generated stanzas
    gen_is = df.loc[df["type"] != "orig"].sample(n=len(df)//2, random_state=1337).index

    # Corpus rows keep their own row position as pair number; every generated
    # row receives its position in the shuffled order, which pairs it with the
    # corpus row carrying the same number.
    pair_index = list(range(len(df)))
    for i, g in enumerate(gen_is):
        pair_index[g] = i

    df["pair_number"] = pair_index

    # Generated rows always get 1; corpus rows get 0 or 2 at random, so after
    # sorting the corpus stanza ends up randomly first or second in its pair.
    order_in_pair = [random.choice([0, 2]) for _ in range(len(df)//2)] + [1]*(len(df)//2)
    df["order_in_pair"] = order_in_pair
    df = df.sort_values(by=["pair_number", "order_in_pair"])

    # ---- single-stanza part ------------------------------------------------
    rest = orig[15:] + line[5:] + stanz[5:]
    rest_stanz, rest_scheme = zip(*rest)
    rest_stanz = list(rest_stanz) + base[5:]
    rest_scheme = list(rest_scheme) + ["-"]*5

    df2 = pd.DataFrame({"type": ["orig"]*5 + ["line"]*5 + ["stanza"]*5 + ["base"]*5,
                        "scheme": rest_scheme, "stanza": rest_stanz})

    # Sampling all rows shuffles the table.
    df2 = df2.sample(n=len(df2), random_state=1337)

    df.to_csv(f"side_by_side_form_{j+1}.tsv", sep="\t")
    df2.to_csv(f"rest_{j+1}.tsv", sep="\t")

    # ---- plain-text rendering ----------------------------------------------
    # The 8-space padding and the trailing blanks reproduce the layout of the
    # text that was emitted before; only the way the string is assembled
    # (one join instead of repeated +=) has changed.
    pad = " " * 8
    chunks = []
    for e in df.itertuples():
        chunks.append(
            "\n"
            f"{pad}PAIR NUMBER: {e.pair_number} \n"
            f"{pad}type: {e.type}\t scheme: {e.scheme}\n"
            f"{pad}\n"
            f"{pad}{e.stanza}\n"
            f"{pad}\n"
            f"{pad}\n"
            f"{pad}"
        )

    chunks.append("-------------SINGLE STANZAS-------------")
    for e in df2.itertuples():
        chunks.append(
            f"{pad}\n"
            f"{pad}type: {e.type}\t scheme: {e.scheme}\n"
            f"{pad}\n"
            f"{pad}{e.stanza}\n"
            f"{pad}\n"
            f"{pad}\n"
            f"{pad}"
        )
    s = "".join(chunks)

    # Write as UTF-8 explicitly: the stanzas contain non-ASCII characters and
    # the TSVs written above by to_csv are UTF-8 as well, whereas open()
    # without an encoding falls back to the platform's locale encoding.
    with open(f"schema_{j+1}", "w", encoding="utf-8") as f:
        f.write(s)