In [None]:
%pip install pandas morfessor 

In [None]:
#unsupervised
import pandas as pd
import morfessor
import subprocess
import os

def train(type="standard", morph_length=-1, num_morph_types=-1, bin_model_path="models\m_morfessor.bin", training_file_path="data\\training_morfessor_cli.txt"):
  if type=="standard":
    train_res = subprocess.run(['morfessor-train.bat', '-s', bin_model_path, '--traindata-list', training_file_path])
  elif type=="morph_length" and morph_length>0:
    train_res = subprocess.run(['morfessor-train.bat', '-s', bin_model_path,'--morph-length', f'{morph_length}', '--traindata-list', training_file_path])
  elif type=="num_morph_types" and num_morph_types>0:
     train_res = subprocess.run(['morfessor-train.bat', '-s', bin_model_path,'--num-morph-types', f'{num_morph_types}', '--traindata-list', training_file_path])  
  else:
     print("Error in the training arguments!")


def segment(bin_model_path="models\m_morfessor.bin", input_tsv="data\\eng.sentence.test.gold.tsv", output_tsv="outputs\\eng.sentence.test.morfessor_guess.tsv"):

  io = morfessor.MorfessorIO()
  model = io.read_binary_model_file(bin_model_path)

  df_guess = pd.read_csv(input_tsv, sep='\t', header=None)
  data_guess = df_guess[0].astype(str)
  i = 0

  #segment on words

  for entry in data_guess:
    sent = ''
    j = 0
    words = entry.split() #list of words
    length_sent = len(words)  #number of words

    for word in words:
      list_word = model.viterbi_segment(word)[0]
      k = 0
      length_word = len(list_word)  #number of morphs

      for morph in list_word:
        if k != length_word - 1:  #not last morph
          sent += (morph + ' @@')
          k+=1
        else: #last morph
          if j != length_sent - 1:
            sent += (morph + ' ')
          else :
            sent +=morph
      j+=1

    df_guess[1][i] = sent
    i+=1

  df_guess.to_csv(output_tsv, sep='\t', header=None, index = False)


def evaluate(type="standard", gld="data\\eng.sentence.test.gold.tsv", gs="outputs\\eng.sentence.test.morfessor_guess.tsv", store="outputs\\output.json", morph_length=-1, num_morph_types=-1):
  import argparse
  args = argparse.Namespace(
      gold=gld,
      guess=gs,
      output=store,
      category=False 
  )
  import evaluate, json, os
  stats = evaluate.main(args)
  model_name = ""
  if type=="standard":
     model_name = "morfessor_standard"
  elif type=="morph_length" and morph_length>0:
     model_name = f"morfessor_len_{morph_length}"
  elif type=="num_morph_types" and num_morph_types>0:
     model_name = f"morfessor_types_{num_morph_types}"
  else:
     print("Error in arguments!")
  new_stats = {"model": model_name}
  new_stats.update(stats)
  data = {"data": []}
  if os.path.exists(args.output):
      with open(args.output, 'r') as output_file:
          data = json.load(output_file)

  # Skip adding data point if already present
  if not any(item.get("model") == model_name for item in data["data"]):
      data["data"].append(new_stats)
      data["data"] = sorted(data["data"], key=lambda x: x["model"])
      with open(args.output, 'w') as output_file:
          json.dump(data, output_file, indent=4)


In [None]:
#Add the standard evaluation without any flags to the JSON file
train()
segment()
evaluate()

In [6]:
# Add the evaluations for different --morph-length values to the JSON file

# Find the average length of a word in the training file
total_len = 0
total_words = 0
with open("data\\training_morfessor_cli.txt", "r", encoding="utf-8") as f:
    for word in f: 
        total_words +=1
        total_len += len(word)
avg_len = 0
if total_words!=0:
    avg_len = int(total_len / total_words)
print(avg_len)

5


In [None]:
for i in range(1,avg_len + 1):
    train(type="morph_length", morph_length=i)
    segment()
    evaluate(type="morph_length", morph_length=i)