In [None]:
%pip install pandas morfessor 

In [None]:
#unsupervised
import pandas as pd
import morfessor
import subprocess
import os

def train(type="standard", morph_length=-1, num_morph_types=-1, bin_model_path="models\m_morfessor.bin", training_file_path="data\\training_morfessor_cli.txt"):
  if type=="standard":
    train_res = subprocess.run(['morfessor-train.bat', '-s', bin_model_path, '--traindata-list', training_file_path])
  elif type=="morph_length" and morph_length>0:
    train_res = subprocess.run(['morfessor-train.bat', '-s', bin_model_path,'--morph-length', f'{morph_length}', '--traindata-list', training_file_path])
  elif type=="num_morph_types" and num_morph_types>0:
     train_res = subprocess.run(['morfessor-train.bat', '-s', bin_model_path,'--num-morph-types', f'{num_morph_types}', '--traindata-list', training_file_path])  
  else:
     print("Error in the training arguments!")


def segment(bin_model_path="models\m_morfessor.bin", input_tsv="data\\eng.sentence.test.gold.tsv", output_tsv="outputs\\eng.sentence.test.morfessor_guess.tsv"):

  io = morfessor.MorfessorIO()
  model = io.read_binary_model_file(bin_model_path)

  df_guess = pd.read_csv(input_tsv, sep='\t', header=None)
  data_guess = df_guess[0].astype(str)
  i = 0

  #segment on words

  for entry in data_guess:
    sent = ''
    j = 0
    words = entry.split() #list of words
    length_sent = len(words)  #number of words

    for word in words:
      list_word = model.viterbi_segment(word)[0]
      k = 0
      length_word = len(list_word)  #number of morphs

      for morph in list_word:
        if k != length_word - 1:  #not last morph
          sent += (morph + ' @@')
          k+=1
        else: #last morph
          if j != length_sent - 1:
            sent += (morph + ' ')
          else :
            sent +=morph
      j+=1

    df_guess[1][i] = sent
    i+=1

  df_guess.to_csv(output_tsv, sep='\t', header=None, index = False)


def evaluate(type="standard", gld="data\\eng.sentence.test.gold.tsv", gs="outputs\\eng.sentence.test.morfessor_guess.tsv", store="outputs\\output.json", morph_length=-1, num_morph_types=-1):
  import argparse
  args = argparse.Namespace(
      gold=gld,
      guess=gs,
      output=store,
      category=False 
  )
  import evaluate, json, os
  stats = evaluate.main(args)
  model_name = ""
  if type=="standard":
     model_name = "morfessor_standard"
  elif type=="morph_length" and morph_length>0:
     model_name = f"morfessor_len_{morph_length}"
  elif type=="num_morph_types" and num_morph_types>0:
     model_name = f"morfessor_types_{num_morph_types}"
  else:
     print("Error in arguments!")
  new_stats = {"model": model_name}
  new_stats.update(stats)
  data = {"data": []}
  if os.path.exists(args.output):
      with open(args.output, 'r') as output_file:
          data = json.load(output_file)

  # Skip adding data point if already present
  if not any(item.get("model") == model_name for item in data["data"]):
      data["data"].append(new_stats)
      data["data"] = sorted(data["data"], key=lambda x: x["model"])
      with open(args.output, 'w') as output_file:
          json.dump(data, output_file, indent=4)


In [None]:
#Add the standard evaluation without any flags to the JSON file
train()
segment()
evaluate()

In [None]:
# Add the evaluations for different --morph-length values to the JSON file

# Find the average length of a word in the training file. Avg. morph length <= Avg. word length
total_len = 0
total_words = 0
with open("data\\training_morfessor_cli.txt", "r", encoding="utf-8") as f:
    for word in f: 
        total_words +=1
        total_len += len(word)
avg_len = 0
if total_words!=0:
    avg_len = int(total_len / total_words)
print(avg_len)

In [None]:
for i in range(1,avg_len + 1):
    train(type="morph_length", morph_length=i)
    segment()
    evaluate(type="morph_length", morph_length=i)

In [12]:
#Calculate how many distinct words are in the training file. num_morph_types <= num_distinct_words
distinct_words = []
with open("data\\training_morfessor_cli.txt", "r", encoding="utf-8") as file:
    for word in file:
        if word not in distinct_words:
            distinct_words.append(word)
num_distinct_words = len(distinct_words)
print(num_distinct_words)

17324


In [13]:
for j in range(500, num_distinct_words, 1000):
    train(type="num_morph_types", num_morph_types=j)
    segment()
    evaluate(type="num_morph_types", num_morph_types=j)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_guess[1][i] = sent




category: all
distance	17.91
f_measure	21.37
precision	15.47
recall	34.56


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_guess[1][i] = sent




category: all
distance	10.26
f_measure	39.86
precision	32.14
recall	52.46


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_guess[1][i] = sent




category: all
distance	8.31
f_measure	47.18
precision	39.55
recall	58.46


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_guess[1][i] = sent




category: all
distance	7.35
f_measure	52.14
precision	44.67
recall	62.60


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_guess[1][i] = sent




category: all
distance	6.94
f_measure	53.37
precision	46.32
recall	62.94


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_guess[1][i] = sent




category: all
distance	6.17
f_measure	58.32
precision	51.62
recall	67.03


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_guess[1][i] = sent




category: all
distance	5.95
f_measure	59.02
precision	52.59
recall	67.24


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_guess[1][i] = sent




category: all
distance	5.68
f_measure	60.74
precision	54.54
recall	68.54


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_guess[1][i] = sent




category: all
distance	4.78
f_measure	67.67
precision	62.25
recall	74.13


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_guess[1][i] = sent




category: all
distance	4.63
f_measure	68.46
precision	63.59
recall	74.13


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_guess[1][i] = sent




category: all
distance	3.70
f_measure	74.86
precision	71.97
recall	77.98


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_guess[1][i] = sent




category: all
distance	3.14
f_measure	78.69
precision	77.60
recall	79.81


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_guess[1][i] = sent




category: all
distance	2.70
f_measure	81.49
precision	82.52
recall	80.49


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_guess[1][i] = sent




category: all
distance	2.64
f_measure	81.49
precision	84.16
recall	78.98


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_guess[1][i] = sent




category: all
distance	2.69
f_measure	80.65
precision	84.60
recall	77.05


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_guess[1][i] = sent




category: all
distance	2.77
f_measure	79.79
precision	84.56
recall	75.53


You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  df_guess[1][i] = sent




category: all
distance	2.98
f_measure	77.10
precision	82.93
recall	72.03
