# Guided-MT Code2Vec Evaluation

This notebook walks over the experiment outputs, extracts the data, and produces plots.

Expected Layout:

```
.
├── README.md
├── data
│   └── random-MRR-max
│       ├── seed-2880
│       │   ├── data
│       │   │   ├── gen0
│       │   │   │   ├── 3b2459
│       │   │   │   ├── 3b2459.json
│       │   │   │   ├── 447e22
│       │   │   │   ├── 447e22.json
│       │   │   │   ├── 4495c7
│       │   │   │   ├── 4495c7.json
│       │   │   │   ├── 52667b
│       │   │   │   ├── 52667b.json
│       │   │   │   ├── 6855ba
│       │   │   │   ├── 6855ba.json
│       │   │   │   ├── 68ec75
│       │   │   │   ├── 68ec75.json
│       │   │   │   ├── 6cc14d
│       │   │   │   ├── 6cc14d.json
│       │   │   │   ├── 6d6845
│       │   │   │   ├── 6d6845.json
│       │   │   │   ├── 7a2d67
│       │   │   │   ├── 7a2d67.json
│       │   │   │   ├── ed0dd9
│       │   │   │   └── ed0dd9.json
│       │   │   ├── gen1
│       │   │   ├── ...
│       │   │   ├── gen8
│       │   │   ├── ...
│       │   │   ├── generation_0
│       │   │   │   ├── Some.java
│       │   │   │   ├── ...
│       │   │   │   ├── Other.java
│       │   │   │   └── Different.java
│       │   │   └── initialGen
│       │   │       └── 3bf9ce
│       │   └── results.txt
│       ├── seed-5142
│       │   └── results.txt
│       └── ...
├── evaluation.ipynb
└── requirements.txt
```

In [None]:
import json
import os
import regex as re
import pandas as pd

# Root directory that is walked to find the experiment results.
# Important: Specify the directory without a "/" at the end!
# extract_experiment_from_path builds its search pattern from this exact string.
directory:str = "./data"

In [None]:
# Recursively collect the path of every .json result file below `directory`.
json_files:[str] = []

for root, dirs, files in os.walk(directory):
    for filename in files:
        # Check the extension only, so that names which merely contain
        # ".json" (for example "x.json.bak") are not picked up.
        if filename.endswith(".json"):
            json_files.append(os.path.join(root, filename))

print(f"found { len(json_files) } .json-files in {directory}")

In [None]:
# Example result path, used further down to try out the path helpers.
sample_path = "./data/random-MRR-max/seed-8991/data/gen4/5f7d0f.json"

def extract_seed_from_path(path:str) -> int:
    """Return the seed encoded in the first ``seed-<digits>`` part of ``path``.

    Args:
        path: Path of a result file, e.g.
            ``./data/random-MRR-max/seed-8991/data/gen4/5f7d0f.json``.

    Returns:
        The seed as an integer (``8991`` for the example above).

    Raises:
        ValueError: If ``path`` contains no ``seed-<digits>`` part.
    """
    match = re.search(r'seed-(\d+)', path)
    if match is None:
        raise ValueError(f"no 'seed-<number>' part found in path: {path!r}")
    # Convert to int so the value matches the declared return type.
    return int(match.group(1))

def extract_experiment_from_path(path:str) -> str:
    """Return the experiment name, i.e. the path part between ``directory`` and ``/seed``.

    Args:
        path: Path of a result file below the module-level ``directory``, e.g.
            ``./data/random-MRR-max/seed-8991/data/gen4/5f7d0f.json``.

    Returns:
        The text between the root directory and the first following ``/seed``
        (``random-MRR-max`` for the example above, with ``directory`` set to
        ``./data``).

    Raises:
        ValueError: If ``path`` does not contain ``directory`` followed by a
            ``/seed`` part.
    """
    # Escape the root so that regex metacharacters in it (such as the "." of
    # "./data") are matched literally, and tolerate a trailing "/" on it.
    root = directory.rstrip("/")
    match = re.search(re.escape(root) + r'/*(.*?)/seed', path)
    if match is None:
        raise ValueError(
            f"path {path!r} does not contain {root!r} followed by a '/seed' part"
        )
    return match.group(1)

def extract_generation_from_path(path:str) -> int:
    """Return the generation number encoded in the first ``gen<digits>`` part of ``path``.

    Args:
        path: Path of a result file, e.g.
            ``./data/random-MRR-max/seed-8991/data/gen4/5f7d0f.json``.

    Returns:
        The generation as an integer (``4`` for the example above), so that
        generations compare and sort numerically rather than as text.

    Raises:
        ValueError: If ``path`` contains no ``gen<digits>`` part.
    """
    match = re.search(r'gen(\d+)', path)
    if match is None:
        raise ValueError(f"no 'gen<number>' part found in path: {path!r}")
    # Convert to int so the value matches the declared return type.
    return int(match.group(1))

def count_transformers(datapoint):
    """Count how often the text "transformer" occurs in the datapoint's genotype.

    The genotype is scanned as one raw string: quotes were missing in the
    result JSON, so the genotype is stored as plain text.
    """
    genotype_text = datapoint["genotype"]
    return sum(1 for _ in re.finditer("transformer", genotype_text))

# Try the path helpers on the sample path and print what each one extracts.
print("path:",sample_path)
print("seed:",extract_seed_from_path(sample_path))
print(" exp:",extract_experiment_from_path(sample_path))
print(" gen:",extract_generation_from_path(sample_path))

In [None]:
# Load every collected result file and enrich it with metadata derived from
# its path (seed, experiment, generation) plus the transformer count.
datapoints = []
for file in json_files:
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(file, encoding="utf-8") as f:
        datapoint = json.load(f)

    datapoint["path"] = file
    datapoint["seed"] = extract_seed_from_path(file)
    datapoint["experiment"] = extract_experiment_from_path(file)
    datapoint["TRANSFORMATIONS"] = count_transformers(datapoint)
    datapoint["generation"] = extract_generation_from_path(file)

    datapoints.append(datapoint)

In [None]:
# Show the raw genotype of the second loaded datapoint
# (raises IndexError if fewer than two datapoints were loaded).
datapoints[1]["genotype"]

In [None]:
# Distinct experiment names and seeds that occur in the loaded datapoints.
all_experiments = {entry["experiment"] for entry in datapoints}
all_seeds = {entry["seed"] for entry in datapoints}
# Metric names of interest (presumably keys of the result files; TODO confirm).
all_metrics = ["F1", "MRR", "EDITDIST", "PMRR", "REC", "PREC"]
all_transformers = []  # TBD

In [None]:
# Build a table with one row per loaded datapoint and preview the first rows.
df = pd.DataFrame(datapoints)
df.head()