In [15]:
# from clearn.utils.annotation_utils import combine_annotation_sessions
import pandas as pd
from clearn.analysis import CSV_COL_NAME_EPOCH, CSV_COL_NAME_STEP, CSV_COL_NAME_IMAGE_ID
from clearn.analysis import CSV_COL_NAME_ROW_ID_WITHIN_IMAGE


In [18]:
pd.set_option('display.max_rows', 500)

In [19]:
""" Returns if the row has multiple words separated by space """


def has_multiple_value(_column_name, row):
    """Tell whether the value stored under `_column_name` in `row` holds more than one word.

    The value is split on whitespace (``split()`` with no arguments) and the
    number of resulting tokens is counted.

    :param _column_name: key / column name used to index into `row`
    :param row: any object supporting ``row[_column_name]`` (e.g. a data frame row)
    :return: True when the value splits into more than one token, False otherwise
    """
    word_count = len(row[_column_name].split())
    return word_count > 1


In [20]:
""" An aggregation function, that can be used with `groupby` which converts a list of strings to single string
 separated by space """


def space_separated_string(x):
    """Collapse a group of values into one scalar, for use as a `groupby` aggregation.

    * one value, or several identical values -> that value, unchanged
    * several differing values -> a single string with the values joined by a space,
      in their original order (each value is converted with ``str`` first)
    * no values -> `x` is returned unchanged

    :param x: any iterable of values (typically the pandas Series of one group)
    :return: a scalar as described above
    """
    items = list(x)
    if not items:
        return x
    # A single element is returned as the scalar itself (not the enclosing container),
    # so every non-empty input yields an aggregated value.
    if len(items) == 1 or len(set(items)) == 1:
        return items[0]
    # str() keeps the join from raising TypeError when the values are not strings.
    return " ".join(str(item) for item in items)


In [21]:
def get_annotations(annotated_path, batches=None, batches_per_epoch=935, batches_per_step=300):
    """Load every ``.csv`` file found in `annotated_path` into one data frame.

    Missing values are replaced by the string "xxxx", the columns "epoch", "step",
    "_idx" and "num_rows_annotated" are cast to int, and a "batch" column is added as
    ``epoch * batches_per_epoch + step * batches_per_step``.

    :param annotated_path: directory containing the annotation csv files
    :param batches: None to keep all rows; a single batch number or a list-like of
        batch numbers to keep only the rows whose "batch" value matches
    :param batches_per_epoch: multiplier applied to "epoch" when computing "batch"
        (presumably the number of training batches per epoch - confirm against the
        experiment configuration)
    :param batches_per_step: multiplier applied to "step" when computing "batch"
        (presumably the evaluation interval - confirm against the experiment configuration)
    :return: tuple ``(df, unique)`` where `unique` holds the number of rows per
        (epoch, step) combination in a column called "count"
    :raises FileNotFoundError: if the directory contains no ``.csv`` file
    """
    print("Reading annotation from ", annotated_path)
    file_names = os.listdir(annotated_path)
    print(file_names)
    frames = []
    for annotation_file in file_names:
        # endswith() also copes with names that contain no "." at all (sub-directories,
        # extension-less files); indexing the result of rsplit() raised IndexError there.
        if not annotation_file.endswith(".csv"):
            continue
        annotation_csv = os.path.join(annotated_path, annotation_file)
        _df = pd.read_csv(annotation_csv)
        print(annotation_file, _df.shape)
        frames.append(_df)
    if not frames:
        raise FileNotFoundError(f"No csv files found in directory {annotated_path}")
    # Concatenate once instead of growing the frame inside the loop.
    df = pd.concat(frames)
    df = df.fillna("xxxx")
    # Counted before the integer casts below, exactly as the unfiltered result always was.
    unique = df.groupby(["epoch", "step"]).size().reset_index().rename(columns={0: 'count'})
    for column in ("epoch", "step", "_idx", "num_rows_annotated"):
        df[column] = df[column].astype(int)
    df["batch"] = (df["epoch"] * batches_per_epoch + df["step"] * batches_per_step).astype(int)
    if batches is None:
        return df, unique

    if pd.api.types.is_list_like(batches):
        selected = df["batch"].isin(list(batches))
    else:
        selected = df["batch"] == batches
    df = df[selected]
    unique = df.groupby(["epoch", "step"]).size().reset_index().rename(columns={0: 'count'})
    return df, unique


In [22]:
def combine_annotation_sessions(keys: list, base_path: str, max_epoch: int=1000000):
    """ Read the annotation csv files of every annotator and combine duplicate annotations.

    For each key the csv files in the folder `base_path`/`key` are loaded, rows whose
    epoch is not below `max_epoch` are dropped, and rows sharing the same
    (epoch, step, image id, row id within image) are merged into one row. Differing
    values of a merged column are joined by a space. A boolean column
    "has_multiple_value" marks rows whose merged text holds more than one word, and the
    "text" column is renamed to "text_<key>".

    Folders that are empty, or whose files have no "text" column, are reported and
    skipped; the remaining keys are still processed.

    @:param keys List of keys- each key corresponds to annotation by one different user
    @:param base_path Directory that contains one annotation folder per key
    @:param max_epoch Only rows with epoch strictly smaller than this value are kept
    @:return Dictionary of format {key: {KEY_FOR_DATA_FRAME: combined data frame}}
    """
    data_dict = dict()
    group_by_columns = [CSV_COL_NAME_EPOCH, CSV_COL_NAME_STEP, CSV_COL_NAME_IMAGE_ID,
                        CSV_COL_NAME_ROW_ID_WITHIN_IMAGE]
    for key in keys:
        # os.path.join gives the same path as concatenation when base_path ends with a
        # separator, and still a valid one when it does not.
        annotation_path = os.path.join(base_path, key)
        print(base_path)
        if not os.listdir(annotation_path):
            print(f"No csv files found in directory {annotation_path}")
            # Skip only this annotator; returning here dropped all remaining keys.
            continue
        df, _ = get_annotations(annotation_path, batches=None)
        df = df[df["epoch"] < max_epoch]
        # TODO Add code to fix invalid character in annotation

        if "text" not in df.columns:
            print(f"Files in  {annotation_path} does not have a column called text")
            # Without a "text" column the steps below would fail with a KeyError.
            continue

        unique_df = df.groupby(group_by_columns).aggregate(space_separated_string).reset_index()
        print(df.head(100))
        print(unique_df.head(300))
        distinct_values = unique_df.apply(lambda x: has_multiple_value("text", x), axis=1)
        unique_df.insert(loc=len(unique_df.columns),
                         column="has_multiple_value",
                         value=distinct_values
                         )
        unique_df = unique_df.rename(columns={"text": f"text_{key}"})
        data_dict[key] = {KEY_FOR_DATA_FRAME: unique_df}
    return data_dict

In [23]:
import os
#from clearn.utils.annotation_utils import combine_annotation_sessions, combine_multiple_annotations
from clearn.utils.annotation_utils import KEY_FOR_DATA_FRAME, get_combined_data_frame
from clearn.utils.annotation_utils import get_manual_annotation, get_combined_annotation
from clearn.config import get_base_path, ExperimentConfig, check_and_create_folder, get_keys
from clearn.config import RUN_ID
from clearn.utils.annotation_utils import ANNOTATION_FOLDER_NAME_PREFIX

# Initialize variables

debug = False
exp_config = ExperimentConfig.get_exp_config()
exp_config.check_and_create_directories(RUN_ID)

NUMBER_OF_ROWS = 16
NUM_DIGITS_PER_ROW = 4

check_and_create_folder(exp_config.get_annotation_result_path())

# Setting debug = true will write all intermediate data frames
if debug:
    debug_path = os.path.join(exp_config.get_annotation_result_path(), "debug/")
    check_and_create_folder(debug_path)

num_batches_per_epoch = exp_config.num_train_samples // exp_config.BATCH_SIZE
number_of_evaluation_per_epoch = num_batches_per_epoch // exp_config.eval_interval

num_val_images = 2
max_epoch = 20
num_rows = max_epoch * number_of_evaluation_per_epoch * NUMBER_OF_ROWS * num_val_images

# Read all the individual data frames into a dictionary of format {"annotator_id"}
base_path = get_base_path(exp_config.root_path,
                          exp_config.Z_DIM,
                          exp_config.num_units[2],
                          exp_config.num_units[1],
                          exp_config.num_cluster_config,
                          run_id=RUN_ID
                          )
keys = get_keys(base_path, ANNOTATION_FOLDER_NAME_PREFIX)
# NOTE(review): counted before empty folders are filtered out below - confirm that
# downstream code wants the unfiltered number of annotators.
number_of_keys = len(keys)

# Keep only the annotators whose folder is not empty. A new list is built because
# removing items from `keys` while iterating over it skips the entry that follows
# each removed one.
non_empty_keys = []
for key in keys:
    annotation_path = os.path.join(base_path, key)
    if os.listdir(annotation_path):
        non_empty_keys.append(key)
    else:
        print(f"No csv files found in directory {annotation_path}. Skipping the directory")
keys = non_empty_keys

data_dict = combine_annotation_sessions(keys=keys,
                                        base_path=base_path,
                                        max_epoch=max_epoch)

/home/sunilv/concept_learning_data/Exp_10_032_064_100/
Reading annotation from  /home/sunilv/concept_learning_data/Exp_10_032_064_100/manual_annotation_sunil
['backed_up_2_annotation.csv', 'annotation.csv']
backed_up_2_annotation.csv (32, 5)
annotation.csv (32, 5)
    epoch  step  _idx  num_rows_annotated  text  batch
0       8     3     0                   1  0x86   8380
1       8     3     0                   2  1x71   8380
2       8     3     0                   3  601x   8380
3       8     3     0                   4  1509   8380
4       8     3     0                   5  3832   8380
5       8     3     0                   6  4459   8380
6       8     3     0                   7  9453   8380
7       8     3     0                   8  4752   8380
8       8     3     0                   9  792x   8380
9       8     3     0                  10  7997   8380
10      8     3     0                  11  x126   8380
11      8     3     0                  12  x599   8380
12      8     3     

In [12]:
# Display the combined annotation data frame stored for the first annotator key.
data_dict[keys[0]][KEY_FOR_DATA_FRAME]

Unnamed: 0,epoch,step,_idx,num_rows_annotated,text_manual_annotation_sunil,batch,has_multiple_value
0,8,3,0,1,0x86,8380,False
1,8,3,0,2,1x71,8380,False
2,8,3,0,3,601x,8380,False
3,8,3,0,4,1509,8380,False
4,8,3,0,5,3832,8380,False
5,8,3,0,6,4459 4x59,8380,True
6,8,3,0,7,9453,8380,False
7,8,3,0,8,4752,8380,False
8,8,3,0,9,792x,8380,False
9,8,3,0,10,7997 7x97,8380,True


In [37]:
# Walk the first annotator's combined data frame row by row and print the
# epoch and step of every row.
df = data_dict[keys[0]][KEY_FOR_DATA_FRAME]
for index, row in df.iterrows():
    _epoch = row.epoch
    _step = row.step
    print(_epoch,_step)

8 3
8 3
8 3
8 3
8 3
8 3
8 3
8 3
8 3
8 3
8 3
8 3
8 3
8 3
8 3
8 3
8 3
8 3
8 3
8 3
8 3
8 3
8 3
8 3
8 3
8 3
8 3
8 3
8 3
8 3
8 3
8 3
