In [None]:
import numpy as np
import pandas as pd
import re

In [None]:
# Ordered list of "<entity>#<attribute>" category names. The position of a
# name in this list is the position of its sentiment code in an encoded label.
aspects = [
    f'{entity}#{attribute}'
    for entity in ('hotel', 'rooms', 'room_amenities', 'facilities')
    for attribute in ('general', 'prices', 'design&features', 'cleanliness',
                      'comfort', 'quality', 'miscellaneous')
] + [
    'service#general',
    'location#general',
] + [
    f'food&drinks#{attribute}'
    for attribute in ('prices', 'quality', 'style&options', 'miscellaneous')
]

# Sentiment name -> integer code; 0 stands for "no sentiment for this aspect".
mapping = {
    np.nan: 0,
    'dne': 0,
    'positive': 1,
    'negative': 2,
    'neutral': 3,
}

# Integer code -> sentiment name (0 maps back to NaN).
rev_mapping = {
    1: 'positive',
    2: 'negative',
    3: 'neutral',
    0: np.nan,
}

# Format txt -> csv

In [None]:
def label_encoder(label):
    """Turn one annotation line into a list of sentiment codes.

    Every ``{aspect, sentiment}`` pair found in ``label`` sets the slot at
    that aspect's position in ``aspects`` to ``mapping[sentiment]``; a
    sentiment name missing from ``mapping`` is stored as 0. Aspects that do
    not occur in ``label`` keep the value 0.

    Returns:
        list[int]: one code per entry of ``aspects``, in the same order.

    Raises:
        ValueError: if a matched aspect name is not present in ``aspects``.
    """
    encoded = [0 for _ in aspects]

    for match in re.finditer('{(.+?), ([a-z]+)}', label):
        aspect, sentiment = match.groups()
        encoded[aspects.index(aspect)] = mapping.get(sentiment, 0)

    return encoded

def txt2df(filepath):
    """Parse an annotation text file into a one-row-per-review DataFrame.

    The file is read as repeating 4-line records: the review text is the
    second line of each record and the label line is the third (the layout
    ``csv2str`` writes: ``#<n>``, review, labels, blank line).

    Args:
        filepath: path of the text file (decoded as UTF-8, BOM tolerated).

    Returns:
        pandas.DataFrame with a ``review`` column followed by one column per
        entry of ``aspects`` holding the encoded sentiment.

    Raises:
        ValueError: if the last record has a review line but no label line,
            or if a label names an aspect missing from ``aspects``.
    """
    with open(filepath, 'r', encoding='utf-8-sig') as txt:
        lines = txt.read().split('\n')

    reviews = lines[1::4]
    labels = lines[2::4]

    # When the line count is 2 (mod 4) the review slice is one element longer
    # than the label slice. A blank dangling "review" is just an extra
    # trailing newline and is dropped; a non-blank one is a truncated record.
    if len(reviews) > len(labels):
        if reviews[-1].strip():
            raise ValueError(
                f'{filepath}: last review has no label line '
                f'({len(reviews)} reviews vs {len(labels)} label lines)')
        reviews = reviews[:-1]

    # Building the label columns through the constructor (instead of a
    # multi-column assignment of a list of lists) also works for zero records.
    df = pd.DataFrame([label_encoder(label) for label in labels],
                      columns=aspects)
    df.insert(0, 'review', reviews)

    return df

# Format csv -> txt

In [None]:
def label_decoder(encoded_label):
    """Render encoded sentiments as a ``{aspect, sentiment}, ...`` string.

    ``encoded_label`` is expected to be a pandas Series whose index holds the
    aspect names and whose values hold sentiment codes. Values that are not
    numeric, are missing, or equal 0 are skipped; the remaining codes are
    translated through ``rev_mapping``.

    Returns:
        str: the pairs joined by ``', '`` (empty string when nothing is left).
    """
    codes = pd.to_numeric(encoded_label, errors='coerce')
    # 0 means "aspect not mentioned": turn it into NaN so dropna removes it
    # together with the unparsable entries.
    present = codes.replace(0, np.nan).dropna().astype(int)

    pairs = []
    for aspect, sentiment in present.items():
        pairs.append(f'{{{aspect}, {rev_mapping[sentiment]}}}')

    return ', '.join(pairs)

def csv2str(filepath):
    """Convert an encoded CSV back into the 4-lines-per-review text format.

    The first CSV column is taken as the review text and all remaining
    columns as encoded aspect sentiments. Each row produces four lines:
    ``#<row index + 1>``, the review, the decoded labels, and a blank line.

    Args:
        filepath: path (or anything ``pandas.read_csv`` accepts) of the CSV.

    Returns:
        str: all produced lines joined with ``'\\n'``.
    """
    df = pd.read_csv(filepath)
    rows = []
    for idx, row in df.iterrows():
        # Use .iloc for positional access: the row is labelled by column
        # names, and integer keys on a non-integer index are deprecated.
        review = row.iloc[0]
        # An empty review is read back by pandas as NaN; joining a float
        # would raise TypeError, so normalise it to an empty string.
        review = '' if pd.isna(review) else str(review)
        labels = label_decoder(row.iloc[1:])
        rows.extend((f'#{idx+1}', review, labels, ''))
    return '\n'.join(rows)

# Main

In [None]:
# Base directory of the dataset files.
root_dir = "./datasets/"

# Usage example, kept as a string so it is not executed: convert the three
# text splits to CSV, then print each CSV back in the text format.
# NOTE(review): the file names below mention the Restaurant dataset while
# `aspects` lists hotel categories -- confirm which dataset is intended.
"""
    >>> from pathlib import Path
    >>> root_dir = Path('CS221.M11.KHCL-Aspect-Based-Sentiment-Analysis/data')

    >>> train_txt_fp = root_dir/'original/1-VLSP2018-SA-Restaurant-train (7-3-2018).txt'
    >>> dev_txt_fp = root_dir/'original/2-VLSP2018-SA-Restaurant-dev (7-3-2018).txt'
    >>> test_txt_fp = root_dir/'original/3-VLSP2018-SA-Restaurant-test (8-3-2018).txt'

    >>> train_csv_fp = root_dir/'csv/train.csv'
    >>> dev_csv_fp = root_dir/'csv/dev.csv'
    >>> test_csv_fp = root_dir/'csv/test.csv'

    >>> assert train_txt_fp.is_file()
    >>> assert dev_txt_fp.is_file()
    >>> assert test_txt_fp.is_file()

    >>> train_df = txt2df(train_txt_fp)
    >>> dev_df = txt2df(dev_txt_fp)
    >>> test_df = txt2df(test_txt_fp)

    >>> train_df.to_csv(train_csv_fp, index=False)
    >>> dev_df.to_csv(dev_csv_fp, index=False)
    >>> test_df.to_csv(test_csv_fp, index=False)

    >>> print(csv2str(train_csv_fp))
    >>> print(csv2str(dev_csv_fp))
    >>> print(csv2str(test_csv_fp))
"""