# Process `generated_reviews_enth` for [HuggingFace Datasets](https://github.com/huggingface/datasets)

In [42]:
import pandas as pd
import glob
from sklearn.model_selection import train_test_split

In [43]:
# Collect only the raw review CSVs. A bare 'raw_data/*' would also match
# 'raw_data/machine_df.csv', which this notebook writes into the same folder,
# so a second run would see three files instead of two.
# glob returns paths in arbitrary order; sorting pins it. Reverse order matches
# the recorded run (sa2 before sa1), which fixes the row order after
# concatenation and therefore the seeded train/valid/test splits.
fnames = sorted(glob.glob('raw_data/generated_reviews-*.csv'), reverse=True)
fnames

['raw_data/generated_reviews-review_level_sa2_04182020_all_n_sent_205501.csv',
 'raw_data/generated_reviews-review_level_sa1_04042020+review_star.csv']

## `machine_df` - All Machine-translated Reviews

A large portion of the dataset is machine-translated by the Google Translate API and then checked by annotators from [Hope Data Annotations](https://www.hopedata.org/). We apply the following preprocessing:
- Drop duplicates on `en_segment`,`th_segment`,`review_star`,`correct`; duplicates might exist because the translation checking (human-like or not) is done by annotators.
- Remove reviews that are not between 1-5 stars.
- Remove reviews whose `correct` are not 0 or 1.
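- Deduplicate on `en_segment`, which contains the source sentences: every review whose `en_segment` still appears more than once is dropped entirely, so each remaining source sentence is unique.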

In [100]:
def clean_df(df):
    """Return the cleaned review frame with a fresh 0..n-1 index.

    Steps, in order:
      1. keep only `en_segment`, `th_segment`, `review_star`, `correct`
         and drop exact duplicate rows;
      2. keep rows with 1 <= `review_star` <= 5;
      3. keep rows whose `correct` is 0 or 1;
      4. keep rows whose `en_segment` has exactly one non-null `th_segment`
         among the remaining rows (repeated sources are dropped entirely,
         not reduced to a single row).
    """
    wanted_columns = ['en_segment','th_segment','review_star','correct']
    reviews = df[wanted_columns].drop_duplicates()

    # Both filters are row-wise, so one combined mask is equivalent to
    # applying them one after the other.
    star_ok = reviews.review_star.between(1, 5)
    correct_ok = reviews.correct.isin([0,1])
    reviews = reviews[star_ok & correct_ok]

    # Number of non-null translations per English source sentence.
    translations_per_source = reviews.groupby('en_segment').th_segment.count()
    single_sources = translations_per_source.index[translations_per_source == 1]

    kept = reviews[reviews.en_segment.isin(single_sources)]
    return kept.reset_index(drop=True)

In [101]:
# Read the first two raw CSVs listed in `fnames`, stack them, then clean.
first_path, second_path = fnames[0], fnames[1]
raw_frames = [pd.read_csv(first_path), pd.read_csv(second_path)]
machine_df = clean_df(pd.concat(raw_frames))

In [102]:
# Sanity check: every source sentence (en_segment) should occur exactly once,
# so the only occurrence count reported should be 1.
machine_df['en_segment'].value_counts().value_counts()

1    174530
Name: en_segment, dtype: int64

In [103]:
# Distinct sources can translate to the same Thai text, so a few th_segment
# values occur more than once.
machine_df['th_segment'].value_counts().value_counts()

1    174003
2       248
3         7
5         2
Name: th_segment, dtype: int64

In [104]:
# Replace the ' <SEP> ' marker with a single space in both text columns.
for column in ('en_segment', 'th_segment'):
    machine_df[column] = machine_df[column].map(lambda text: text.replace(' <SEP> ',' '))

## Splits

In [105]:
# Hold out 10% as test, then 10% of the remainder as validation.
# Both splits share the same seed.
split_options = dict(test_size=0.1, random_state=1412)
train_valid, test = train_test_split(machine_df, **split_options)
train, valid = train_test_split(train_valid, **split_options)
tuple(part.shape for part in (train, valid, test))

((141369, 4), (15708, 4), (17453, 4))

In [106]:
train['correct'].value_counts()  # append `/ train.shape[0]` for proportions

0    99296
1    42073
Name: correct, dtype: int64

In [107]:
valid['correct'].value_counts()  # append `/ valid.shape[0]` for proportions

0    10936
1     4772
Name: correct, dtype: int64

In [108]:
test['correct'].value_counts()  # append `/ test.shape[0]` for proportions

0    12208
1     5245
Name: correct, dtype: int64

In [109]:
train['review_star'].value_counts()  # append `/ train.shape[0]` for proportions

1    50418
4    22876
3    22825
2    22671
5    22579
Name: review_star, dtype: int64

In [110]:
valid['review_star'].value_counts()  # append `/ valid.shape[0]` for proportions

1    5628
4    2596
2    2521
5    2517
3    2446
Name: review_star, dtype: int64

In [111]:
test['review_star'].value_counts()  # append `/ test.shape[0]` for proportions

1    6225
5    2852
4    2831
2    2778
3    2767
Name: review_star, dtype: int64

## Save

In [114]:
# Write each split as JSON Lines: one record (row) per line.
split_outputs = {
    'data/train.jsonl': train,
    'data/valid.jsonl': valid,
    'data/test.jsonl': test,
}
for out_path, split_frame in split_outputs.items():
    split_frame.to_json(out_path,orient='records',lines=True)

In [115]:
# Add token counts for both languages to the cleaned frame, then export it.
# The lengths are computed on `machine_df`: no top-level `df` exists in this
# notebook (that name is only clean_df's parameter), so referring to `df` here
# raises NameError on a fresh kernel.
# NOTE(review): `word_tokenize` is not imported anywhere in the visible
# notebook -- add the intended tokenizer import to the imports cell
# (TODO confirm which library was used).
machine_df["length_th"] = machine_df["th_segment"].map(lambda x: len(word_tokenize(x)))
machine_df["length_en"] = machine_df["en_segment"].map(lambda x: len(word_tokenize(x)))
machine_df.to_csv('raw_data/machine_df.csv',index=False)