In [1]:
from collections import Counter
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Directory prefix for the cleaned per-category CSV files. The trailing slash
# matters: file names are appended to it with plain string concatenation.
DF_PATH = '../data/interim/'

## Import Data

In [3]:

# Read the three cleaned review files (food, electronic, fashion) in one pass;
# the unpacking order matches the order of the file names below.
df_food, df_electronic, df_fashion = (
    pd.read_csv(DF_PATH + csv_name)
    for csv_name in (
        '3_food_cleaned_v3.csv',
        '3_electronic_cleaned_v3.csv',
        '3_fashion_cleaned_v3.csv',
    )
)

In [4]:
# Peek at the first five rows of the food reviews.
df_food.head(n=5)

Unnamed: 0,reviews,label
0,harga sedang kualitas sedang rasa packing rapi...,1
1,bagus banget emas aman banget recomend pokoknya,1
2,kualitas harga standar dateng sesuai pesan lam...,1
3,bumbu mantap,1
4,harga rasa coklat kualitas terima kasih banyak...,1


In [5]:
# Peek at the first five rows of the electronic reviews.
df_electronic.head(n=5)

Unnamed: 0,reviews,label
0,harga murah speaker lumayan kenceng karoke mantap,1
1,terima kasih barang kirim cepat mantap banget,1
2,mantap barang kirim cepat seler ramah pokok ma...,1
3,mantap paket sesuai pesan,1
4,terimakasih seler shope barang aman bagus,1


In [6]:
# Peek at the first five rows of the fashion reviews.
df_fashion.head(n=5)

Unnamed: 0,reviews,label
0,alhamdulilah jilbab kualitas bagus banget kecewa,1
1,bahan suka adem lembut sangat muas sekali,1
2,hodie tebal bagus,1
3,respon jual ramah baik kirim lumayan cepat har...,1
4,bagus banget bahan tebal kirim cepat memuaskan...,1


## Split Dataset

In [7]:
def split_data(df, test_size, target, text_col='reviews', random_state=42):
    """Split a labelled review frame into stratified train and test frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame. It must contain the ``text_col`` and ``target`` columns;
        any other columns are left out of the result.
    test_size : float or int
        Forwarded unchanged to ``train_test_split``.
    target : str
        Name of the label column. It is also used to stratify the split, so
        both parts keep the label proportions of ``df``.
    text_col : str, default 'reviews'
        Name of the text column carried into the result.
    random_state : int, default 42
        Seed forwarded to ``train_test_split`` so the split is reproducible.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(df_train, df_test)``, each holding the columns
        ``[text_col, target]`` with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If ``text_col`` or ``target`` is not a column of ``df``.
    """
    missing = [col for col in (text_col, target) if col not in df.columns]
    if missing:
        raise KeyError(f'split_data: column(s) not found in df: {missing}')

    # Split the two needed columns together instead of splitting X and y
    # separately and re-assembling them into empty frames afterwards: the row
    # assignment is the same, but this does not break when df carries extra
    # columns and it keeps the label column named after `target`.
    df_train, df_test = train_test_split(
        df[[text_col, target]],
        test_size=test_size,
        random_state=random_state,
        stratify=df[target],
    )

    # Drop the shuffled source index so each frame is numbered from zero.
    return df_train.reset_index(drop=True), df_test.reset_index(drop=True)

In [8]:
# Hold out 20% of the food reviews; split_data stratifies on the label column.
train_food, test_food = split_data(df=df_food, test_size=0.2, target='label')
(train_food, test_food)

(                                                reviews  label
 0     kualitas produk sangat baik cepat kirim sangat...      1
 1     luar krna alot keras pikir emng makan semua tu...      0
 2     baik kasih bonus cobain satu arti besok order ...      1
 3     kurang next lebih teliti complain krna lalai r...      0
 4     mantul baru pesan malam besok langsung sampe p...      1
 ...                                                 ...    ...
 8227  sesuai pesan cepat harga terimakasih moga maki...      1
 8228            coba moga cocok hehe terima kasih seler      1
 8229  harga mahal kualitas rasa coba emas rapi aman ...      0
 8230              rasa enak harga sesuai kualitas bagus      0
 8231  beli bundle dkrim kotak respon lama aktif beli...      0
 
 [8232 rows x 2 columns],
                                                 reviews  label
 0                 kucing selalu suka sama bikin mencret      0
 1     harga jenis kualitas kirim emas barang pesan s...      1
 2          

In [9]:
# Hold out 20% of the electronic reviews; split_data stratifies on the label column.
train_electronic, test_electronic = split_data(df=df_electronic, test_size=0.2, target='label')
(train_electronic, test_electronic)

(                                                 reviews  label
 0      harga murah riah kualitas okey desain baik bar...      1
 1      produk asli bagus harga coba mudah mudah baran...      1
 2      alhamdulilah barang sampe aman kendala sampe l...      1
 3                 paket terima selamat coba thanks seler      1
 4                           bagus banget cucok moga awet      1
 ...                                                  ...    ...
 39063  bagus banget makasi banyak seler kirim cepet n...      1
 39064         ngerekam tulis storage padahal udah conect      0
 39065                     bagus packing aman fungsi baik      1
 39066  suka banget fungsi baik sesuai pesan enak bang...      1
 39067               kirim super lama suara kurang ngebas      0
 
 [39068 rows x 2 columns],
                                                 reviews  label
 0                        barang catat lampu nyala panas      1
 1     udah lanjur terima pakai beberapa hari mouse m...      0

In [10]:
# Hold out 20% of the fashion reviews; split_data stratifies on the label column.
train_fashion, test_fashion = split_data(df=df_fashion, test_size=0.2, target='label')
(train_fashion, test_fashion)

(                                                 reviews  label
 0                       suka banget sama sandal markotop      1
 1      paket baju cukup bagus cuma bahan tipis okay h...      1
 2                     bagus cuma bahan bawah keras licin      1
 3      warna beda banget sama mustard malah lebih mir...      0
 4      sumpah bagus banget bahan enak lucu banget uku...      1
 ...                                                  ...    ...
 88703  plis enak banget bahan enak bentuk adem sama s...      1
 88704  kirim cepat kurir ramah bahan nerawang tipis u...      0
 88705  kirim lumayan cepet packing rapi sekaligus saf...      1
 88706  kain tipis biasa cukup muat keluar uang jahit ...      0
 88707                                 barang sesuai foto      0
 
 [88708 rows x 2 columns],
                                                  reviews  label
 0      mantap jaketny tipis bhanya hrgany mantap mant...      1
 1         tipis banget terus kecil ukur anak sesuai hrga    

In [11]:
# Count how many rows carry each label in the food train and test splits.
train_food_label_counts = Counter(train_food['label'])
test_food_label_counts = Counter(test_food['label'])

print('Rasio Label Data Training Kategori Food :', train_food_label_counts)
print('Rasio Label Data Testing Kategori Food :', test_food_label_counts)

Rasio Label Data Training Kategori Food : Counter({0: 4119, 1: 4113})
Rasio Label Data Testing Kategori Food : Counter({0: 1030, 1: 1028})


## Save Data

In [16]:
# Write every split to ../data/processed/<split>/<category>_<split>.csv.
PROCESSED_DIR = Path('../data/processed')
frames_by_split = {
    'train': {'food': train_food, 'electronic': train_electronic, 'fashion': train_fashion},
    'test': {'food': test_food, 'electronic': test_electronic, 'fashion': test_fashion},
}

for split_name, frames in frames_by_split.items():
    out_dir = PROCESSED_DIR / split_name
    # to_csv does not create missing folders, so without this the cell fails
    # whenever the processed/train or processed/test directory is absent.
    out_dir.mkdir(parents=True, exist_ok=True)
    for category, frame in frames.items():
        # index=False keeps the row index out of the saved file.
        frame.to_csv(out_dir / f'{category}_{split_name}.csv', index=False)