In [16]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import nltk
import string
from collections import Counter
from torch.utils.data import DataLoader
import numpy as np

In [11]:
cols = ["sentiment", "review"]
train_orig  = pd.read_csv("train.csv", names=cols)
test = pd.read_csv("test.csv", names=cols)

In [12]:
test.head()

Unnamed: 0,sentiment,review
0,2,"Contrary to other reviews, I have zero complai..."
1,1,Last summer I had an appointment to get new ti...
2,2,"Friendly staff, same starbucks fair you get an..."
3,1,The food is good. Unfortunately the service is...
4,2,Even when we didn't have a car Filene's Baseme...


In [13]:
train_orig.head()

Unnamed: 0,sentiment,review
0,1,"Unfortunately, the frustration of being Dr. Go..."
1,2,Been going to Dr. Goldberg for over 10 years. ...
2,1,I don't know what Dr. Goldberg was like before...
3,1,I'm writing this review to give you a heads up...
4,2,All the food is great here. But the best thing...


In [14]:
train_ = train_orig.groupby('sentiment',as_index=False).apply(lambda x: x.sample(frac=0.3))
train_ = train_.reset_index( level = 0, drop=True)

In [17]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"([.,!?])", r" \1 ", text)
    #text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text) 
    return text
train = train_.copy()
train.review = train.review.apply(clean_text)
test.review = test.review.apply(clean_text)
X_train, X_valid, y_train, y_valid = train_test_split(train["review"], train["sentiment"], test_size=0.15, random_state=42, stratify=train["sentiment"])
train = pd.concat([X_train, y_train], axis=1)
valid = pd.concat([X_valid, y_valid], axis=1)
train["split"] = "train"
test["split"] = "test"
valid["split"] = "val"
reviews_df = pd.concat([train, test, valid])
reviews_df.rename(columns={"sentiment": "rating"}, inplace=True)
reviews_df.head()

Unnamed: 0,review,rating,split
35313,i really wanted to like this place . . but i...,1,train
319592,service was good . it's cute inside . i love...,2,train
427167,i guess i was expecting more . the idea of t...,1,train
165911,went to this buffet for dinner on saturday nig...,1,train
377147,i've used caudle pest control twice for bee hi...,2,train


In [None]:
reviews_df.to_csv("./reviews.csv")

In [19]:
from torch.utils.data import Dataset

class ReviewDataset(Dataset):
    def __init__(self, review_df, vectorizer):
        """
        Args:
            review_df (pandas.DataFrame): the dataset
            vectorizer (ReviewVectorizer): vectorizer instantiated from dataset
        """
        self.review_df = review_df
        self._vectorizer = vectorizer

        self.train_df = self.review_df[self.review_df.split=='train']
        self.train_size = len(self.train_df)

        self.val_df = self.review_df[self.review_df.split=='val']
        self.validation_size = len(self.val_df)

        self.test_df = self.review_df[self.review_df.split=='test']
        self.test_size = len(self.test_df)

        self._lookup_dict = {'train': (self.train_df, self.train_size),
                             'val': (self.val_df, self.validation_size),
                             'test': (self.test_df, self.test_size)}

        self.set_split('train')

    @classmethod
    def load_dataset_and_make_vectorizer(cls, review_csv):
        """Load dataset and make a new vectorizer from scratch
        
        Args:
            review_csv (str): location of the dataset
        Returns:
            an instance of ReviewDataset
        """
        review_df = pd.read_csv(review_csv)
        return cls(review_df, ReviewVectorizer.from_dataframe(review_df))

    def get_vectorizer(self):
        """ returns the vectorizer """
        return self._vectorizer

    def set_split(self, split="train"):
        """ selects the splits in the dataset using a column in the dataframe 
        
        Args:
            split (str): one of "train", "val", or "test"
        """
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        """the primary entry point method for PyTorch datasets
        
        Args:
            index (int): the index to the data point 
        Returns:
            a dict of the data point's features (x_data) and label (y_target)
        """
        row = self._target_df.iloc[index]

        review_vector = \
            self._vectorizer.vectorize(row.review)

        rating_index = \
            self._vectorizer.rating_vocab.lookup_token(row.rating)

        return {'x_data': review_vector,
                'y_target': rating_index}

    def get_num_batches(self, batch_size):
        """Given a batch size, return the number of batches in the dataset
        
        Args:
            batch_size (int)
        Returns:
            number of batches in the dataset
        """
        return len(self) // batch_size

In [20]:
class Vocabulary(object):
    """Class to process text and extract Vocabulary for mapping"""

    def __init__(self, token_to_idx=None, add_unk=True, unk_token="<UNK>"):
        """
        Args:
            token_to_idx (dict): a pre-existing map of tokens to indices
            add_unk (bool): a flag that indicates whether to add the UNK token
            unk_token (str): the UNK token to add into the Vocabulary
        """

        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx

        self._idx_to_token = {idx: token 
                              for token, idx in self._token_to_idx.items()}

        self._add_unk = add_unk
        self._unk_token = unk_token
        
        self.unk_index = -1
        if add_unk:
            self.unk_index = self.add_token(unk_token) 
        
        
    def to_serializable(self):
        """ returns a dictionary that can be serialized """
        return {'token_to_idx': self._token_to_idx, 
                'add_unk': self._add_unk, 
                'unk_token': self._unk_token}

    @classmethod
    def from_serializable(cls, contents):
        """ instantiates the Vocabulary from a serialized dictionary """
        return cls(**contents)

    def add_token(self, token):
        """Update mapping dicts based on the token.

        Args:
            token (str): the item to add into the Vocabulary
        Returns:
            index (int): the integer corresponding to the token
        """
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index

    def lookup_token(self, token):
        """Retrieve the index associated with the token 
          or the UNK index if token isn't present.
        
        Args:
            token (str): the token to look up 
        Returns:
            index (int): the index corresponding to the token
        Notes:
            `unk_index` needs to be >=0 (having been added into the Vocabulary) 
              for the UNK functionality 
        """
        if self._add_unk:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

    def lookup_index(self, index):
        """Return the token associated with the index
        
        Args: 
            index (int): the index to look up
        Returns:
            token (str): the token corresponding to the index
        Raises:
            KeyError: if the index is not in the Vocabulary
        """
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]

    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)

    def __len__(self):
        return len(self._token_to_idx)

In [21]:
class ReviewVectorizer(object):
    """ The Vectorizer which coordinates the Vocabularies and puts them to use"""
    def __init__(self, review_vocab, rating_vocab):
        """
        Args:
            review_vocab (Vocabulary): maps words to integers
            rating_vocab (Vocabulary): maps class labels to integers
        """
        self.review_vocab = review_vocab
        self.rating_vocab = rating_vocab

    def vectorize(self, review):
        """Create a collapsed one-hit vector for the review
        
        Args:
            review (str): the review
        Returns:
            one_hot (np.ndarray): the collapsed one-hot encoding
        """
        one_hot = np.zeros(len(self.review_vocab), dtype=np.float32)
        
        for token in review.split(" "):
            if token not in string.punctuation:
                one_hot[self.review_vocab.lookup_token(token)] = 1

        return one_hot

    @classmethod
    def from_dataframe(cls, review_df, cutoff=25):
        """Instantiate the vectorizer from the dataset dataframe
        
        Args:
            review_df (pandas.DataFrame): the review dataset
            cutoff (int): the parameter for frequency-based filtering
        Returns:
            an instance of the ReviewVectorizer
        """
        review_vocab = Vocabulary(add_unk=True)
        rating_vocab = Vocabulary(add_unk=False)
        
        # Add ratings
        for rating in sorted(set(review_df.rating)):
            rating_vocab.add_token(rating)

        # Add top words if count > provided count
        word_counts = Counter()
        for review in review_df.review:
            for word in review.split(" "):
                if word not in string.punctuation:
                    word_counts[word] += 1
               
        for word, count in word_counts.items():
            if count > cutoff:
                review_vocab.add_token(word)

        return cls(review_vocab, rating_vocab)

    @classmethod
    def from_serializable(cls, contents):
        """Intantiate a ReviewVectorizer from a serializable dictionary
        
        Args:
            contents (dict): the serializable dictionary
        Returns:
            an instance of the ReviewVectorizer class
        """
        review_vocab = Vocabulary.from_serializable(contents['review_vocab'])
        rating_vocab =  Vocabulary.from_serializable(contents['rating_vocab'])

        return cls(review_vocab=review_vocab, rating_vocab=rating_vocab)

    def to_serializable(self):
        """Create the serializable dictionary for caching
        
        Returns:
            contents (dict): the serializable dictionary
        """
        return {'review_vocab': self.review_vocab.to_serializable(),
                'rating_vocab': self.rating_vocab.to_serializable()}


In [22]:
from torch.utils.data import DataLoader

def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"):
    """
    A generator function which wraps the PyTorch DataLoader. It will 
      ensure each tensor is on the write device location.
    """
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

In [23]:
import torch.nn as nn
import torch.nn.functional as F

class ReviewClassifier(nn.Module):
    """ a simple perceptron-based classifier """
    def __init__(self, num_features):
        """
        Args:
            num_features (int): the size of the input feature vector
        """
        super(ReviewClassifier, self).__init__()
        self.fc1 = nn.Linear(in_features=num_features, 
                             out_features=1)

    def forward(self, x_in, apply_sigmoid=False):
        """The forward pass of the classifier
        
        Args:
            x_in (torch.Tensor): an input data tensor 
                x_in.shape should be (batch, num_features)
            apply_sigmoid (bool): a flag for the sigmoid activation
                should be false if used with the cross-entropy losses
        Returns:
            the resulting tensor. tensor.shape should be (batch,).
        """
        y_out = self.fc1(x_in).squeeze()
        if apply_sigmoid:
            y_out = F.sigmoid(y_out)
        return y_out

In [24]:
from argparse import Namespace

args = Namespace(
    # Data and path information
    frequency_cutoff=25,
    model_state_file='model.pth',
    review_csv='./reviews.csv',
    save_dir='./',
    vectorizer_file='vectorizer.json',
    # No model hyperparameters
    # Training hyperparameters
    batch_size=128,
    early_stopping_criteria=5,
    learning_rate=0.001,
    num_epochs=100,
    seed=1337,
    # Runtime options omitted for space
)
args

Namespace(batch_size=128, early_stopping_criteria=5, frequency_cutoff=25, learning_rate=0.001, model_state_file='model.pth', num_epochs=100, review_csv='./reviews.csv', save_dir='./', seed=1337, vectorizer_file='vectorizer.json')

In [25]:
def compute_accuracy(y_pred, y_target):
    y_target = y_target.cpu()
    y_pred_indices = (torch.sigmoid(y_pred)>0.5).cpu().long()#.max(dim=1)[1]
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100

In [26]:
import torch.optim as optim 

def make_train_state(args):
    return {'epoch_index': 0,
            'train_loss': [],
            'train_acc': [],
            'val_loss': [],
            'val_acc': [],
            'test_loss': -1,
            'test_acc': -1}
train_state = make_train_state(args)

if not torch.cuda.is_available():
    args.cuda = False
args.device = torch.device("cuda" if args.cuda else "cpu")

# dataset and vectorizer
dataset = ReviewDataset.load_dataset_and_make_vectorizer(args.review_csv)
vectorizer = dataset.get_vectorizer()

# model
classifier = ReviewClassifier(num_features=len(vectorizer.review_vocab))
classifier = classifier.to(args.device)

# loss and optimizer
loss_func = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)


In [27]:
for epoch_index in range(args.num_epochs):
    train_state['epoch_index'] = epoch_index

    # Iterate over training dataset

    # setup: batch generator, set loss and acc to 0, set train mode on
    dataset.set_split('train')
    batch_generator = generate_batches(dataset, 
                                       batch_size=args.batch_size, 
                                       device=args.device)
    running_loss = 0.0
    running_acc = 0.0
    classifier.train()
    
    for batch_index, batch_dict in enumerate(batch_generator):
        # the training routine is 5 steps:

        # step 1. zero the gradients
        optimizer.zero_grad()

        # step 2. compute the output
        y_pred = classifier(x_in=batch_dict['x_data'].float())

        # step 3. compute the loss
        loss = loss_func(y_pred, batch_dict['y_target'].float())
        loss_batch = loss.item()
        running_loss += (loss_batch - running_loss) / (batch_index + 1)

        # step 4. use loss to produce gradients
        loss.backward()

        # step 5. use optimizer to take gradient step
        optimizer.step()

        # -----------------------------------------
        # compute the accuracy
        acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
        running_acc += (acc_batch - running_acc) / (batch_index + 1)

    train_state['train_loss'].append(running_loss)
    train_state['train_acc'].append(running_acc)
    print("Accuracy: {} \nLoss: {}".format(running_acc, running_loss))
    
    # Iterate over val dataset

    # setup: batch generator, set loss and acc to 0, set eval mode on
    dataset.set_split('val')
    batch_generator = generate_batches(dataset, 
                                       batch_size=args.batch_size, 
                                       device=args.device)
    running_loss = 0.
    running_acc = 0.
    classifier.eval()

    for batch_index, batch_dict in enumerate(batch_generator):

        # step 1. compute the output
        y_pred = classifier(x_in=batch_dict['x_data'].float())

        # step 2. compute the loss
        loss = loss_func(y_pred, batch_dict['y_target'].float())
        loss_batch = loss.item()
        running_loss += (loss_batch - running_loss) / (batch_index + 1)

        # step 3. compute the accuracy
        acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
        running_acc += (acc_batch - running_acc) / (batch_index + 1)

    train_state['val_loss'].append(running_loss)
    train_state['val_acc'].append(running_acc)


ValueError: num_samples should be a positive integer value, but got num_samples=0

In [28]:
dataset.set_split('test')
batch_generator = generate_batches(dataset, 
                                   batch_size=args.batch_size, 
                                   device=args.device)
running_loss = 0.
running_acc = 0.
classifier.eval()

for batch_index, batch_dict in enumerate(batch_generator):
    # compute the output
    y_pred = classifier(x_in=batch_dict['x_data'].float())

    # compute the loss
    loss = loss_func(y_pred, batch_dict['y_target'].float())
    loss_batch = loss.item()
    running_loss += (loss_batch - running_loss) / (batch_index + 1)

    # compute the accuracy
    acc_batch = compute_accuracy(y_pred, batch_dict['y_target'])
    running_acc += (acc_batch - running_acc) / (batch_index + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

ValueError: num_samples should be a positive integer value, but got num_samples=0

In [30]:
def predict_rating(review, classifier, vectorizer,
                   decision_threshold=0.5):
    """Predict the rating of a review

    Args:
        review (str): the text of the review
        classifier (ReviewClassifier): the trained model
        vectorizer (ReviewVectorizer): the corresponding vectorizer
        decision_threshold (float): The numerical boundary which
            separates the rating classes
    """

    review = preprocess_text(review)
    vectorized_review = torch.tensor(vectorizer.vectorize(review))
    result = classifier(vectorized_review.view(1, -1))

    probability_value = F.sigmoid(result).item()

    index =  1
    if probability_value < decision_threshold:
        index = 0

    return vectorizer.rating_vocab.lookup_index(index)

test_review = "this is a pretty awesome book"
prediction = predict_rating(test_review, classifier, vectorizer)
print("{} -> {}".format(test_review, prediction)

SyntaxError: unexpected EOF while parsing (<ipython-input-30-67492dcc9e32>, line 27)

In [31]:
# Sort weights
fc1_weights = classifier.fc1.weight.detach()[0]
_, indices = torch.sort(fc1_weights, dim=0, descending=True)
indices = indices.numpy().tolist()

# Top 20 words
print("Influential words in Positive Reviews:")
print("--------------------------------------")
for i in range(20):
    print(vectorizer.review_vocab.lookup_index(indices[i]))

Influential words in Positive Reviews:
--------------------------------------
<UNK>


IndexError: list index out of range

In [32]:
# Top 20 negative words
print("Influential words in Negative Reviews:")
print("--------------------------------------")
indices.reverse()
for i in range(20):
    print(vectorizer.review_vocab.lookup_index(indices[i]))

Influential words in Negative Reviews:
--------------------------------------
<UNK>


IndexError: list index out of range

In [33]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data import TensorDataset, DataLoader

RANDOM_SEED=1

def set_random_seed_data(seed):
    RANDOM_SEED = seed

def lowercase(text):
    return text.lower()

def remove_nonaplhanumeric(text):
    text = re.sub('[^0-9a-zA-Z]+', ' ', text) 
    return text

def remove_unnecessary_char(text):
    text = re.sub('\n',' ',text) # Remove every '\n'
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))',' ',text) # Remove every URL
    text = re.sub('  +', ' ', text) # Remove extra spaces
    text = text.strip()
    return text

def preprocess_text(text):
    text = lowercase(text)
    text = remove_nonaplhanumeric(text)
    text = remove_unnecessary_char(text)
    return text

def load_dataset_indonesian(data_name='prosa', data_path=None, data_path_test=None):
    if data_name == 'prosa':
        train = pd.read_csv('../input/dataset-prosa/data_train_full.tsv', sep='\t', header=None)
        train = train.rename(columns={0: "text", 1: "label"})
        train = train[train['label'] != 'neutral']
        train['label'] = train['label'].apply(lambda x: 1 if x=='positive' else 0)
        train['text'] = train['text'].apply(lambda x: preprocess_text(x))

        test = pd.read_csv('../input/dataset-prosa/data_testing_full.tsv', sep='\t', header=None)
        test = test.rename(columns={0: "text", 1: "label"})
        test = test[test['label'] != 'neutral']
        test['label'] = test['label'].apply(lambda x: 1 if x=='positive' else 0)
        test['text'] = test['text'].apply(lambda x: preprocess_text(x))
            
    elif data_name == 'trip_advisor':
        if data_path == None:
            train = pd.read_csv('../input/dataset-tripadvisor/train_set.csv')
#             train = pd.read_csv('../input/remove-duplicate-tripadvisor/train_set.csv')
        else:
            train = pd.read_csv(data_path)
            
        train = train.rename(columns={"content": "text", "polarity": "label"})
        train['label'] = train['label'].apply(lambda x: 1 if x=="positive" else 0)
        train['text'] = train['text'].apply(lambda x: preprocess_text(x))
        
        if data_path_test == None:
            test = pd.read_csv('../input/dataset-tripadvisor/test_set.csv')
#             test = pd.read_csv('../input/remove-duplicate-tripadvisor/test_set.csv')
        else:
            test = pd.read_csv(data_path_test)
            
        test = test.rename(columns={"content": "text", "polarity": "label"})
        test['label'] = test['label'].apply(lambda x: 1 if x=="positive" else 0)
        test['text'] = test['text'].apply(lambda x: preprocess_text(x))

    elif data_name == 'toxic':
        if data_path == None:
            data = pd.read_csv('../input/simpler-preprocess-indonesian-hate-abusive-text/preprocessed_indonesian_toxic_tweet.csv')
        else:
            data = pd.read_csv(data_path)
            
        data['label'] = ((data['HS'] == 1) | (data['Abusive'] == 1)).apply(lambda x: int(x))
        data = data[['Tweet', 'label']]
        data = data.rename(columns={'Tweet': 'text'})

        X_train, X_test, y_train, y_test = train_test_split(data.text.values, 
                                                            data.label.values, 
                                                            test_size=0.1,
                                                            random_state=RANDOM_SEED,
                                                            stratify=data.label.values)
        train = pd.DataFrame({'text': X_train,
                              'label': y_train})

        test = pd.DataFrame({'text': X_test,
                             'label': y_test})
        
    print("~~~Train Data~~~")
    print('Shape: ', train.shape)
    print(train[0:2])
    print("\nLabel:")
    print(train.label.value_counts())
    
    print("\n~~~Test Data~~~")
    print('Shape: ', test.shape)
    print(test[0:4])
    print("\nLabel:")
    print(test.label.value_counts())
    return train, test
    
def load_dataset_foreign(data_name='yelp'):
    train = None
    if data_name == 'yelp':
        train = pd.read_csv('../input/yelp-review-dataset/yelp_review_polarity_csv/train.csv', header=None)
        train = train.rename(columns={0: "label", 1: "text"})
        train['label'] = train['label'].apply(lambda x: 1 if x==2 else 0)
        train['text'] = train['text'].apply(lambda x: preprocess_text(x))
    
    elif data_name == 'toxic':
        data = pd.read_csv('../input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train.csv')
        data['toxic'] = data['toxic'].apply(lambda x: 1 if x>=0.5 else 0)

        data = data[['comment_text', 'toxic']]
        data = data.rename(columns={'comment_text': 'text',
                                    'toxic': 'label'})

        data_pos = data[data['label'] == 1]
        data_neg = data[data['label'] == 0]
        train = pd.concat([data_pos[0:152111], 
                           data_neg[0:152111]]).reset_index(drop=True)
        
        train['text'] = train['text'].apply(lambda x: preprocess_text(x))

     
    print("~~~Data~~~")
    print('Shape: ', train.shape)
    print(train[0:2])
    print("\nLabel:")
    print(train.label.value_counts())
    return train

def split_train_test(train_x, train_y, total_data=50, valid_size=0.2):
    train_x_split, valid_x_split, train_y_split, valid_y_split = train_test_split(train_x, 
                                                                                  train_y, 
                                                                                  test_size=valid_size,
                                                                                  random_state=RANDOM_SEED,
                                                                                  stratify=train_y)
    
        
    total_data_valid = int(np.floor(valid_size * total_data))
    total_data_train = total_data-total_data_valid

    train_x_split = train_x_split[:total_data_train]
    train_y_split = train_y_split[:total_data_train]
    valid_x_split = valid_x_split[:total_data_valid]
    valid_y_split = valid_y_split[:total_data_valid]
    
    return train_x_split, train_y_split, valid_x_split, valid_y_split
    
def load_features(data_path, total_data=50, valid_size=0.2):
    train_x = np.array([x for x in np.load('{}/train_text.npy'.format(data_path), allow_pickle=True)])
    train_y = pd.read_csv('{}/train_label.csv'.format(data_path)).label.values
    
    train_x_split, train_y_split, valid_x_split, valid_y_split = split_train_test(train_x,
                                                                                  train_y,
                                                                                  total_data=total_data,
                                                                                  valid_size=valid_size)
    return train_x_split, train_y_split, valid_x_split, valid_y_split
    

def load_experiment_features(data_path_indo,
                             data_path_foreign,
                             tipe='A', 
                             total_data=50, 
                             foreign_mult=1, 
                             valid_size=0.2,
                             ):
    ##########################
    # Load Preprocessed Data #
    ##########################
    if tipe == 'A':
        train_x, train_y, valid_x, valid_y = load_features(data_path_indo,
                                                           total_data=total_data, 
                                                           valid_size=valid_size)
        
    elif tipe == 'B':
        train_x, train_y, _, _ = load_features(data_path_foreign,
                                               total_data=total_data, 
                                               valid_size=valid_size)
        
        _, _, valid_x, valid_y = load_features(data_path_indo,
                                               total_data=total_data, 
                                               valid_size=valid_size)
        
    elif tipe == 'C':
        train_x_indo, train_y_indo, valid_x_indo, valid_y_indo = load_features(data_path_indo,
                                                                                total_data=total_data, 
                                                                                valid_size=valid_size)

        train_x_foreign, train_y_foreign, valid_x_foreign, valid_y_foreign = load_features(data_path_foreign,
                                                                                           total_data=int(total_data*foreign_mult), 
                                                                                           valid_size=valid_size)

        train_x = np.concatenate([
                    train_x_indo,
                    train_x_foreign,
                    ])

        train_y = np.concatenate([
                    train_y_indo,
                    train_y_foreign,
                ])

        valid_x = valid_x_indo

        valid_y = valid_y_indo
        

    test_x = np.array([x for x in np.load('{}/test_text.npy'.format(data_path_indo), allow_pickle=True)])
    test_y = pd.read_csv('{}/test_label.csv'.format(data_path_indo)).label.values

    #########################
    # Convert to dataloader #
    #########################
    batch_size = 32

    train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
    valid_data = TensorDataset(torch.from_numpy(valid_x), torch.from_numpy(valid_y))
    test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))
    
    train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
    valid_loader = torch.utils.data.DataLoader(valid_data, batch_size=batch_size)
    test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size)

    return train_loader, valid_loader, test_loader

def load_train_dataset(data_name, total_data=50, valid_size=0.2, is_foreign=False, remove_duplication=False):
    train = None
    if is_foreign:
        train = load_dataset_foreign(data_name)
    else:
        train, test = load_dataset_indonesian(data_name)
    
    if remove_duplication:
        print("Removing duplication...")
        print("Previous shape: ", train.shape)
        train = train.drop_duplicates(keep = 'first') 
        print("Current shape: ", train.shape)
        print("Duplicate removed.")
    
    train_x_split, train_y_split, valid_x_split, valid_y_split = split_train_test(train.text.values,
                                                                                  train.label.values,
                                                                                  total_data=total_data,
                                                                                  valid_size=valid_size)
    
    train_x_split = np.array([x for x in train_x_split])
    valid_x_split = np.array([x for x in valid_x_split])
    return train_x_split, train_y_split, valid_x_split, valid_y_split

def load_experiment_dataset(data_name_indo,
                            data_name_foreign,
                            tipe='A', 
                            total_data=50, 
                            foreign_mult=1, 
                            valid_size=0.2,
                            remove_duplication=False):
    
    #################
    # Load Raw Data #
    #################
    if tipe == 'A':
        train_x, train_y, valid_x, valid_y = load_train_dataset(data_name_indo,
                                                                total_data=total_data, 
                                                                valid_size=valid_size,
                                                                is_foreign=False,
                                                                remove_duplication=remove_duplication)
        
    elif tipe == 'B':
        train_x, train_y, _, _ = load_train_dataset(data_name_foreign,
                                                    total_data=total_data, 
                                                    valid_size=valid_size,
                                                    is_foreign=True,
                                                    remove_duplication=remove_duplication)
        
        _, _, valid_x, valid_y = load_train_dataset(data_name_indo,
                                                    total_data=total_data, 
                                                    valid_size=valid_size,
                                                    is_foreign=False,
                                                    remove_duplication=remove_duplication)
        
    elif tipe == 'C':
        train_x_indo, train_y_indo, valid_x_indo, valid_y_indo = load_train_dataset(data_name_indo,
                                                                                    total_data=total_data, 
                                                                                    valid_size=valid_size,
                                                                                    is_foreign=False,
                                                                                    remove_duplication=remove_duplication)

        train_x_foreign, train_y_foreign, valid_x_foreign, valid_y_foreign = load_train_dataset(data_name_foreign,
                                                                                                total_data=int(total_data*foreign_mult), 
                                                                                                valid_size=valid_size,
                                                                                                is_foreign=True,
                                                                                                remove_duplication=remove_duplication)

        train_x = np.concatenate([
                    train_x_indo,
                    train_x_foreign,
                    ])

        train_y = np.concatenate([
                    train_y_indo,
                    train_y_foreign,
                ])

        valid_x = valid_x_indo

        valid_y = valid_y_indo
        
    

    _, test = load_dataset_indonesian(data_name=data_name_indo)
    test_x = test.text.values
    test_x = np.array([x for x in test_x])
    test_y = test.label.values
    
    indices = np.arange(len(train_x))
    np.random.seed(RANDOM_SEED)
    np.random.shuffle(indices)
    train_x = train_x[indices]
    train_y = train_y[indices]

    return (train_x, train_y), (valid_x, valid_y), (test_x, test_y)