In [2]:
import pandas as pd
import numpy as np

import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
import nltk.data
import collections
import random
from IPython.display import clear_output

from sklearn.model_selection import train_test_split

In [3]:
filename = "data/reviews_Home_and_Kitchen_5.json"
filename_rawdata_100 = "rawdata_100.pkl"
filename_rawdata_20 = "rawdata_20.pkl"

### Down Sample

In [None]:
def saveDataframe(filename, filename_rawdata, subfraction):
    data = pd.read_json(filename, lines=True, orient="frame")
    
    data['reviewText'] = data['reviewText'].str.lower()
    
    data['overall'] = np.where(data['overall'] < 3, 0, data.overall)
    data['overall'] = np.where(data['overall'] > 3, 1, data.overall)

    # down sample
    data_0 = data.loc[data['overall'] == 0]
    data_0 = data_0.sample(frac = subfraction)
    data_1 = data.loc[data['overall'] == 1].sample(len(data_0))

    data = data_0.append(data_1).sample(frac=1)
    
    data = data[['reviewText', 'overall']]
    
    data.to_pickle(filename_rawdata)
    print(len(data))

In [None]:
saveDataframe(filename, filename_rawdata_100, 1) # Whole Set
saveDataframe(filename, filename_rawdata_20, 0.2) # Smaller set

### Make Adverserial Set

In [4]:
file_name = "rawdata_100.pkl"
file_name_s = "rawdata_20.pkl"
filename_adversarial_wordnet = "data_adversial_wordnet"
filename_extension = ".pkl"
filename_adversarial_glove = "data_adversial_glove.pkl"
max_features = 20000
maxlen = 100
batch_size = 10
num_class = 2

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

POS_TO_WORDNET = {
    'NN': wn.NOUN,
    'JJ': wn.ADJ,
    'JJR': wn.ADJ,
    'JJS': wn.ADJ,
}

In [5]:
def addToReplacement(token, synDict):
    if token[1] not in POS_TO_WORDNET: return None
    w = token[0].lower()
    wn_pos = POS_TO_WORDNET[token[1]]
    synsets = wn.synsets(w, wn_pos)
    if not synsets: return None
    synset = synsets[0]
    synonyms = []
    antonyms = []
    
    for syn in wn.synsets(w, wn_pos):
        for lem in syn.lemmas():
            if (lem.name() != w):
                synDict[w] = lem.name()

In [6]:
def makeAdversarialDataset(numReplacement, numSample, texts, data):
    set_adversial = []
    set_unchanged = []
    index = 0
    sampleOutput = []
    for i, row in data.iterrows():
        synsetDictionary = {}

        text = texts[i]
        tokenized = tokenizer.tokenize(text)
        words = word_tokenize(text)
        tokens = nltk.pos_tag(words)
        for token in tokens:
            addToReplacement(token, synsetDictionary)
        if len(synsetDictionary) > 0:
            output = text
            for key in random.sample(synsetDictionary.keys(), min(numReplacement, len(synsetDictionary))):
                output = output.replace(key, synsetDictionary[key])
                if (numSample > 0):
                    sampleOutput.append("[" + str(i) + "]: key-" + str(key) + " value-" + str(synsetDictionary[key]))
                del synsetDictionary[key]
            data.at[i,'reviewText'] = output
            numSample = numSample - 1
            set_adversial.append(i)
        else:
            set_unchanged.append(i)
        print("progress bar: ", index, "/", len(data))
        clear_output(wait=True)
        index = index + 1

    print(sampleOutput)
    return set_adversial

#### Whole Set

In [36]:
dataWhole = pd.read_pickle(file_name)
X = dataWhole.reviewText
Y = dataWhole.overall
X_trains, X_test, Y_trains, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

dataIter = pd.DataFrame({'reviewText': X_test, 'overall': Y_test})

In [37]:
len(dataIter)

20568

In [38]:
for i in range(3):
    print("Numer of max replacement is ", i + 1, ": ")
    data = dataIter
    texts = data['reviewText']
    set_adversial = makeAdversarialDataset(i + 1, 3, texts, data)
#     data = data.loc[set_adversial,:]
    data.to_pickle(filename_adversarial_wordnet + "_" + str(i + 1) + filename_extension)

['[452577]: key-min value-Min', '[452577]: key-easy value-wanton', '[452577]: key-video value-TV', '[441929]: key-entire value-intact', '[441929]: key-amazing value-awing', '[441929]: key-scratch value-mark', '[342648]: key-dorm value-student_residence', '[342648]: key-unity value-oneness', '[342648]: key-shelf value-ledge']


In [39]:
data = dataIter
texts = data['reviewText']
set_adversial = makeAdversarialDataset(100000, 3, texts, data)
# data = data.loc[set_adversial,:]
data.to_pickle(filename_adversarial_wordnet + filename_extension)

['[452577]: key-muss value-genus_Mus', '[452577]: key-able value-able-bodied', '[452577]: key-safety value-prophylactic', '[452577]: key-garage value-service_department', '[452577]: key-greater value-with_child', '[452577]: key-cooking value-preparation', '[452577]: key-stick value-spliff', '[452577]: key-temp value-temporary_worker', '[452577]: key-older value-old', '[452577]: key-fryer value-pullet', '[452577]: key-line value-assembly_line', '[452577]: key-tv value-goggle_box', '[452577]: key-cavum value-bodily_cavity', '[452577]: key-amazon value-Amazon_River', '[452577]: key-easier value-easy', '[452577]: key-plug value-nag', '[452577]: key-ease value-relaxation', '[452577]: key-self value-ego', '[452577]: key-bottom value-merchant_ship', '[452577]: key-deck value-deck_of_cards', '[452577]: key-new value-young', '[452577]: key-last value-lowest', '[452577]: key-day value-Clarence_Shepard_Day_Jr.', '[452577]: key-bird value-shuttle', '[452577]: key-manner value-personal_manner', '[4

#### Smaller Set

In [None]:
data = pd.read_pickle(file_name_s)
texts = data['reviewText']
set_adversial = makeAdversarialDataset(100000, 3, texts, data)
data = data.loc[set_adversial,:]
data.to_pickle(filename_adversarial_wordnet + "_s" + filename_extension)

In [None]:
for i in range(3):
    print("Numer of max replacement is ", i + 1, ": ")
    data = pd.read_pickle(file_name_s)
    texts = data['reviewText']
    set_adversial = makeAdversarialDataset(i + 1, 3, texts, data)
    data = data.loc[set_adversial,:]
    data.to_pickle(filename_adversarial_wordnet + "_" + str(i + 1) + "_s" + filename_extension)

### Mix Data

In [13]:
file_name = "rawdata_100.pkl"
file_name_s = "rawdata_20.pkl"
file_name_mixed = "mixed"
file_name_20568 = "_20568"
filename_adversarial_wordnet = "data_adversial_wordnet"
filename_extension = ".pkl"
filename_adversarial_glove = "data_adversial_glove.pkl"

#### Whole Set

In [14]:
data_ori = pd.read_pickle(file_name)
data_adv = pd.read_pickle(filename_adversarial_wordnet + filename_extension)
data_adv1 = pd.read_pickle(filename_adversarial_wordnet + "_1" + filename_extension)
data_adv2 = pd.read_pickle(filename_adversarial_wordnet + "_2" + filename_extension)
data_adv3 = pd.read_pickle(filename_adversarial_wordnet + "_3" + filename_extension)
data = data_ori.append(data_adv)
data = data.append(data_adv1)
data = data.append(data_adv2)
data = data.append(data_adv3)
data.to_pickle(file_name_mixed + filename_extension)

In [15]:
print(len(data_adv))
print(len(data_adv1))
print(len(data_adv2))
print(len(data_adv3))

20568
20568
20568
20568


#### Smaller Set

In [None]:
data_ori = pd.read_pickle(file_name_s)
data_adv = pd.read_pickle(filename_adversarial_wordnet + "_s" + filename_extension)
data_adv1 = pd.read_pickle(filename_adversarial_wordnet + "_1_s" + filename_extension)
data_adv2 = pd.read_pickle(filename_adversarial_wordnet + "_2_s" + filename_extension)
data_adv3 = pd.read_pickle(filename_adversarial_wordnet + "_3_s" + filename_extension)
data = data_ori.append(data_adv)
data = data.append(data_adv1)
data = data.append(data_adv2)
data = data.append(data_adv3)
data.to_pickle(file_name_mixed + "_s" + filename_extension)

### Generate Adversarial samples for training

In [7]:
dataWhole = pd.read_pickle(file_name)
X = dataWhole.reviewText
Y = dataWhole.overall
X_trains, X_test, Y_trains, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
dataIter = pd.DataFrame({'reviewText': X_trains, 'overall': Y_trains})

In [8]:
data = dataIter
texts = data['reviewText']
set_adversial = makeAdversarialDataset(100000, 3, texts, data)
data = data.append(dataIter)
data.to_pickle("adversarialTest_train" + filename_extension)
data = pd.DataFrame({'reviewText': X_test, 'overall': Y_test})
data.to_pickle("adversarialTest_test" + filename_extension)

['[137686]: key-day value-Clarence_Shepard_Day_Jr.', '[137686]: key-messy value-mussy', '[137686]: key-easy value-wanton', '[137686]: key-easier value-easy', '[295086]: key-job value-caper', '[295086]: key-nothing value-zippo', '[295086]: key-scary value-shuddery', '[295086]: key-resistant value-repellent', '[295086]: key-same value-like', '[295086]: key-collapse value-crash', '[295086]: key-old value-previous', '[295086]: key-cent value-centime', '[295086]: key-cheap value-chintzy', '[295086]: key-teflon value-polytetrafluoroethylene', '[295086]: key-more value-more_than', '[295086]: key-fine value-hunky-dory', '[295086]: key-heating value-heat', '[295086]: key-i value-I', '[295086]: key-rubber value-gumshoe', '[295086]: key-season value-time_of_year', '[295086]: key-plastic value-charge_plate', '[295086]: key-impressive value-telling', '[295086]: key-panel value-board', '[295086]: key-outside value-away', '[295086]: key-loud value-forte', '[295086]: key-little value-small', '[295086]

In [10]:
data_ori_train = pd.read_pickle("adversarialTest_train.pkl")
data_ori_test = pd.read_pickle("adversarialTest_test.pkl")
data_adv = pd.read_pickle(filename_adversarial_wordnet + filename_extension)
data_adv1 = pd.read_pickle(filename_adversarial_wordnet + "_1" + filename_extension)
data_adv2 = pd.read_pickle(filename_adversarial_wordnet + "_2" + filename_extension)
data_adv3 = pd.read_pickle(filename_adversarial_wordnet + "_3" + filename_extension)
data = data_ori_train.append(data_ori_test)
data = data.append(data_adv)
data = data.append(data_adv1)
data = data.append(data_adv2)
data = data.append(data_adv3)
data.to_pickle("adversarialTest" + filename_extension)

In [11]:
print(len(data_ori_train))
print(len(data_ori_test))
print(len(data_adv))
print(len(data_adv1))
print(len(data_adv2))
print(len(data_adv3))

164540
20568
20568
20568
20568
20568


In [12]:
len(data)

267380