# Process SST

## Load dataset

In [44]:
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import xlrd
import pandas as pd
import numpy as np

In [1]:
# Load SST through Hugging Face `datasets`. The config is spelled out because
# the loader reports falling back to 'default' when none is given.
dataset = load_dataset(path='sst', name='default')

No config specified, defaulting to: sst/default
Reusing dataset sst (/home/dzhang5/.cache/huggingface/datasets/sst/default/1.0.0/b8a7889ef01c5d3ae8c379b84cc4080f8aad3ac2bc538701cbe0ac6416fb76ff)


In [2]:
# Display the DatasetDict summary: split names, feature columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'tokens', 'tree'],
        num_rows: 8544
    })
    validation: Dataset({
        features: ['sentence', 'label', 'tokens', 'tree'],
        num_rows: 1101
    })
    test: Dataset({
        features: ['sentence', 'label', 'tokens', 'tree'],
        num_rows: 2210
    })
})

In [3]:
# Switch every split to the pandas output format before it is sliced.
for split_name in ('train', 'validation', 'test'):
    dataset[split_name].set_format(type='pandas')

In [4]:
# A full slice ([:]) pulls each split out in one piece.
train_data, val_data, test_data = (
    dataset[split_name][:] for split_name in ('train', 'validation', 'test')
)

In [7]:
# Preview the first rows: sentence, float label, '|'-joined tokens and tree.
train_data.head()

Unnamed: 0,sentence,label,tokens,tree
0,The Rock is destined to be the 21st Century 's...,0.69444,The|Rock|is|destined|to|be|the|21st|Century|'s...,70|70|68|67|63|62|61|60|58|58|57|56|56|64|65|5...
1,The gorgeously elaborate continuation of `` Th...,0.83333,The|gorgeously|elaborate|continuation|of|``|Th...,71|70|69|69|67|67|66|64|63|62|62|61|61|58|57|5...
2,Singer\/composer Bryan Adams contributes a sle...,0.625,Singer\/composer|Bryan|Adams|contributes|a|sle...,72|71|71|70|68|68|67|67|66|63|62|62|60|60|58|5...
3,You 'd think by now America would have had eno...,0.5,You|'d|think|by|now|America|would|have|had|eno...,36|35|34|33|33|32|30|29|27|26|25|24|23|23|22|2...
4,Yet the act is still charming here .,0.72222,Yet|the|act|is|still|charming|here|.,15|13|13|10|9|9|11|12|10|11|12|14|14|15|0


In [8]:
# 'tokens' stores each sentence as one '|'-delimited string; turn it into a
# Python list of tokens in all three splits.
for frame in (train_data, val_data, test_data):
    frame['token_list'] = frame['tokens'].map(lambda joined: joined.split('|'))

In [10]:
# Binarise the float sentiment score: True when the score is at least 0.5.
for frame in (train_data, val_data, test_data):
    frame['binary_label'] = frame['label'].ge(0.5)

In [9]:
# Record how many tokens each sentence has.
for frame in (train_data, val_data, test_data):
    frame['token_list_len'] = frame['token_list'].map(len)

## Extract data for human annotation

In [12]:
# How many training sentences have at least 10 tokens?
train_data['token_list_len'].ge(10).sum()

7160

In [13]:
# How many training sentences score in the closed interval [0.4, 0.6]?
train_data['label'].between(0.4, 0.6).sum()

1624

In [14]:
def _sentiment_candidates(frame):
    """Return (positive, negative) rows of `frame` that have at least 15 tokens.

    Positive rows have label >= 0.6 and negative rows have label < 0.4; rows
    scoring in between are left out of both.
    """
    long_enough = frame['token_list_len'] >= 15
    positive = frame.loc[(frame['label'] >= 0.6) & long_enough]
    negative = frame.loc[(frame['label'] < 0.4) & long_enough]
    return positive, negative


train_pos_candidate, train_neg_candidate = _sentiment_candidates(train_data)
val_pos_candidate, val_neg_candidate = _sentiment_candidates(val_data)
test_pos_candidate, test_neg_candidate = _sentiment_candidates(test_data)

In [17]:
random_state = 1


def _draw_index(candidates, n_rows):
    """Pick `n_rows` index labels from `candidates`: the test side of a seeded split."""
    _, picked = train_test_split(candidates.index, test_size=n_rows, random_state=random_state)
    return picked


# 100 sentences per polarity from train, 50 per polarity from validation and test.
train_pos_ham_index = _draw_index(train_pos_candidate, 100)
train_neg_ham_index = _draw_index(train_neg_candidate, 100)

val_pos_ham_index = _draw_index(val_pos_candidate, 50)
val_neg_ham_index = _draw_index(val_neg_candidate, 50)

test_pos_ham_index = _draw_index(test_pos_candidate, 50)
test_neg_ham_index = _draw_index(test_neg_candidate, 50)

In [265]:
# Look the sampled index labels back up in their candidate pools.
train_pos, train_neg = (
    train_pos_candidate.loc[train_pos_ham_index],
    train_neg_candidate.loc[train_neg_ham_index],
)
val_pos, val_neg = (
    val_pos_candidate.loc[val_pos_ham_index],
    val_neg_candidate.loc[val_neg_ham_index],
)
test_pos, test_neg = (
    test_pos_candidate.loc[test_pos_ham_index],
    test_neg_candidate.loc[test_neg_ham_index],
)

In [267]:
# Validation and test get one mixed frame each: positives first, then negatives.
val_all, test_all = (
    pd.concat([val_pos, val_neg]),
    pd.concat([test_pos, test_neg]),
)

In [272]:
# Check the last rows, including the derived token_list / token_list_len / binary_label columns.
test_data.tail()

Unnamed: 0,sentence,label,tokens,tree,token_list,token_list_len,binary_label
2205,An imaginative comedy\/thriller .,0.77778,An|imaginative|comedy\/thriller|.,7|6|5|5|6|7|0,"[An, imaginative, comedy\/thriller, .]",4,True
2206,"( A ) rare , beautiful film .",0.91667,"(|A|)|rare|,|beautiful|film|.",13|12|12|11|10|9|9|15|10|11|14|13|14|15|0,"[(, A, ), rare, ,, beautiful, film, .]",8,True
2207,( An ) hilarious romantic comedy .,0.88889,(|An|)|hilarious|romantic|comedy|.,12|11|11|9|8|8|10|9|10|13|12|13|0,"[(, An, ), hilarious, romantic, comedy, .]",7,True
2208,Never ( sinks ) into exploitation .,0.625,Never|(|sinks|)|into|exploitation|.,11|10|9|9|8|8|13|12|10|11|12|13|0,"[Never, (, sinks, ), into, exploitation, .]",7,True
2209,( U ) nrelentingly stupid .,0.069444,(|U|)|nrelentingly|stupid|.,10|9|9|7|7|8|8|11|10|11|0,"[(, U, ), nrelentingly, stupid, .]",6,False


In [276]:
def write_to_xlsx(data, xname):
    """Write sentences to an .xlsx annotation sheet, three spreadsheet rows each.

    Row 0 is a header. Every sentence then occupies:
      1. its index label, binary label (as 0/1), float label, one token per cell;
      2. a row of 0 placeholders aligned under the tokens;
      3. an empty spacer row.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs 'binary_label', 'label', 'token_list' and 'token_list_len' columns.
    xname : str
        Path of the workbook to create.
    """
    # Imported locally: the notebook's import cell never brings xlsxwriter into
    # scope, so on a fresh kernel the original body failed with NameError.
    import xlsxwriter

    # An empty frame has no usable max(); fall back to a header without token columns.
    max_len = int(data['token_list_len'].max()) if len(data) else 0
    header = ['sentence_index', 'binary_label', 'float_label'] + [f'token_{i}' for i in range(max_len)]

    with xlsxwriter.Workbook(xname) as workbook:
        worksheet = workbook.add_worksheet()
        worksheet.write_row(0, 0, header)
        line_ix = 1

        records = zip(data.index, data['binary_label'], data['label'], data['token_list'])
        for sentence_ix, binary_label, float_label, tokens in records:
            tokens = list(tokens)
            worksheet.write_row(line_ix, 0, [int(sentence_ix), int(binary_label), float_label] + tokens)
            # Placeholder row: one 0 under every token.
            worksheet.write_row(line_ix + 1, 0, ['', '', ''] + [0] * len(tokens))
            worksheet.write_row(line_ix + 2, 0, [''])
            line_ix += 3

In [277]:
# One workbook per training polarity, plus the mixed validation and test workbooks.
for frame, target in (
    (train_pos, 'senti_data/train_pos.xlsx'),
    (train_neg, 'senti_data/train_neg.xlsx'),
    (val_all, 'senti_data/val_mix.xlsx'),
    (test_all, 'senti_data/test_mix.xlsx'),
):
    write_to_xlsx(frame, target)

## Load saved data

In [127]:
# These are the same paths the to_pickle cell further down writes to (after
# the 'embeddings' column is added), so that cell has to have run at least once
# before this one. Only unpickle files you produced yourself.
train_data, val_data, test_data = (
    pd.read_pickle(pickle_path)
    for pickle_path in ('./senti_data/train.p', './senti_data/val.p', './senti_data/test.p')
)

## Load labeled data

In [20]:
# NOTE: xlrd 2.x only opens legacy .xls files; reading these .xlsx workbooks
# needs xlrd < 2.0.
tid_list, b_label_list, text_list, att_list = [], [], [], []
for labeled_path in ('senti_data/train_pos_labeled.xlsx', 'senti_data/train_neg_labeled.xlsx'):
    workbook = xlrd.open_workbook(labeled_path)
    for sheet in workbook.sheets():
        # Row 0 is skipped (header in the write_to_xlsx layout); after it the
        # rows cycle through tokens, attention marks and a spacer.
        for row in range(1, sheet.nrows):
            position = (row - 1) % 3
            if position == 0:
                tid = sheet.cell(row, 0).value
                b_label = sheet.cell(row, 1).value
                # Tokens start at column 3; ctype 0 marks an empty cell.
                text = [cell.value for cell in sheet.row(row)[3:] if cell.ctype != 0]
                tid_list.append(int(tid))
                b_label_list.append(int(b_label))
                text_list.append(text)
            elif position == 1:
                att = [int(cell.value) for cell in sheet.row(row)[3:] if cell.ctype != 0]
                if any(a not in (0, 1) for a in att):
                    print(row)
                # Empty cells are dropped above, so a blank left inside the row
                # would silently shift every later mark; report it.
                if len(att) != len(text):
                    print(f'{labeled_path} row {row}: {len(att)} attention marks for {len(text)} tokens')
                att_list.append(att)

In [39]:
# NOTE: xlrd 2.x only opens legacy .xls files; reading this .xlsx workbook
# needs xlrd < 2.0.
vtid_list, vb_label_list, vtext_list, vatt_list = [], [], [], []
labeled_path = 'senti_data/val_mix_labeled.xlsx'
workbook = xlrd.open_workbook(labeled_path)
for sheet in workbook.sheets():
    # Row 0 is skipped (header in the write_to_xlsx layout); after it the rows
    # cycle through tokens, attention marks and a spacer.
    for row in range(1, sheet.nrows):
        position = (row - 1) % 3
        if position == 0:
            tid = sheet.cell(row, 0).value
            b_label = sheet.cell(row, 1).value
            # Tokens start at column 3; ctype 0 marks an empty cell.
            text = [cell.value for cell in sheet.row(row)[3:] if cell.ctype != 0]
            vtid_list.append(int(tid))
            vb_label_list.append(int(b_label))
            vtext_list.append(text)
        elif position == 1:
            att = [int(cell.value) for cell in sheet.row(row)[3:] if cell.ctype != 0]
            if any(a not in (0, 1) for a in att):
                print(row)
            # Empty cells are dropped above, so a blank left inside the row
            # would silently shift every later mark; report it.
            if len(att) != len(text):
                print(f'{labeled_path} row {row}: {len(att)} attention marks for {len(text)} tokens')
            vatt_list.append(att)

In [131]:
# NOTE: xlrd 2.x only opens legacy .xls files; reading this .xlsx workbook
# needs xlrd < 2.0.
ttid_list, tb_label_list, ttext_list, tatt_list = [], [], [], []
labeled_path = 'senti_data/test_mix_labeled.xlsx'
workbook = xlrd.open_workbook(labeled_path)
for sheet in workbook.sheets():
    # Row 0 is skipped (header in the write_to_xlsx layout); after it the rows
    # cycle through tokens, attention marks and a spacer.
    for row in range(1, sheet.nrows):
        position = (row - 1) % 3
        if position == 0:
            tid = sheet.cell(row, 0).value
            b_label = sheet.cell(row, 1).value
            # Tokens start at column 3; ctype 0 marks an empty cell.
            text = [cell.value for cell in sheet.row(row)[3:] if cell.ctype != 0]
            ttid_list.append(int(tid))
            tb_label_list.append(int(b_label))
            ttext_list.append(text)
        elif position == 1:
            att = [int(cell.value) for cell in sheet.row(row)[3:] if cell.ctype != 0]
            if any(a not in (0, 1) for a in att):
                print(row)
            # Empty cells are dropped above, so a blank left inside the row
            # would silently shift every later mark; report it.
            if len(att) != len(text):
                print(f'{labeled_path} row {row}: {len(att)} attention marks for {len(text)} tokens')
            tatt_list.append(att)

In [132]:
# Partition each split by whether its index label appears among the annotated ids.
train_is_labeled = train_data.index.isin(tid_list)
train_data_with_att = train_data.loc[train_is_labeled]
train_data_without_att = train_data.loc[~train_is_labeled]

val_is_labeled = val_data.index.isin(vtid_list)
val_data_with_att = val_data.loc[val_is_labeled]
val_data_without_att = val_data.loc[~val_is_labeled]

test_is_labeled = test_data.index.isin(ttid_list)
test_data_with_att = test_data.loc[test_is_labeled]
test_data_without_att = test_data.loc[~test_is_labeled]

In [133]:
# Attach the human attention marks to the annotated rows, matching on the index.
# Each *_with_att frame is overwritten in place, so any 'attention' column left
# by an earlier run of this cell is dropped first. Without that, a re-run would
# produce attention_x / attention_y and later ['attention'] lookups would fail.
train_att_data = pd.DataFrame({'attention': att_list}, index=tid_list)
train_data_with_att = (
    train_data_with_att
    .drop(columns='attention', errors='ignore')
    .merge(train_att_data, left_index=True, right_index=True, validate='1:1')
)

val_att_data = pd.DataFrame({'attention': vatt_list}, index=vtid_list)
val_data_with_att = (
    val_data_with_att
    .drop(columns='attention', errors='ignore')
    .merge(val_att_data, left_index=True, right_index=True, validate='1:1')
)

test_att_data = pd.DataFrame({'attention': tatt_list}, index=ttid_list)
test_data_with_att = (
    test_data_with_att
    .drop(columns='attention', errors='ignore')
    .merge(test_att_data, left_index=True, right_index=True, validate='1:1')
)

### Load embeddings

In [116]:
# Build a word -> vector lookup from the GloVe text file, where each line is a
# word followed by its coefficients. The encoding is pinned to UTF-8: open()
# otherwise uses the platform's locale encoding, which can fail on or garble
# the non-ASCII tokens in the vocabulary.
embeddings_index = dict()
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [118]:
def load_embeddings(words_list, max_words=56, embeddings=None):
    """Map every token list to exactly `max_words` embedding vectors.

    Tokens are lower-cased before lookup. Unknown tokens and padding positions
    both receive the table's 'unk' vector.

    Parameters
    ----------
    words_list : iterable of sequences of str
        One token sequence per sentence.
    max_words : int, default 56
        Number of vectors returned per sentence.
    embeddings : mapping, optional
        Word -> vector table that must contain 'unk'. Defaults to the
        notebook-level `embeddings_index`.

    Returns
    -------
    list of list
        One list of `max_words` vectors per input sentence.

    Notes
    -----
    Sentences longer than `max_words` are truncated, with a printed notice.
    The previous version left a row of integer zeros for them, which cannot be
    stacked with the real vectors afterwards.
    """
    table = embeddings_index if embeddings is None else embeddings
    unk = table['unk']

    output_reviews = []
    for words in words_list:
        words = list(words)
        if len(words) > max_words:
            print(f'Truncating a {len(words)}-token sentence to {max_words} tokens')
            words = words[:max_words]

        vectors = [table.get(w.lower(), unk) for w in words]
        # Pad on the right up to max_words.
        vectors.extend([unk] * (max_words - len(vectors)))
        output_reviews.append(vectors)

    return output_reviews

In [119]:
# Add the padded embedding sequence of every sentence as a new column.
for frame in (train_data, val_data, test_data):
    frame['embeddings'] = load_embeddings(frame['token_list'])

In [125]:
# Persist the three frames (now carrying 'embeddings') for later sessions.
for frame, pickle_path in (
    (train_data, './senti_data/train.p'),
    (val_data, './senti_data/val.p'),
    (test_data, './senti_data/test.p'),
):
    frame.to_pickle(pickle_path)

### Shuffle labeled data

In [243]:
# Pool all annotated rows and renumber them 0..n-1, discarding the per-split index labels.
att_data = pd.concat(
    [train_data_with_att, val_data_with_att, test_data_with_att],
    ignore_index=True,
)

In [244]:
random_state = 1

# Split the annotated pool in half, stratified on the binary label.
att_train_ix, att_test_ix = train_test_split(
    att_data.index,
    random_state=random_state,
    stratify=att_data['binary_label'],
    test_size=0.5,
)

In [246]:
# Un-annotated rows of the train and validation splits form one pool.
train_val_without_att = pd.concat(
    objs=[train_data_without_att, val_data_without_att],
)

In [248]:
# Token lists for the four partitions.
for target, token_lists in (
    ('senti_data/raw_text_with_att_train.npy', att_data.loc[att_train_ix, 'token_list']),
    ('senti_data/raw_text_with_att_val_test.npy', att_data.loc[att_test_ix, 'token_list']),
    ('senti_data/raw_text_without_att_train.npy', train_val_without_att['token_list']),
    ('senti_data/raw_text_without_att_val_test.npy', test_data_without_att['token_list']),
):
    np.save(target, token_lists)

In [222]:
# Binary labels for the four partitions, stored as integers.
for target, labels in (
    ('senti_data/y_with_att_train.npy', att_data.loc[att_train_ix, 'binary_label']),
    ('senti_data/y_with_att_val_test.npy', att_data.loc[att_test_ix, 'binary_label']),
    ('senti_data/y_without_att_train.npy', train_val_without_att['binary_label']),
    ('senti_data/y_without_att_val_test.npy', test_data_without_att['binary_label']),
):
    np.save(target, labels.astype(int))

In [258]:
# Embedding sequences for the four partitions, stacked into one array per file.
for target, vectors in (
    ('senti_data/x_with_att_train.npy', att_data.loc[att_train_ix, 'embeddings']),
    ('senti_data/x_with_att_val_test.npy', att_data.loc[att_test_ix, 'embeddings']),
    ('senti_data/x_without_att_train.npy', train_val_without_att['embeddings']),
    ('senti_data/x_without_att_val_test.npy', test_data_without_att['embeddings']),
):
    np.save(target, np.array(vectors.tolist()))

In [259]:
# Human attention marks for the two annotated partitions.
for target, marks in (
    ('senti_data/att_labels_with_att_train.npy', att_data.loc[att_train_ix, 'attention']),
    ('senti_data/att_labels_with_att_val_test.npy', att_data.loc[att_test_ix, 'attention']),
):
    np.save(target, marks)