In [1]:
import os
import re
import glob
from collections import Counter,defaultdict
import pandas as pd
import numpy as np
import pickle

from numpy.random import RandomState
from sklearn import metrics
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split,KFold
import matplotlib.pyplot as plt

pd.set_option('display.max_colwidth', -1)

  from ipykernel import kernelapp as app


In [2]:
STANCES = ["agree", "neutral", "disagree"]
CLASS_NUMS = {s: i for i, s in enumerate(STANCES)}

In [42]:
# move to utils.py later
nli2stance = {'entailment': CLASS_NUMS['agree'], 
              'neutral': CLASS_NUMS['neutral'], 
              'contradiction': CLASS_NUMS['disagree']}

float2stance = {1.0: CLASS_NUMS['agree'],
               0.0: CLASS_NUMS['neutral'],
               -1.0: CLASS_NUMS['disagree']}

stance2nli = {0: 'entailment', 1: 'neutral', 2: 'contradiction'}
                
def stance_reg(label):
    """
    Regularize the stance labels 
    :param label: a label of str (agree(s)/entailment, neutral, disagree(s)/contradiction), 
     int (0, 1, 2) or str of int, or float (1.0, 0.0, -1.0)
    :return: the label as the corresponding class_num
    """
    
    if type(label) == str:
        if label.isalpha(): # could be a,n,d or NLI labels
            if label in STANCES:
                return CLASS_NUMS[label]
            elif label[-1] == 's':
                return CLASS_NUMS[label[:-1]]
            else:
                return nli2stance[label]
        else: # label is str of (0, 1, 2)
            return int(label)
    elif type(label) == float:
        return float2stance[label]
    else:
        return label

In [221]:
def add_backtrans_train(train_df,language,upsample=False):
    """
    Create df with backtranslations of train_df 
    :param train_df: base training data
    :param language: 'fr' or 'zh'
    :return: new df with previous training data + augmented data
    """
    
    backtrans_df = pd.DataFrame({
        'round':train_df['round'].values,
        'batch':train_df.batch.values,
        'sent_id':train_df.sent_id.values,
        'stance':train_df.stance.values,
        'sentence':[get_backtrans(guid,language) for guid in train_df.guid],
        'is_high_iaa':train_df.is_high_iaa.values,
        'guid':[guid+'_'+language for guid in train_df.guid]
    })
    
    backtrans_df = backtrans_df.loc[backtrans_df.stance.isin({'disagrees','disagree'})].append(
        train_df,ignore_index=True)
    return backtrans_df

# Load data

In [117]:
# labeled_data = pd.read_pickle('./data/labeled_data_df.pkl')
# labeled_data.shape

In [118]:
# labeled_data.type.value_counts()

## Estimated labels (MTurk)

In [4]:
est_labels = pd.read_csv('/Users/yiweiluo/scientific-debates/\
3_cc_stance/MTurk/MTurk_results/sent_scores_df.tsv',delimiter='\t',index_col=0)
est_labels['max_prob_label'] = est_labels[['disagree','neutral','agree']].idxmax(axis=1)
dedup_est_labels = est_labels.drop_duplicates('sentence',keep='first')
est_labels.shape, dedup_est_labels.shape

((2050, 8), (2042, 8))

In [5]:
assert len(dedup_est_labels) == 2042
dedup_est_labels['guid'] = ["{}_{}_{}".format(row['round'],row['batch'],row['sent_id']) 
                      for _,row in dedup_est_labels.iterrows()]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [6]:
dedup_est_labels.head()

Unnamed: 0,round,batch,sent_id,disagree,neutral,agree,sentence,max_prob_label,guid
0,1,0,t0,0.004241,0.260963,0.734797,"Warmer-than-normal sea surface temperatures are a key player in the development of hurricanes such as Katrina and superstorm Sandy, which hit the U.S. east coast in 2011.",agree,1_0_t0
1,1,0,t1,0.001548,0.996214,0.002239,We will continue to rely in part on fossil fuels while we transition to a low-carbon economy .,neutral,1_0_t1
2,1,0,t10,0.00144,0.996503,0.002057,The actual rise in sea levels measured only 1.2 millimeters instead of the previously accepted 1.6 to 1.9 millimeters.,neutral,1_0_t10
3,1,0,t11,0.996815,0.001588,0.001596,Claims of global warming have been greatly exaggerated.,disagree,1_0_t11
4,1,0,t12,0.035201,0.959757,0.005042,The Intergovernmental Panel on Climate Change should be clearer on how it draws conclusions from the body of research it assesses when gauging the impacts of global warming.,neutral,1_0_t12


## Raw labels (MTurk)

In [7]:
worker_labels_per_round = pickle.load(open('../MTurk/MTurk_results/full_ratings_per_round.pkl','rb'))

In [8]:
PROP_AGREE = 0.75
NUM_ROUNDS, NUM_BATCHES, NUM_WORKERS = 5, 10, 8

In [132]:
round_df,batch_df,sentid_df,stance_df,text_df,high_iaa_df,guid_df = [],[],[],[],[],[],[]
for r in range(1,1+NUM_ROUNDS):
    for b in range(NUM_BATCHES):
        labels = worker_labels_per_round[r][b]
        for s_id in labels.index[5:-1]:
            round_df.append(r)
            batch_df.append(b)
            sentid_df.append(s_id)
            text_df.append(labels.loc[s_id].sentence)
            guid_df.append("{}_{}_{}".format(r,b,s_id))
            
            ratings = labels.loc[s_id][['worker_{}'.format(w_id) for w_id in range(NUM_WORKERS)]].values
            top_rating = Counter(ratings).most_common()[0]
            if top_rating[-1] >= PROP_AGREE*NUM_WORKERS:
                stance_df.append(top_rating[0])
                high_iaa_df.append(True)
            else:
                stance_df.append(est_labels.loc[(est_labels['round'] == r) & 
                                             (est_labels['batch'] == b) & 
                                             (est_labels['sent_id'] == s_id)].max_prob_label.values[0])
                high_iaa_df.append(False)

mturk_df = pd.DataFrame({'round':round_df,"batch":batch_df,"sent_id":sentid_df,"stance":stance_df,
                 "sentence":text_df,'is_high_iaa':high_iaa_df,'guid':guid_df})
mturk_df = mturk_df.drop_duplicates('sentence',keep='first')
mturk_df.reset_index(drop=True,inplace=True)
mturk_df.shape

(2042, 7)

In [133]:
mturk_df.head()

Unnamed: 0,round,batch,sent_id,stance,sentence,is_high_iaa,guid
0,1,0,t0,agree,"Warmer-than-normal sea surface temperatures are a key player in the development of hurricanes such as Katrina and superstorm Sandy, which hit the U.S. east coast in 2011.",False,1_0_t0
1,1,0,t1,neutral,We will continue to rely in part on fossil fuels while we transition to a low-carbon economy .,True,1_0_t1
2,1,0,t10,neutral,The actual rise in sea levels measured only 1.2 millimeters instead of the previously accepted 1.6 to 1.9 millimeters.,True,1_0_t10
3,1,0,t11,disagrees,Claims of global warming have been greatly exaggerated.,True,1_0_t11
4,1,0,t12,neutral,The Intergovernmental Panel on Climate Change should be clearer on how it draws conclusions from the body of research it assesses when gauging the impacts of global warming.,True,1_0_t12


In [134]:
mturk_df.is_high_iaa.value_counts()

True     1119
False    923 
Name: is_high_iaa, dtype: int64

## Back translations

In [12]:
back_trans_fr = pd.read_csv('../datasets/mturk_french_backtranslations.tsv',sep='\t',
                        header=0,index_col=0)
back_trans_zh = pd.read_csv('../datasets/mturk_zh_backtranslations.tsv',sep='\t',
                        header=0,index_col=0)

In [13]:
def get_backtrans(guid,language):
    r,b,s_id = guid.split('_')
    if language == 'fr':
        return back_trans_fr.loc[(back_trans_fr['round'] == int(r)) &
                                (back_trans_fr['batch'] == int(b)) &
                                (back_trans_fr['sent_id'] == s_id)].backtranslation.values[0]
    else:
        return back_trans_zh.loc[(back_trans_fr['round'] == int(r)) &
                                (back_trans_fr['batch'] == int(b)) &
                                (back_trans_fr['sent_id'] == s_id)].backtranslation_zh_en.values[0]

In [76]:
get_backtrans('1_0_t0','fr')

'Warmer than normal sea surface temperatures are a key player in the development of hurricanes such as Katrina and Sandstorm Sandy, which hit the east coast of the United States in 2011.'

In [77]:
get_backtrans('1_0_t0','zh')

'Above-normal sea-level temperatures were a key factor in the development of hurricanes such as Hurricane Katrina and Sandy, which hit the US East Coast in 2011.'

## Sentence windows

In [14]:
fnames = os.listdir('../../1_data_scraping/fulltexts')

In [25]:
all_round_data = {r: {} for r in range(1,6)}
for round_no in range(1,6):
    all_round_data[round_no] = pickle.load(open('/Users/yiweiluo/Dropbox/research/QP2/code/Fox_and_friends/\
LIVE_ROUND{}_BATCH_DATA.pkl'.format(round_no),'rb'))
    
data_for_mturk_df = pd.read_pickle('/Users/yiweiluo/Dropbox/research/QP2/code/Fox_and_friends/\
data_for_mturk_2020.pkl')
data_for_mturk_df_old = pd.read_pickle('/Users/yiweiluo/Dropbox/research/QP2/code/Fox_and_friends/\
data_for_mturk.pkl')

In [16]:
from nltk.tokenize import sent_tokenize
from fuzzywuzzy import process

def get_window(guid,window_size):
    r,b,s_id = guid.split('_')
    target_sent = mturk_df.loc[(mturk_df['round'] == int(r)) &
                                (mturk_df['batch'] == int(b)) &
                                (mturk_df['sent_id'] == s_id)].sentence.values[0]

#     print('Target sent:',target_sent)
#     print('Round: {}, batch: {}, sent_id: {}'.format(r,b,s_id))
    rb_df = pd.DataFrame(all_round_data[int(r)][int(b)])
    df_key = rb_df.loc[rb_df.sent_id == s_id].df_key.values[0]
#     print('df key:',df_key)
    
    if int(r) < 5:
        sent_key = data_for_mturk_df_old.loc[df_key].sent_key
    else:
        sent_key = data_for_mturk_df.loc[df_key].sent_key
        
    url = sent_key.split(' of ')[-1].split('://')[-1]
    #print('url:',url)
    
    fname = url.replace('/','[SEP]')
    fname = '{}.txt'.format(fname) if '{}.txt'.format(fname) in fnames else '{}.txt'.format(fname[:90])
    #print('fname:',fname)
    
    if fname in fnames:
        with open(os.path.join('../../1_data_scraping/fulltexts',fname)) as f:
            text = f.readlines()
        if len(text) > 0:
            text = text[0]

            text_sents = sent_tokenize(text)
            sent_with_target = process.extract(target_sent, text_sents, limit=1)
            #print('Found sentence containing target sent:',sent_with_target)
            ix_target_sent = text_sents.index(sent_with_target[0][0])

            w_start = max(0,ix_target_sent-window_size)
            w_end = min(ix_target_sent+window_size,len(text_sents)-1)
            w_left = text_sents[w_start:ix_target_sent]
            w_right = text_sents[ix_target_sent+1:w_end+1]
            #print('Left sentence(s):',w_left)
            #print('Right sentence(s):',w_right)
            BERT_input = '[SEP] '.join(w_left)+' [SEP] [CLS] '+target_sent+' [SEP] '+' [SEP] '.join(w_right)
            if BERT_input[:6] != ' [SEP]':
                #print('Padding beginning with [SEP]...')
                BERT_input = '[SEP] '+BERT_input
                
            return BERT_input
        else:
            print('Fulltext is empty!')
    else:
        print('Fulltext file not found!')

In [17]:
get_window('1_0_t0',1)

'[SEP] “I think it’s very important to remind people the scope of what can happen with the hurricane season.”  Nonetheless, the events surrounding the hurricane, which caused $108 billion of damage, continue to interest to the scientific community. [SEP] [CLS] Warmer-than-normal sea surface temperatures are a key player in the development of hurricanes such as Katrina and superstorm Sandy, which hit the U.S. east coast in 2011. [SEP] “These storms may not have been caused by global warming, but because the ocean’s surface is warmer, it makes the storm more powerful,” Thomas Wagner, cryosphere program manager at NASA headquarters in Washington, D.C. told FoxNews.com.'

In [18]:
get_window('1_0_t0',2)

'[SEP] “We haven’t had a Category 3 hit the U.S. in 10 years – I think there’s a lot of complacency out there,” she said during a panel discussion at an American Meteorological Society conference in June.[SEP] “I think it’s very important to remind people the scope of what can happen with the hurricane season.”  Nonetheless, the events surrounding the hurricane, which caused $108 billion of damage, continue to interest to the scientific community. [SEP] [CLS] Warmer-than-normal sea surface temperatures are a key player in the development of hurricanes such as Katrina and superstorm Sandy, which hit the U.S. east coast in 2011. [SEP] “These storms may not have been caused by global warming, but because the ocean’s surface is warmer, it makes the storm more powerful,” Thomas Wagner, cryosphere program manager at NASA headquarters in Washington, D.C. told FoxNews.com. [SEP] “Then, because sea level is higher, the water can go further inland from the storm surge.”  President Obama briefly 

## SemEval tweets

In [19]:
semeval_test = pd.read_csv('../datasets/StanceDataset/test.csv',header=0,encoding='utf-8',engine='python')
semeval_test = semeval_test[semeval_test['Target'] == 'Climate Change is a Real Concern']

semeval_train = pd.read_csv('../datasets/StanceDataset/train.csv',header=0,encoding='utf-8', engine='python')
semeval_train = semeval_train[semeval_train['Target'] == 'Climate Change is a Real Concern']

len(semeval_test),len(semeval_train)

(169, 395)

In [20]:
semeval_test = semeval_test[['Tweet','Stance']]
semeval_train = semeval_train[['Tweet','Stance']]

In [21]:
tweetstance2label = {'NONE': CLASS_NUMS['neutral'],
                    'FAVOR': CLASS_NUMS['agree'],
                    'AGAINST': CLASS_NUMS['disagree']}

semeval_test['stance'] = semeval_test['Stance'].apply(lambda x: tweetstance2label[x])
semeval_train['stance'] = semeval_train['Stance'].apply(lambda x: tweetstance2label[x])
semeval_df = semeval_test.append(semeval_train,ignore_index=True)

In [22]:
semeval_df.stance.value_counts()

0    335
1    203
2    26 
Name: stance, dtype: int64

## Add additional info: original source media leaning

In [28]:
def get_orig_media_slant(guid):
    r,b,s_id = guid.split('_')
    if int(r) < 5:
        df_ = data_for_mturk_df_old
    else:
        df_ = data_for_mturk_df
    
    b_df_ = pd.DataFrame(all_round_data[int(r)][int(b)])
    df_key = b_df_.loc[b_df_.sent_id == s_id].df_key.values[0]
    
    def str_to_int(s):
        return int(s == 'pro') # 1 for pro, 0 for anti
        
    return str_to_int(df_.iloc[df_key].bias)

# Create train/dev/test splits

In [33]:
df_getter = {'raw_mturk': mturk_df,
            'est_mturk': dedup_est_labels,
            'semeval': semeval_df}

print(get_orig_media_slant('1_0_t12'))
print('\n')
print(get_window('1_0_t0',1))
print('\n')
print(get_backtrans('1_0_t12','zh'))

1


[SEP] “I think it’s very important to remind people the scope of what can happen with the hurricane season.”  Nonetheless, the events surrounding the hurricane, which caused $108 billion of damage, continue to interest to the scientific community. [SEP] [CLS] Warmer-than-normal sea surface temperatures are a key player in the development of hurricanes such as Katrina and superstorm Sandy, which hit the U.S. east coast in 2011. [SEP] “These storms may not have been caused by global warming, but because the ocean’s surface is warmer, it makes the storm more powerful,” Thomas Wagner, cryosphere program manager at NASA headquarters in Washington, D.C. told FoxNews.com.


How the Intergovernmental Panel on Climate Change should draw clearer conclusions from the research findings it assesses when assessing the effects of global warming.


In [31]:
df_getter['raw_mturk'].loc[df_getter['raw_mturk'].guid == '1_0_t12']

Unnamed: 0,round,batch,sent_id,stance,sentence,is_high_iaa,guid
4,1,0,t12,neutral,The Intergovernmental Panel on Climate Change should be clearer on how it draws conclusions from the body of research it assesses when gauging the impacts of global warming.,True,1_0_t12


In [214]:
def write_data(name,seed,desc,train_df,test_df,dev_df=None,do_downsample=False):
    """
    Writes data to a directory containing train.tsv, test.tsv, and optionally dev.tsv.
    :param name: name of directory (type of train/eval data)
    :param seed: random_seed used
    :param desc: list of type str with manipulations made (e.g., downsampled, upsampled, backtrans_fr, window_1)
    :return: None
    """
    
    # Check that train and eval text are deduplicated
    train_guids = set([x.replace('_fr','').replace('_zh','') for x in train_df.guid])
    test_guids = set([x.replace('_fr','').replace('_zh','') for x in test_df.guid])
    assert train_guids.intersection(test_guids) == set()
    print("Train/test text overlap:",set(train_df.sentence).intersection(set(test_df.sentence)))
    if dev_df is not None:
        dev_guids = set([x.replace('_fr','').replace('_zh','') for x in dev_df.guid])
        assert train_guids.intersection(dev_guids) == set()
        print("Train/dev text overlap:",set(train_df.sentence).intersection(set(dev_df.sentence)))
    
    # Make save_dir
    if do_downsample:
        desc.append('downsampled')
    save_dir = os.path.join('save',"_".join([name]+desc+[str(seed)]))
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
        
    # Regularize labels
    train_df['reg_stance'] = train_df['stance'].apply(stance_reg)
    test_df['reg_stance'] = test_df['stance'].apply(stance_reg)
    if dev_df is not None:
        dev_df['reg_stance'] = dev_df['stance'].apply(stance_reg) 
        
    # Aggregate examples by stance for downsampling/upsampling needs
    train_df_by_stance = {s: train_df.loc[train_df.reg_stance == i] for i,s in enumerate(STANCES)} 
    test_df_by_stance = {s: test_df.loc[test_df.reg_stance == i] for i,s in enumerate(STANCES)}
    dev_df_by_stance = {s: dev_df.loc[dev_df.reg_stance == i] for i,s in enumerate(STANCES)} if dev_df is not None else None

    # Split X, Y
    train_X_by_stance = {s: train_df_by_stance[s].sentence.values for s in STANCES}
    test_X_by_stance = {s: test_df_by_stance[s].sentence.values for s in STANCES}
    dev_X_by_stance = {s: dev_df_by_stance[s].sentence.values for s in STANCES} if dev_df is not None else None
    
    train_Y_by_stance = {s: train_df_by_stance[s].reg_stance.values for s in STANCES} 
    dev_Y_by_stance = {s: dev_df_by_stance[s].reg_stance.values for s in STANCES} if dev_df is not None else None
    test_Y_by_stance = {s: test_df_by_stance[s].reg_stance.values for s in STANCES}

    train_nli_by_stance = {s: train_df_by_stance[s].reg_stance.apply(lambda x: stance2nli[x]).values for s in STANCES}
    dev_nli_by_stance = {s: dev_df_by_stance[s].reg_stance.apply(lambda x: stance2nli[x]).values for s in STANCES} if dev_df is not None else None
    test_nli_by_stance = {s: test_df_by_stance[s].reg_stance.apply(lambda x: stance2nli[x]).values for s in STANCES}

    if do_downsample:
        min_N = min([len(train_X_by_stance[s]) for s in STANCES])
        print('Downsampling to ~{} examples per stance.'.format(min_N))
        for s in STANCES:
            train_X_by_stance[s] = train_X_by_stance[s][:min_N+50]

    trX = []
    trB = []
    trY = []
    trNLI = []
    for i,s in enumerate(STANCES):
        for t, y, nli in zip(train_X_by_stance[s], train_Y_by_stance[s], train_nli_by_stance[s]):
            #for text_b in TEXT_BS:
            trX.append(t)
            #trB.append(text_b)
            trY.append(y)
            trNLI.append(nli)

    teX = []
    teB = []
    teY = []
    teNLI = []
    for i,s in enumerate(STANCES):
        for t, y, nli in zip(test_X_by_stance[s], test_Y_by_stance[s], test_nli_by_stance[s]):
            #for text_b in TEXT_BS:
            teX.append(t)
            #teB.append(text_b)
            teY.append(y)
            teNLI.append(nli)

    if dev_df is not None:
        vaX = []
        vaY = []
        vaNLI = []
        for i,s in enumerate(STANCES):
            for t, y, nli in zip(dev_X_by_stance[s], dev_Y_by_stance[s], dev_nli_by_stance[s]):
                vaX.append(t)
                vaY.append(y)
                vaNLI.append(nli)


    test_dat = pd.DataFrame({'sentence':teX,'stance':teY,'nli_label':teNLI})
    train_dat = pd.DataFrame({'sentence':trX,'stance':trY,'nli_label':trNLI}) 
    val_dat = pd.DataFrame({'sentence':vaX,'stance':vaY,'nli_label':vaNLI}) if dev_df is not None else None
    
    print('Train distribution:')
    print(train_dat.stance.value_counts()) 
    print(train_dat.nli_label.value_counts())
    if dev_df is not None:
        print('\nDev distribution:')
        print(val_dat.stance.value_counts())
        print(val_dat.nli_label.value_counts())
    print('\nTest distribution:')
    print(test_dat.stance.value_counts())
    print(test_dat.stance.value_counts()/np.sum(test_dat.stance.value_counts().values))
    print(test_dat.nli_label.value_counts())

    train_dat.to_csv(save_dir+'/train.tsv',sep='\t',header=None,index=False)
    if dev_df is not None:
        val_dat.to_csv(save_dir+'/dev.tsv',sep='\t',header=None,index=False)
    test_dat.to_csv(save_dir+'/test.tsv',sep='\t',header=None,index=False)

## Train on high_iaa, test on rest

In [174]:
guids_train = mturk_df.loc[mturk_df.is_high_iaa].guid.values
guids_test = mturk_df.loc[~mturk_df.is_high_iaa].guid.values
len(guids_train),len(guids_test)

(1119, 923)

In [175]:
train_df = mturk_df.loc[mturk_df.guid.isin(guids_train)]
test_df = mturk_df.loc[mturk_df.guid.isin(guids_test)]
train_df.shape,test_df.shape

((1119, 7), (923, 7))

In [176]:
train_df.head()

Unnamed: 0,round,batch,sent_id,stance,sentence,is_high_iaa,guid
1,1,0,t1,neutral,We will continue to rely in part on fossil fuels while we transition to a low-carbon economy .,True,1_0_t1
2,1,0,t10,neutral,The actual rise in sea levels measured only 1.2 millimeters instead of the previously accepted 1.6 to 1.9 millimeters.,True,1_0_t10
3,1,0,t11,disagrees,Claims of global warming have been greatly exaggerated.,True,1_0_t11
4,1,0,t12,neutral,The Intergovernmental Panel on Climate Change should be clearer on how it draws conclusions from the body of research it assesses when gauging the impacts of global warming.,True,1_0_t12
7,1,0,t15,agrees,Simply reducing emissions will not sufficiently limit global warming.,True,1_0_t15


In [178]:
#write_data('high_iaa_train','',[],train_df,test_df)

### Augment w/ back translations

In [179]:
backtrans_fr_train_df = add_backtrans_train(train_df,'fr')
backtrans_fr_train_df.shape

(2238, 8)

In [181]:
#write_data('high_iaa_train','',['backtrans_fr'],backtrans_fr_train_df,test_df)

In [97]:
backtrans_zh_train_df = pd.DataFrame({
    'round':train_df['round'].values,
    'batch':train_df.batch.values,
    'sent_id':train_df.sent_id.values,
    'stance':train_df.stance.values,
    'sentence':[get_backtrans(guid,'zh') for guid in train_df.guid],
    'is_high_iaa':train_df.is_high_iaa.values,
    'guid':[guid+'_zh' for guid in train_df.guid]
})
backtrans_zh_train_df = backtrans_zh_train_df.append(train_df,ignore_index=True)
backtrans_zh_train_df.shape

(2238, 7)

In [182]:
#write_data('high_iaa_train','',['backtrans_zh'],backtrans_zh_train_df,test_df)

In [99]:
backtrans_both_train_df = backtrans_zh_train_df.append(backtrans_fr_train_df,ignore_index=True).\
drop_duplicates('guid',keep='first')
backtrans_both_train_df.shape

(3357, 7)

In [100]:
1119*3

3357

In [183]:
#write_data('high_iaa_train','',['backtrans_both'],backtrans_both_train_df,test_df)

## All SemEval tweets as eval

In [66]:
semeval_df['nli_label'] = semeval_df['stance'].apply(lambda x: stance2nli[x])

In [68]:
os.mkdir('./save/semeval_test')
semeval_df[['Tweet','stance','nli_label']].to_csv('./save/semeval_test'+'/test.tsv',sep='\t',header=None,index=False)

## SemEval as train, dev, and test

In [77]:
semeval_df['nli_label'] = semeval_df['stance'].apply(lambda x: stance2nli[x])
semeval_df['sentence'] = semeval_df['Tweet']
train_ix,eval_ix = train_test_split(list(semeval_df.index),test_size=0.3,random_state=42)
dev_ix,test_ix = train_test_split(eval_ix,test_size=0.55,random_state=42)
len(train_ix),len(dev_ix),len(test_ix)

(394, 76, 94)

In [78]:
train_df = semeval_df.loc[semeval_df.index.isin(train_ix)]
dev_df = semeval_df.loc[semeval_df.index.isin(dev_ix)]
test_df = semeval_df.loc[semeval_df.index.isin(test_ix)]
train_df.shape,dev_df.shape,test_df.shape

((394, 5), (76, 5), (94, 5))

In [86]:
write_data('semeval_train_eval',42,[],train_df,test_df,dev_df)

Train distribution:
0    229
1    144
2    21 
Name: stance, dtype: int64
entailment       229
neutral          144
contradiction    21 
Name: nli_label, dtype: int64

Dev distribution:
0    48
1    26
2    2 
Name: stance, dtype: int64
entailment       48
neutral          26
contradiction    2 
Name: nli_label, dtype: int64

Test distribution:
0    58
1    33
2    3 
Name: stance, dtype: int64
0    0.617021
1    0.351064
2    0.031915
Name: stance, dtype: float64
entailment       58
neutral          33
contradiction    3 
Name: nli_label, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


## Cross-val splits (test on item-response est. label)

In [153]:
seed = 42
np.random.seed(seed)

In [154]:
order = np.arange(len(mturk_df))
np.random.shuffle(order)
len(order)

2042

In [155]:
indices_per_fold = {}
n_folds = 10
for f in range(n_folds):
    test_indices = [order[i] for i in np.arange(len(mturk_df)) if i % n_folds == f]
    nontest_indices = list(set(np.arange(len(mturk_df))) - set(test_indices))
    dev_indices = list(np.random.choice(nontest_indices, size=len(test_indices), replace=False))
    train_indices = list(set(nontest_indices) - set(dev_indices))
    all_indices = set(test_indices).union(set(dev_indices)).union(set(train_indices))
    indices_per_fold[f] = {'train':train_indices,'dev':dev_indices,'test':test_indices}
    print(len(all_indices), len(test_indices) + len(dev_indices) + len(train_indices), len(test_indices), len(dev_indices), len(train_indices))

2042 2042 205 205 1632
2042 2042 205 205 1632
2042 2042 204 204 1634
2042 2042 204 204 1634
2042 2042 204 204 1634
2042 2042 204 204 1634
2042 2042 204 204 1634
2042 2042 204 204 1634
2042 2042 204 204 1634
2042 2042 204 204 1634


In [156]:
pickle.dump(indices_per_fold,open('cross_val_10_seed_42_indices.pkl','wb'))

### Vanilla MTurk (est. labels)

In [208]:
for f in range(1,n_folds):
    fold_0_ix = indices_per_fold[f]
    train_ix = fold_0_ix['train']
    test_ix = fold_0_ix['test']
    dev_ix = fold_0_ix['dev']

    train_df = mturk_df.loc[mturk_df.index.isin(train_ix)]
    dev_df = mturk_df.loc[mturk_df.index.isin(dev_ix)]
    test_df = mturk_df.loc[mturk_df.index.isin(test_ix)]
    print(train_df.shape,dev_df.shape,test_df.shape)
    #write_data('all_mturk_train_fold_{}'.format(f),42,[],train_df,test_df,dev_df)

(1632, 7) (205, 7) (205, 7)
(1634, 7) (204, 7) (204, 7)
(1634, 7) (204, 7) (204, 7)
(1634, 7) (204, 7) (204, 7)
(1634, 7) (204, 7) (204, 7)
(1634, 7) (204, 7) (204, 7)
(1634, 7) (204, 7) (204, 7)
(1634, 7) (204, 7) (204, 7)
(1634, 7) (204, 7) (204, 7)


### Back translation augmented train, with and without downsampling

In [234]:
# Add backtranslations of the train_ix examples to training
# for f in range(0,1):
#     fold_0_ix = indices_per_fold[f]
#     train_ix = fold_0_ix['train']
#     test_ix = fold_0_ix['test']
#     dev_ix = fold_0_ix['dev']

#     train_df = mturk_df.loc[mturk_df.index.isin(train_ix)]
#     dev_df = mturk_df.loc[mturk_df.index.isin(dev_ix)]
#     test_df = mturk_df.loc[mturk_df.index.isin(test_ix)]
#     backtrans_fr_df = add_backtrans_train(train_df,'fr')
#     backtrans_zh_df = add_backtrans_train(train_df,'zh')
#     backtrans_both_df = backtrans_fr_df.append(backtrans_zh_df,ignore_index=True).drop_duplicates('guid',keep='first')
#     print(backtrans_fr_df.shape,backtrans_zh_df.shape,backtrans_both_df.shape,dev_df.shape,test_df.shape)
#     write_data('all_mturk_train_fold_{}'.format(f),seed,['backtrans_fr'],backtrans_fr_df,test_df,dev_df)
#     write_data('all_mturk_train_fold_{}'.format(f),seed,['backtrans_zh'],backtrans_zh_df,test_df,dev_df)
#     write_data('all_mturk_train_fold_{}'.format(f),seed,['backtrans_both'],backtrans_both_df,test_df,dev_df)
#     write_data('all_mturk_train_fold_{}'.format(f),seed,['backtrans_fr'],backtrans_fr_df,test_df,dev_df,do_downsample=True)
#     write_data('all_mturk_train_fold_{}'.format(f),seed,['backtrans_zh'],backtrans_zh_df,test_df,dev_df,do_downsample=True)
#     write_data('all_mturk_train_fold_{}'.format(f),seed,['backtrans_both'],backtrans_both_df,test_df,dev_df,do_downsample=True)

### Back translation + upsample minority class

In [233]:
# for f in range(0,1):
#     fold_0_ix = indices_per_fold[f]
#     train_ix = fold_0_ix['train']
#     test_ix = fold_0_ix['test']
#     dev_ix = fold_0_ix['dev']

#     train_df = mturk_df.loc[mturk_df.index.isin(train_ix)]
#     dev_df = mturk_df.loc[mturk_df.index.isin(dev_ix)]
#     test_df = mturk_df.loc[mturk_df.index.isin(test_ix)]
#     backtrans_fr_df = add_backtrans_train(train_df,'fr',upsample=True)
#     backtrans_zh_df = add_backtrans_train(train_df,'zh',upsample=True)
#     backtrans_both_df = backtrans_fr_df.append(backtrans_zh_df,ignore_index=True).drop_duplicates('guid',keep='first')
#     print(backtrans_fr_df.shape,backtrans_zh_df.shape,backtrans_both_df.shape,dev_df.shape,test_df.shape)
#     write_data('all_mturk_train_fold_{}'.format(f),seed,['backtrans_fr_upsampled'],backtrans_fr_df,test_df,dev_df)
#     write_data('all_mturk_train_fold_{}'.format(f),seed,['backtrans_zh_upsampled'],backtrans_zh_df,test_df,dev_df)
#     write_data('all_mturk_train_fold_{}'.format(f),seed,['backtrans_both_upsampled'],backtrans_both_df,test_df,dev_df)

# SCP to cluster

In [236]:
from paramiko import SSHClient
from scp import SCPClient

ssh = SSHClient()
ssh.load_system_host_keys()
ssh.connect(hostname='jacob.stanford.edu',username='yiweil',password='yldwuaeo2699zhishao15')

# Define progress callback that prints the current percentage completed for the file
def progress(filename, size, sent):
    print("%s\'s progress: %.2f%%   \r" % (filename, float(sent)/float(size)*100) )
    
cluster_data_dir = '/u/scr/yiweil/sci-debates/cc_stance/climate_data'
local_data_dir = './save'

# SCPCLient takes a paramiko transport and progress callback as its arguments.
scp = SCPClient(ssh.get_transport(), progress=progress)

In [238]:
# for file in glob.glob(local_data_dir+'/high_iaa_train_*'):
#     scp.put(file, recursive=True, remote_path=cluster_data_dir)
    
for file in glob.glob(local_data_dir+'/all_mturk_*'):
    scp.put(file, recursive=True, remote_path=cluster_data_dir)
    break

scp.close()

b'dev.tsv''s progress: 0.00%   
b'dev.tsv''s progress: 66.12%   
b'dev.tsv''s progress: 100.00%   
b'test.tsv''s progress: 0.00%   
b'test.tsv''s progress: 62.81%   
b'test.tsv''s progress: 100.00%   
b'train.tsv''s progress: 0.00%   
b'train.tsv''s progress: 7.75%   
b'train.tsv''s progress: 15.51%   
b'train.tsv''s progress: 23.26%   
b'train.tsv''s progress: 31.01%   
b'train.tsv''s progress: 38.77%   
b'train.tsv''s progress: 46.52%   
b'train.tsv''s progress: 54.28%   
b'train.tsv''s progress: 62.03%   
b'train.tsv''s progress: 69.78%   
b'train.tsv''s progress: 77.54%   
b'train.tsv''s progress: 85.29%   
b'train.tsv''s progress: 93.04%   
b'train.tsv''s progress: 100.00%   
