In [1]:
import os
import re
import glob
from collections import Counter,defaultdict
import pandas as pd
import numpy as np
import pickle

from numpy.random import RandomState
from sklearn import metrics
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split,KFold
import matplotlib.pyplot as plt

pd.set_option('display.max_colwidth', -1)

  from ipykernel import kernelapp as app


In [2]:
STANCES = ["agree", "neutral", "disagree"]
CLASS_NUMS = {s: i for i, s in enumerate(STANCES)}

In [3]:
# move to utils.py later
nli2stance = {'entailment': CLASS_NUMS['agree'], 
              'neutral': CLASS_NUMS['neutral'], 
              'contradiction': CLASS_NUMS['disagree']}

float2stance = {1.0: CLASS_NUMS['agree'],
               0.0: CLASS_NUMS['neutral'],
               -1.0: CLASS_NUMS['disagree']}

stance2nli = {0: 'entailment', 1: 'neutral', 2: 'contradiction'}
                
def stance_reg(label):
    """
    Regularize the stance labels 
    :param label: a label of str (agree(s)/entailment, neutral, disagree(s)/contradiction), 
     int (0, 1, 2) or str of int, or float (1.0, 0.0, -1.0)
    :return: the label as the corresponding class_num
    """
    
    if type(label) == str:
        if label.isalpha(): # could be a,n,d or NLI labels
            if label in STANCES:
                return CLASS_NUMS[label]
            elif label[-1] == 's':
                return CLASS_NUMS[label[:-1]]
            else:
                return nli2stance[label]
        else: # label is str of (0, 1, 2)
            return int(label)
    elif type(label) == float:
        return float2stance[label]
    else:
        return label

In [4]:
def add_backtrans_train(train_df,language,upsample=False):
    """
    Create df with backtranslations of train_df 
    :param train_df: base training data
    :param language: 'fr' or 'zh'
    :return: new df with previous training data + augmented data
    """
    
    backtrans_df = pd.DataFrame({
        'round':train_df['round'].values,
        'batch':train_df.batch.values,
        'sent_id':train_df.sent_id.values,
        'stance':train_df.stance.values,
        'sentence':[get_backtrans(guid,language) for guid in train_df.guid],
        'guid':[guid+'_'+language for guid in train_df.guid]
    })
    
    
    if upsample:
        backtrans_df = backtrans_df.loc[backtrans_df.stance.isin({'disagrees','disagree'})].append(
            train_df,ignore_index=True)
    else:
        backtrans_df = backtrans_df.append(train_df,ignore_index=True)
        
    return backtrans_df

# Load data

In [117]:
# labeled_data = pd.read_pickle('./data/labeled_data_df.pkl')
# labeled_data.shape

In [118]:
# labeled_data.type.value_counts()

## Estimated labels (MTurk)

In [195]:
est_labels = pd.read_csv('/Users/yiweiluo/scientific-debates/\
3_cc_stance/MTurk/MTurk_results/sent_scores_df_final.tsv',delimiter='\t',index_col=0)
est_labels['max_prob_label'] = est_labels[['disagree','neutral','agree']].idxmax(axis=1)
dedup_est_labels = est_labels.drop_duplicates('sentence',keep='first')
est_labels.shape, dedup_est_labels.shape

((2050, 8), (2042, 8))

In [196]:
assert len(dedup_est_labels) == 2042
dedup_est_labels['guid'] = ["{}_{}_{}".format(row['round'],row['batch'],row['sent_id']) 
                      for _,row in dedup_est_labels.iterrows()]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [197]:
dedup_est_labels['stance'] = dedup_est_labels['max_prob_label']
dedup_est_labels['max_prob'] = dedup_est_labels[['disagree','neutral','agree']].max(axis=1)
dedup_est_labels.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,round,batch,sent_id,disagree,neutral,agree,sentence,max_prob_label,guid,stance,max_prob
0,1,0,t0,0.003105,0.285634,0.71126,"Warmer-than-normal sea surface temperatures are a key player in the development of hurricanes such as Katrina and superstorm Sandy, which hit the U.S. east coast in 2011.",agree,1_0_t0,agree,0.71126
1,1,0,t1,0.00083,0.998006,0.001163,We will continue to rely in part on fossil fuels while we transition to a low-carbon economy .,neutral,1_0_t1,neutral,0.998006
2,1,0,t10,0.000802,0.998023,0.001174,The actual rise in sea levels measured only 1.2 millimeters instead of the previously accepted 1.6 to 1.9 millimeters.,neutral,1_0_t10,neutral,0.998023
3,1,0,t11,0.997695,0.001134,0.001171,Claims of global warming have been greatly exaggerated.,disagree,1_0_t11,disagree,0.997695
4,1,0,t12,0.031351,0.965687,0.002962,The Intergovernmental Panel on Climate Change should be clearer on how it draws conclusions from the body of research it assesses when gauging the impacts of global warming.,neutral,1_0_t12,neutral,0.965687


In [191]:
held_out_test = pd.read_csv('./save/held_out_balanced_test.tsv',sep='\t',header=0)
#held_out_test.head()

In [194]:
held_out_test.stance.value_counts()

200

In [200]:
dedup_est_labels = dedup_est_labels.loc[~dedup_est_labels.guid.isin(held_out_test.guid)]
dedup_est_labels.reset_index(drop=True,inplace=True)
dedup_est_labels.shape # Expect 2042-200 = 1842

(1842, 11)

In [201]:
assert set(dedup_est_labels.guid.values).intersection(
    set(held_out_test.guid.values)) == set()

In [202]:
set(dedup_est_labels.sentence.values).intersection(
set(held_out_test.sentence.values))

set()

## Raw labels (MTurk)

In [203]:
worker_labels_per_round = pickle.load(open('../MTurk/MTurk_results/full_ratings_per_round.pkl','rb'))

In [204]:
PROP_AGREE = 0.75
NUM_ROUNDS, NUM_BATCHES, NUM_WORKERS = 5, 10, 8

In [205]:
round_df,batch_df,sentid_df,stance_df,text_df,high_iaa_df,guid_df = [],[],[],[],[],[],[]
for r in range(1,1+NUM_ROUNDS):
    for b in range(NUM_BATCHES):
        labels = worker_labels_per_round[r][b]
        for s_id in labels.index[5:-1]:
            round_df.append(r)
            batch_df.append(b)
            sentid_df.append(s_id)
            text_df.append(labels.loc[s_id].sentence)
            guid_df.append("{}_{}_{}".format(r,b,s_id))
            
            ratings = labels.loc[s_id][['worker_{}'.format(w_id) for w_id in range(NUM_WORKERS)]].values
            top_rating = Counter(ratings).most_common()[0]
            if top_rating[-1] >= PROP_AGREE*NUM_WORKERS:
                stance_df.append(top_rating[0])
                high_iaa_df.append(True)
            else:
                stance_df.append(est_labels.loc[(est_labels['round'] == r) & 
                                             (est_labels['batch'] == b) & 
                                             (est_labels['sent_id'] == s_id)].max_prob_label.values[0])
                high_iaa_df.append(False)

mturk_df = pd.DataFrame({'round':round_df,"batch":batch_df,"sent_id":sentid_df,"stance":stance_df,
                 "sentence":text_df,'is_high_iaa':high_iaa_df,'guid':guid_df})
mturk_df = mturk_df.drop_duplicates('sentence',keep='first')
mturk_df.reset_index(drop=True,inplace=True)
mturk_df.shape

(2042, 7)

In [206]:
mturk_df.head()

Unnamed: 0,round,batch,sent_id,stance,sentence,is_high_iaa,guid
0,1,0,t0,agree,"Warmer-than-normal sea surface temperatures are a key player in the development of hurricanes such as Katrina and superstorm Sandy, which hit the U.S. east coast in 2011.",False,1_0_t0
1,1,0,t1,neutral,We will continue to rely in part on fossil fuels while we transition to a low-carbon economy .,True,1_0_t1
2,1,0,t10,neutral,The actual rise in sea levels measured only 1.2 millimeters instead of the previously accepted 1.6 to 1.9 millimeters.,True,1_0_t10
3,1,0,t11,disagrees,Claims of global warming have been greatly exaggerated.,True,1_0_t11
4,1,0,t12,neutral,The Intergovernmental Panel on Climate Change should be clearer on how it draws conclusions from the body of research it assesses when gauging the impacts of global warming.,True,1_0_t12


In [207]:
mturk_df = mturk_df.loc[~mturk_df['guid'].isin(set(held_out_test.guid.values))]
mturk_df.reset_index(drop=True,inplace=True)
mturk_df.shape

(1842, 7)

In [208]:
mturk_df.is_high_iaa.value_counts()

True     1010
False    832 
Name: is_high_iaa, dtype: int64

In [209]:
assert set(mturk_df.guid.values).intersection(
    set(held_out_test.guid.values)) == set()

In [210]:
set(mturk_df.sentence.values).intersection(
set(held_out_test.sentence.values))

set()

## Back translations

In [82]:
back_trans_fr = pd.read_csv('../datasets/mturk_french_backtranslations.tsv',sep='\t',
                        header=0,index_col=0)
back_trans_zh = pd.read_csv('../datasets/mturk_zh_backtranslations.tsv',sep='\t',
                        header=0,index_col=0)

In [83]:
def get_backtrans(guid,language):
    r,b,s_id = guid.split('_')
    if language == 'fr':
        return back_trans_fr.loc[(back_trans_fr['round'] == int(r)) &
                                (back_trans_fr['batch'] == int(b)) &
                                (back_trans_fr['sent_id'] == s_id)].backtranslation.values[0]
    else:
        return back_trans_zh.loc[(back_trans_fr['round'] == int(r)) &
                                (back_trans_fr['batch'] == int(b)) &
                                (back_trans_fr['sent_id'] == s_id)].backtranslation_zh_en.values[0]

In [84]:
get_backtrans('1_0_t0','fr')

'Warmer than normal sea surface temperatures are a key player in the development of hurricanes such as Katrina and Sandstorm Sandy, which hit the east coast of the United States in 2011.'

In [85]:
get_backtrans('1_0_t0','zh')

'Above-normal sea-level temperatures were a key factor in the development of hurricanes such as Hurricane Katrina and Sandy, which hit the US East Coast in 2011.'

## Sentence windows

In [90]:
fnames = os.listdir('../../1_data_scraping/cc_texts')

In [13]:
all_round_data = {r: {} for r in range(1,6)}
for round_no in range(1,6):
    all_round_data[round_no] = pickle.load(open('/Users/yiweiluo/Dropbox/research/QP2/code/Fox_and_friends/\
LIVE_ROUND{}_BATCH_DATA.pkl'.format(round_no),'rb'))
    
data_for_mturk_df = pd.read_pickle('/Users/yiweiluo/Dropbox/research/QP2/code/Fox_and_friends/\
data_for_mturk_2020.pkl')
data_for_mturk_df_old = pd.read_pickle('/Users/yiweiluo/Dropbox/research/QP2/code/Fox_and_friends/\
data_for_mturk.pkl')

In [93]:
from nltk.tokenize import sent_tokenize
from fuzzywuzzy import process

def get_window(guid,window_size):
    r,b,s_id = guid.split('_')
    target_sent = mturk_df.loc[(mturk_df['round'] == int(r)) &
                                (mturk_df['batch'] == int(b)) &
                                (mturk_df['sent_id'] == s_id)].sentence.values[0]

#     print('Target sent:',target_sent)
#     print('Round: {}, batch: {}, sent_id: {}'.format(r,b,s_id))
    rb_df = pd.DataFrame(all_round_data[int(r)][int(b)])
    df_key = rb_df.loc[rb_df.sent_id == s_id].df_key.values[0]
#     print('df key:',df_key)
    
    if int(r) < 5:
        sent_key = data_for_mturk_df_old.loc[df_key].sent_key
    else:
        sent_key = data_for_mturk_df.loc[df_key].sent_key
        
    url = sent_key.split(' of ')[-1].split('://')[-1]
    #print('url:',url)
    
    fname = url.replace('/','[SEP]')
    fname = '{}.txt'.format(fname) if '{}.txt'.format(fname) in fnames else '{}.txt'.format(fname[:90])
    #print('fname:',fname)
    
    if fname in fnames:
        with open(os.path.join('../../1_data_scraping/cc_texts',fname)) as f:
            text = f.readlines()
        if len(text) > 0:
            text = text[0]

            text_sents = sent_tokenize(text)
            sent_with_target = process.extract(target_sent, text_sents, limit=1)
            #print('Found sentence containing target sent:',sent_with_target)
            ix_target_sent = text_sents.index(sent_with_target[0][0])

            w_start = max(0,ix_target_sent-window_size)
            w_end = min(ix_target_sent+window_size,len(text_sents)-1)
            w_left = text_sents[w_start:ix_target_sent]
            w_right = text_sents[ix_target_sent+1:w_end+1]
            #print('Left sentence(s):',w_left)
            #print('Right sentence(s):',w_right)
            BERT_input = '[SEP] '.join(w_left)+' [SEP] [CLS] '+target_sent+' [SEP] '+' [SEP] '.join(w_right)
            if BERT_input[:6] != ' [SEP]':
                #print('Padding beginning with [SEP]...')
                BERT_input = '[SEP] '+BERT_input
                
            return BERT_input
        else:
            print('Fulltext is empty!')
    else:
        print('Fulltext file not found!')

In [94]:
get_window('1_0_t0',1)

'[SEP] “I think it’s very important to remind people the scope of what can happen with the hurricane season.”  Nonetheless, the events surrounding the hurricane, which caused $108 billion of damage, continue to interest to the scientific community. [SEP] [CLS] Warmer-than-normal sea surface temperatures are a key player in the development of hurricanes such as Katrina and superstorm Sandy, which hit the U.S. east coast in 2011. [SEP] “These storms may not have been caused by global warming, but because the ocean’s surface is warmer, it makes the storm more powerful,” Thomas Wagner, cryosphere program manager at NASA headquarters in Washington, D.C. told FoxNews.com.'

In [95]:
get_window('1_0_t0',2)

'[SEP] “We haven’t had a Category 3 hit the U.S. in 10 years – I think there’s a lot of complacency out there,” she said during a panel discussion at an American Meteorological Society conference in June.[SEP] “I think it’s very important to remind people the scope of what can happen with the hurricane season.”  Nonetheless, the events surrounding the hurricane, which caused $108 billion of damage, continue to interest to the scientific community. [SEP] [CLS] Warmer-than-normal sea surface temperatures are a key player in the development of hurricanes such as Katrina and superstorm Sandy, which hit the U.S. east coast in 2011. [SEP] “These storms may not have been caused by global warming, but because the ocean’s surface is warmer, it makes the storm more powerful,” Thomas Wagner, cryosphere program manager at NASA headquarters in Washington, D.C. told FoxNews.com. [SEP] “Then, because sea level is higher, the water can go further inland from the storm surge.”  President Obama briefly 

## SemEval tweets

In [19]:
semeval_test = pd.read_csv('../datasets/StanceDataset/test.csv',header=0,encoding='utf-8',engine='python')
semeval_test = semeval_test[semeval_test['Target'] == 'Climate Change is a Real Concern']

semeval_train = pd.read_csv('../datasets/StanceDataset/train.csv',header=0,encoding='utf-8', engine='python')
semeval_train = semeval_train[semeval_train['Target'] == 'Climate Change is a Real Concern']

len(semeval_test),len(semeval_train)

(169, 395)

In [20]:
semeval_test = semeval_test[['Tweet','Stance']]
semeval_train = semeval_train[['Tweet','Stance']]

In [21]:
tweetstance2label = {'NONE': CLASS_NUMS['neutral'],
                    'FAVOR': CLASS_NUMS['agree'],
                    'AGAINST': CLASS_NUMS['disagree']}

semeval_test['stance'] = semeval_test['Stance'].apply(lambda x: tweetstance2label[x])
semeval_train['stance'] = semeval_train['Stance'].apply(lambda x: tweetstance2label[x])
semeval_df = semeval_test.append(semeval_train,ignore_index=True)

In [22]:
semeval_df.stance.value_counts()

0    335
1    203
2    26 
Name: stance, dtype: int64

## Add additional info: original source media leaning

In [11]:
def get_orig_media_slant(guid):
    r,b,s_id = guid.split('_')
    if int(r) < 5:
        df_ = data_for_mturk_df_old
    else:
        df_ = data_for_mturk_df
    
    b_df_ = pd.DataFrame(all_round_data[int(r)][int(b)])
    df_key = b_df_.loc[b_df_.sent_id == s_id].df_key.values[0]
    
    def str_to_int(s):
        return int(s == 'pro') # 1 for pro, 0 for anti
        
    return str_to_int(df_.iloc[df_key].bias)

In [12]:
int('pro' == 'pro')

1

## Titles, with source media outlet as proxy label

In [412]:
df = pd.read_pickle('../../1_data_scraping/dedup_combined_df.pkl')
df.shape

(44582, 10)

In [667]:
# df.loc[(df.stance=='pro') & 
#       (df.topic=='cc')].domain.value_counts()

In [668]:
# df.loc[(df.stance=='anti') & 
#       (df.topic=='cc')].domain.value_counts()

In [None]:
# Other extreme L candidates:
# grist, inthesetimes, guardian_us (2307 total)

# Other extreme R candidates:
# daily_caller, drudgereport, infowars (1153 total)

In [431]:
CC_KEYWORDS = {'warming','climate','carbon','co2','fossil',
              'temperature','environment','ice','antarctica','sea','seas',
              'IPCC','gore','green'}

def has_keyword(title):
    return len(set(title.lower().split()).intersection(CC_KEYWORDS)) > 0

In [432]:
counted_breitbart_titles = Counter(df.loc[df.domain == 'breitbart'].title.values)
keyword_breitbart_titles = [x for x in counted_breitbart_titles if has_keyword(x)]
len(counted_breitbart_titles),len(keyword_breitbart_titles)

(4037, 1365)

In [434]:
counted_mj_titles = Counter(df.loc[df.domain == 'mj'].title.values)
keyword_mj_titles = [x for x in counted_mj_titles if has_keyword(x)]
len(counted_mj_titles),len(keyword_mj_titles)

(3430, 896)

In [436]:
titles_df = pd.DataFrame({"sentence":keyword_breitbart_titles+keyword_mj_titles,
                                  "stance":['disagrees']*len(keyword_breitbart_titles)+\
                                  ['agrees']*len(keyword_mj_titles)
        })

In [471]:
#titles_df

# Create train/dev/test splits

In [97]:
assert len(mturk_df) == 2042-len(held_out_test)
assert len(dedup_est_labels) == 2042-len(held_out_test)

In [33]:
df_getter = {'raw_mturk': mturk_df,
            'est_mturk': dedup_est_labels,
            'semeval': semeval_df}

print(get_orig_media_slant('1_0_t12'))
print('\n')
print(get_window('1_0_t0',1))
print('\n')
print(get_backtrans('1_0_t12','zh'))

1


[SEP] “I think it’s very important to remind people the scope of what can happen with the hurricane season.”  Nonetheless, the events surrounding the hurricane, which caused $108 billion of damage, continue to interest to the scientific community. [SEP] [CLS] Warmer-than-normal sea surface temperatures are a key player in the development of hurricanes such as Katrina and superstorm Sandy, which hit the U.S. east coast in 2011. [SEP] “These storms may not have been caused by global warming, but because the ocean’s surface is warmer, it makes the storm more powerful,” Thomas Wagner, cryosphere program manager at NASA headquarters in Washington, D.C. told FoxNews.com.


How the Intergovernmental Panel on Climate Change should draw clearer conclusions from the research findings it assesses when assessing the effects of global warming.


In [31]:
df_getter['raw_mturk'].loc[df_getter['raw_mturk'].guid == '1_0_t12']

Unnamed: 0,round,batch,sent_id,stance,sentence,is_high_iaa,guid
4,1,0,t12,neutral,The Intergovernmental Panel on Climate Change should be clearer on how it draws conclusions from the body of research it assesses when gauging the impacts of global warming.,True,1_0_t12


In [288]:
def write_data(name,fold_no,train_df,test_df,dev_df=None,weights=True,do_downsample=False,
              add_titles=False):
    """
    Writes data to a directory containing train.tsv, test.tsv, and optionally dev.tsv.
    :param name: name of directory (type of train/eval data)
    :param desc: list of type str with manipulations made (e.g., downsampled, upsampled, backtrans_fr, window_1)
    :return: None
    """
    
    # Check that train and eval text are deduplicated
    train_guids = set([x.replace('_fr','').replace('_zh','') for x in train_df.guid])
    test_guids = set([x.replace('_fr','').replace('_zh','') for x in test_df.guid])
    assert train_guids.intersection(test_guids) == set()
    print("Train/test text overlap:",set(train_df.sentence).intersection(set(test_df.sentence)))
    if dev_df is not None:
        dev_guids = set([x.replace('_fr','').replace('_zh','') for x in dev_df.guid])
        assert train_guids.intersection(dev_guids) == set()
        print("Train/dev text overlap:",set(train_df.sentence).intersection(set(dev_df.sentence)))
    train_df = train_df[['stance','sentence','max_prob']+STANCES]
    test_df = test_df[['stance','sentence','max_prob']+STANCES]
    if dev_df is not None:
        dev_df = dev_df[['stance','sentence','max_prob']+STANCES]
    
    # Make save_dir
    # Want: ./new_save/datatype_or_name/folds/fold_no/
    # os.path.join(basedir, task, datatype, subdir, str(fold)),
    if do_downsample:
        name += '_downsampled'
    if weights:
        name += '_weights'
    if add_titles:
        name += '_with_titles'
    print(name)
    save_dir = os.path.join('new_save',name,'folds',str(fold_no))
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
        
    # Add titles from Breitbart and MJ--TODO: make outlets a parameter
    if add_titles:
        print('Adding Breitbart and MJ titles to train_df...')
        train_df = train_df.append(titles_df,ignore_index=True)
        
    # Regularize labels
    train_df['reg_stance'] = train_df['stance'].apply(stance_reg)
    test_df['reg_stance'] = test_df['stance'].apply(stance_reg)
    if dev_df is not None:
        dev_df['reg_stance'] = dev_df['stance'].apply(stance_reg) 
        
    # Aggregate examples by stance for downsampling/upsampling needs
    train_df_by_stance = {s: train_df.loc[train_df.reg_stance == i] for i,s in enumerate(STANCES)} 
    test_df_by_stance = {s: test_df.loc[test_df.reg_stance == i] for i,s in enumerate(STANCES)}
    dev_df_by_stance = {s: dev_df.loc[dev_df.reg_stance == i] for i,s in enumerate(STANCES)} if dev_df is not None else None
    
    if do_downsample:
        min_N = min([len(train_df_by_stance[s]) for s in STANCES])
        print('Downsampling to ~{} examples per stance.'.format(min_N))
        for s in STANCES:
            train_df_by_stance[s] = train_df_by_stance[s].loc[np.random.choice(train_df_by_stance[s].index,
                                                                               size=min_N,replace=False)]


    trX = []
    trY = []
    trNLI = []
    trW = []
    for i,s in enumerate(STANCES):
        for _,row in train_df_by_stance[s].iterrows():
            if weights:
                trX.extend([row['sentence']]*3)
                trW.extend([row[x] for x in STANCES])
                trY.extend([x for x in STANCES])
                trNLI.extend([stance2nli[stance_reg(x)] for x in STANCES])
            else:
                trX.append(row['sentence'])
                trY.append(row['stance'])
                trNLI.append(stance2nli[row['reg_stance']])

    teX = []
    teY = []
    teNLI = []
    teW = []
    for i,s in enumerate(STANCES):
        for _,row in test_df_by_stance[s].iterrows():
            teX.append(row['sentence'])
            teY.append(row['stance'])
            teNLI.append(stance2nli[row['reg_stance']])
            if weights:
                teW.append(row['max_prob'])

    if dev_df is not None:
        vaX = []
        vaY = []
        vaNLI = []
        vaW = []
        for i,s in enumerate(STANCES):
            for _,row in dev_df_by_stance[s].iterrows():
                vaX.append(row['sentence'])
                vaY.append(row['stance'])
                vaNLI.append(stance2nli[row['reg_stance']])
                if weights:
                    vaW.append(row['max_prob'])


    if weights:
        train_dat = pd.DataFrame({'sentence':trX,'stance':trY,'nli_label':trNLI,'weight':trW}) 
        test_dat = pd.DataFrame({'sentence':teX,'stance':teY,'nli_label':teNLI,'weight':teW})
        val_dat = pd.DataFrame({'sentence':vaX,'stance':vaY,'nli_label':vaNLI,'weight':vaW}) if dev_df is not None else None
    else:
        train_dat = pd.DataFrame({'sentence':trX,'stance':trY,'nli_label':trNLI})
        test_dat = pd.DataFrame({'sentence':teX,'stance':teY,'nli_label':teNLI})
        val_dat = pd.DataFrame({'sentence':vaX,'stance':vaY,'nli_label':vaNLI}) if dev_df is not None else None
    
    print('Train distribution:')
    print(train_dat.stance.value_counts()) 
    print(train_dat.nli_label.value_counts())
    if dev_df is not None:
        print('\nDev distribution:')
        print(val_dat.stance.value_counts())
        print(val_dat.nli_label.value_counts())
    print('\nTest distribution:')
    print(test_dat.stance.value_counts())
    print(test_dat.stance.value_counts()/np.sum(test_dat.stance.value_counts().values))
    print(test_dat.nli_label.value_counts())

    print('Writing to save_dir:',save_dir)
    train_dat.to_csv(save_dir+'/train.tsv',sep='\t',header=None,index=False)
    if dev_df is not None:
        val_dat.to_csv(save_dir+'/dev.tsv',sep='\t',header=None,index=False)
    test_dat.to_csv(save_dir+'/test.tsv',sep='\t',header=None,index=False)

## Completely held-out, second test set

Drawn from all MTurk labels, balanced over outlet sources and annotator ratings

In [135]:
dedup_est_labels['outlet_stance'] = dedup_est_labels['guid'].apply(get_orig_media_slant)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [136]:
dedup_est_labels.outlet_stance.value_counts()

0    1038
1    1004
Name: outlet_stance, dtype: int64

In [17]:
dedup_est_labels.stance.value_counts()/dedup_est_labels.stance.value_counts().sum()

neutral     0.427032
agree       0.378550
disagree    0.194417
Name: stance, dtype: float64

In [140]:
dedup_est_labels.loc[dedup_est_labels.outlet_stance==1].stance.value_counts()/\
dedup_est_labels.loc[dedup_est_labels.outlet_stance==1].stance.value_counts().sum() # L-wing stance dist.

agree       0.460159
neutral     0.423307
disagree    0.116534
Name: stance, dtype: float64

In [141]:
dedup_est_labels.loc[dedup_est_labels.outlet_stance==0].stance.value_counts()/\
dedup_est_labels.loc[dedup_est_labels.outlet_stance==0].stance.value_counts().sum() # R-wing stance dist.

neutral     0.430636
agree       0.299615
disagree    0.269750
Name: stance, dtype: float64

In [142]:
# Want held-out test set that's 46% agree+L, 42% neutral+L, 12% disagree+L
# 30% agree+R, 43% neutral+R, 27% disagree+R
N_needed = {1: round(dedup_est_labels.loc[dedup_est_labels.outlet_stance==1].stance.value_counts()/\
dedup_est_labels.loc[dedup_est_labels.outlet_stance==1].stance.value_counts().sum()*100),
            0: round(dedup_est_labels.loc[dedup_est_labels.outlet_stance==0].stance.value_counts()/\
dedup_est_labels.loc[dedup_est_labels.outlet_stance==0].stance.value_counts().sum()*100)}
N_needed

{1: agree       46.0
 neutral     42.0
 disagree    12.0
 Name: stance, dtype: float64, 0: neutral     43.0
 agree       30.0
 disagree    27.0
 Name: stance, dtype: float64}

In [143]:
indices_per_outlet_stance = defaultdict(dict)
for outlet_stance in [0,1]:
    for stance in ['agree','neutral','disagree']:
        sub_df = dedup_est_labels.loc[(dedup_est_labels.stance == stance) & 
                                     (dedup_est_labels.outlet_stance == outlet_stance)]
        str_outlet_stance = 'pro' if outlet_stance == 1 else 'anti'
        print(str_outlet_stance,stance,sub_df.shape)
        indices_per_outlet_stance[outlet_stance][stance] = np.random.choice(
            sub_df.index,size=int(N_needed[outlet_stance][stance]),replace=False)

anti agree (311, 12)
anti neutral (447, 12)
anti disagree (280, 12)
pro agree (462, 12)
pro neutral (425, 12)
pro disagree (117, 12)


In [144]:
balanced_test_ix = []
for outlet_stance in indices_per_outlet_stance:
    for stance in STANCES:
        balanced_test_ix.extend(indices_per_outlet_stance[outlet_stance][stance])
len(balanced_test_ix)

200

In [145]:
dedup_est_labels.loc[balanced_test_ix].stance.value_counts()

neutral     85
agree       76
disagree    39
Name: stance, dtype: int64

In [146]:
dedup_est_labels.loc[balanced_test_ix][dedup_est_labels.outlet_stance==1].stance.value_counts()

  """Entry point for launching an IPython kernel.


agree       46
neutral     42
disagree    12
Name: stance, dtype: int64

In [147]:
dedup_est_labels.loc[balanced_test_ix][dedup_est_labels.outlet_stance==0].stance.value_counts()

  """Entry point for launching an IPython kernel.


neutral     43
agree       30
disagree    27
Name: stance, dtype: int64

In [187]:
# Inspect sentences manually to filter out problematic cases
#list(zip(balanced_test_ix,list(dedup_est_labels.loc[balanced_test_ix].sentence.values),dedup_est_labels.loc[balanced_test_ix].stance.values))

In [149]:
problematic = [2006,1797,1331,667,927,177,216,1685,191,826,417,438,412,846,1448,1310,1802,801]

In [186]:
#dedup_est_labels.loc[problematic].sort_values('guid')

In [42]:
#mturk_df.loc[mturk_df.guid.isin(dedup_est_labels.loc[problematic].guid.values)]

In [151]:
dedup_est_labels.loc[problematic].loc[
    dedup_est_labels.loc[problematic].outlet_stance == 0].stance.value_counts()
# Need 9 more neutrals, 2 agrees, 1 disagree from R-wing outlets

neutral     9
agree       2
disagree    1
Name: stance, dtype: int64

In [152]:
dedup_est_labels.loc[problematic].loc[
    dedup_est_labels.loc[problematic].outlet_stance == 1].stance.value_counts()
# Need 3 more neutrals, 2 agree, and 1 disagree from L-wing outlets

neutral     3
agree       2
disagree    1
Name: stance, dtype: int64

In [153]:
N_needed_per_outlet_stance = {0: dedup_est_labels.loc[problematic].loc[
    dedup_est_labels.loc[problematic].outlet_stance == 0].stance.value_counts(),
                             1: dedup_est_labels.loc[problematic].loc[
    dedup_est_labels.loc[problematic].outlet_stance == 1].stance.value_counts()}

In [156]:
N_needed_per_outlet_stance[0]['neutral']

9

In [157]:
new_indices_per_outlet_stance = defaultdict(dict)
for outlet_stance in [0,1]:
    for stance in ['agree','neutral','disagree']:
        sub_df = dedup_est_labels.loc[(dedup_est_labels.stance == stance) & 
                                     (dedup_est_labels.outlet_stance == outlet_stance) &
                                     (~dedup_est_labels.index.isin(
                                         indices_per_outlet_stance[outlet_stance][stance]))]
        str_outlet_stance = 'pro' if outlet_stance == 1 else 'anti'
        print(str_outlet_stance,stance,sub_df.shape)
        new_indices_per_outlet_stance[outlet_stance][stance] = np.random.choice(
            sub_df.index,size=N_needed_per_outlet_stance[outlet_stance][stance],replace=False)

anti agree (281, 12)
anti neutral (404, 12)
anti disagree (253, 12)
pro agree (416, 12)
pro neutral (383, 12)
pro disagree (105, 12)


In [158]:
for outlet_stance in [0,1]:
    for stance in STANCES:
        print(outlet_stance,stance,len(new_indices_per_outlet_stance[outlet_stance][stance]))

0 agree 2
0 neutral 9
0 disagree 1
1 agree 2
1 neutral 3
1 disagree 1


In [159]:
new_balanced_test_ix = []
for outlet_stance in new_indices_per_outlet_stance:
    for stance in STANCES:
        new_balanced_test_ix.extend([x for x in 
                                 new_indices_per_outlet_stance[outlet_stance][stance] 
                                if x not in problematic])
len(new_balanced_test_ix)

18

In [160]:
dedup_est_labels.loc[new_balanced_test_ix].outlet_stance.value_counts()

0    12
1    6 
Name: outlet_stance, dtype: int64

In [161]:
dedup_est_labels.loc[new_balanced_test_ix][
    dedup_est_labels.loc[new_balanced_test_ix]['outlet_stance'] == 0].stance.value_counts()

neutral     9
agree       2
disagree    1
Name: stance, dtype: int64

In [162]:
dedup_est_labels.loc[new_balanced_test_ix][
    dedup_est_labels.loc[new_balanced_test_ix]['outlet_stance'] == 1].stance.value_counts()

neutral     3
agree       2
disagree    1
Name: stance, dtype: int64

In [188]:
# Re-nspect sentences manually to filter out problematic cases
#list(zip(new_balanced_test_ix,list(dedup_est_labels.loc[new_balanced_test_ix].sentence.values),dedup_est_labels.loc[new_balanced_test_ix].stance.values))

In [164]:
problematic_2 = [158,677,1165,1547]

In [None]:
# Add everything that's not problematic to base list

In [165]:
non_problematic_balanced_test_ix = []
for outlet_stance in indices_per_outlet_stance:
    for stance in STANCES:
        non_problematic_balanced_test_ix.extend([x for x in indices_per_outlet_stance[outlet_stance][stance]
                                if x not in problematic])
        non_problematic_balanced_test_ix.extend([x for x in new_indices_per_outlet_stance[outlet_stance][stance]
                                if x not in problematic_2])
len(non_problematic_balanced_test_ix)

196

In [166]:
dedup_est_labels.loc[non_problematic_balanced_test_ix].outlet_stance.value_counts()

1    100
0    96 
Name: outlet_stance, dtype: int64

In [167]:
dedup_est_labels.loc[non_problematic_balanced_test_ix][
    dedup_est_labels.loc[non_problematic_balanced_test_ix].outlet_stance == 0
].stance.value_counts()

neutral     39
agree       30
disagree    27
Name: stance, dtype: int64

In [168]:
dedup_est_labels.loc[non_problematic_balanced_test_ix][
    dedup_est_labels.loc[non_problematic_balanced_test_ix].outlet_stance == 1
].stance.value_counts()

agree       46
neutral     42
disagree    12
Name: stance, dtype: int64

In [169]:
dedup_est_labels.loc[problematic_2].loc[
    dedup_est_labels.loc[problematic_2].outlet_stance == 0].stance.value_counts()
# Need 4 more neutrals from R-wing outlets

neutral    4
Name: stance, dtype: int64

In [170]:
dedup_est_labels.loc[problematic_2].loc[
    dedup_est_labels.loc[problematic_2].outlet_stance == 1].stance.value_counts()

Series([], Name: stance, dtype: int64)

In [171]:
new_N_needed_per_outlet_stance = {0: {
    "agree": 0, "disagree": 0, "neutral": 4
},
                             1: {
                                 "agree": 0, "disagree": 0, "neutral": 0
                             }}

In [172]:
# New random sample to bring up numbers
new_indices_per_outlet_stance_2 = defaultdict(dict)
for outlet_stance in [0,1]:
    for stance in ['agree','neutral','disagree']:
        sub_df = dedup_est_labels.loc[(dedup_est_labels.stance == stance) & 
                                     (dedup_est_labels.outlet_stance == outlet_stance) &
                                     (~dedup_est_labels.index.isin(
                                         indices_per_outlet_stance[outlet_stance][stance])) & 
                                     (~dedup_est_labels.index.isin(
                                     new_indices_per_outlet_stance[outlet_stance][stance]))]
        str_outlet_stance = 'pro' if outlet_stance == 1 else 'anti'
        print(str_outlet_stance,stance,sub_df.shape)
        new_indices_per_outlet_stance_2[outlet_stance][stance] = np.random.choice(
            sub_df.index,size=new_N_needed_per_outlet_stance[outlet_stance][stance],
            replace=False)

anti agree (279, 12)
anti neutral (395, 12)
anti disagree (252, 12)
pro agree (414, 12)
pro neutral (380, 12)
pro disagree (104, 12)


In [173]:
for outlet_stance in new_indices_per_outlet_stance_2:
    for stance in STANCES:
        print(outlet_stance,stance,len(new_indices_per_outlet_stance_2[outlet_stance][stance]))

0 agree 0
0 neutral 4
0 disagree 0
1 agree 0
1 neutral 0
1 disagree 0


In [174]:
balanced_test_ix_2 = []
for outlet_stance in new_indices_per_outlet_stance_2:
    for stance in STANCES:
        balanced_test_ix_2.extend(new_indices_per_outlet_stance_2[outlet_stance][stance])
len(balanced_test_ix_2)

4

In [175]:
# Inspect sentences manually to filter out problematic cases
list(zip(balanced_test_ix_2,
         list(dedup_est_labels.loc[balanced_test_ix_2].sentence.values),
         dedup_est_labels.loc[balanced_test_ix_2].stance.values))

[(1708,
  "But if we ground every plane, leveled every building in America, and lived in huts, the United States' amount of carbon emissions only accounts for 15 percent of the entire world.",
  'neutral'),
 (1194, 'People would never accept carbon rationing.', 'neutral'),
 (1977,
  '224 of the 386 "climate change contrarians" quoted by the media have at least one publication in peer-reviewed scientific journals.',
  'neutral'),
 (862,
  'The fossil fuel industry may be emitting twice as much methane as previously thought.',
  'neutral')]

In [178]:
problematic_3 = []

In [179]:
for outlet_stance in indices_per_outlet_stance:
    for stance in STANCES:
        non_problematic_balanced_test_ix.extend([x for x in new_indices_per_outlet_stance_2[outlet_stance][stance]
                                if x not in problematic_3])
len(non_problematic_balanced_test_ix)

203

In [182]:
non_problematic_balanced_test_ix = np.unique(non_problematic_balanced_test_ix)

In [185]:
dedup_est_labels.loc[non_problematic_balanced_test_ix].outlet_stance.value_counts()

1    100
0    100
Name: outlet_stance, dtype: int64

In [183]:
dedup_est_labels.loc[non_problematic_balanced_test_ix][
    dedup_est_labels.loc[non_problematic_balanced_test_ix].outlet_stance == 0
].stance.value_counts()

neutral     43
agree       30
disagree    27
Name: stance, dtype: int64

In [184]:
dedup_est_labels.loc[non_problematic_balanced_test_ix][
    dedup_est_labels.loc[non_problematic_balanced_test_ix].outlet_stance == 1
].stance.value_counts()

agree       46
neutral     42
disagree    12
Name: stance, dtype: int64

In [189]:
dedup_est_labels.loc[non_problematic_balanced_test_ix].to_pickle('./save/held_out_balanced_test.pkl')

In [190]:
dedup_est_labels.loc[non_problematic_balanced_test_ix].to_csv('./save/held_out_balanced_test.tsv',
                                                             sep='\t',header=True,index=False)

## All SemEval tweets as eval

In [66]:
semeval_df['nli_label'] = semeval_df['stance'].apply(lambda x: stance2nli[x])

In [68]:
os.mkdir('./save/semeval_test')
semeval_df[['Tweet','stance','nli_label']].to_csv('./save/semeval_test'+'/test.tsv',sep='\t',header=None,index=False)

## SemEval as train, dev, and test

In [77]:
semeval_df['nli_label'] = semeval_df['stance'].apply(lambda x: stance2nli[x])
semeval_df['sentence'] = semeval_df['Tweet']
train_ix,eval_ix = train_test_split(list(semeval_df.index),test_size=0.3,random_state=42)
dev_ix,test_ix = train_test_split(eval_ix,test_size=0.55,random_state=42)
len(train_ix),len(dev_ix),len(test_ix)

(394, 76, 94)

In [78]:
train_df = semeval_df.loc[semeval_df.index.isin(train_ix)]
dev_df = semeval_df.loc[semeval_df.index.isin(dev_ix)]
test_df = semeval_df.loc[semeval_df.index.isin(test_ix)]
train_df.shape,dev_df.shape,test_df.shape

((394, 5), (76, 5), (94, 5))

In [445]:
#write_data('semeval_train_eval',42,[],train_df,test_df,dev_df)

## Cross-val splits (test on item-response est. label)

In [242]:
seed = 42
np.random.seed(seed)

In [243]:
order = np.arange(len(mturk_df))
np.random.shuffle(order)
len(order)

1842

In [244]:
indices_per_fold = {}
n_folds = 10
for f in range(n_folds):
    test_indices = [order[i] for i in np.arange(len(mturk_df)) if i % n_folds == f]
    nontest_indices = list(set(np.arange(len(mturk_df))) - set(test_indices))
    dev_indices = list(np.random.choice(nontest_indices, size=len(test_indices), replace=False))
    train_indices = list(set(nontest_indices) - set(dev_indices))
    all_indices = set(test_indices).union(set(dev_indices)).union(set(train_indices))
    indices_per_fold[f] = {'train':train_indices,'dev':dev_indices,'test':test_indices}
    print(len(all_indices), len(test_indices) + len(dev_indices) + len(train_indices), len(test_indices), len(dev_indices), len(train_indices))

1842 1842 185 185 1472
1842 1842 185 185 1472
1842 1842 184 184 1474
1842 1842 184 184 1474
1842 1842 184 184 1474
1842 1842 184 184 1474
1842 1842 184 184 1474
1842 1842 184 184 1474
1842 1842 184 184 1474
1842 1842 184 184 1474


In [245]:
pickle.dump(indices_per_fold,open('cross_val_10_seed_42_indices.pkl','wb'))

In [246]:
indices_per_fold = pickle.load(open('cross_val_10_seed_42_indices.pkl','rb'))

### Title-augmented train data

In [728]:
# for f in range(n_folds):
#     fold_ix = indices_per_fold[f]
#     train_ix = fold_ix['train']
#     test_ix = fold_ix['test']
#     dev_ix = fold_ix['dev']
    
#     train_df = mturk_df.loc[mturk_df.index.isin(train_ix)]
#     dev_df = mturk_df.loc[mturk_df.index.isin(dev_ix)]
#     test_df = mturk_df.loc[mturk_df.index.isin(test_ix)]
#     print(train_df.shape,dev_df.shape,test_df.shape)
#     write_data('all_mturk_with_titles_train_{}_fold_{}'.format(42,f),[],train_df,test_df,dev_df,
#            add_titles=False)

Oops, accidentally re-wrote with vanilla (non-title-augmented) data splits.

### Vanilla MTurk (est. labels)

In [247]:
dedup_est_labels.index

RangeIndex(start=0, stop=1842, step=1)

In [289]:
for f in range(0,n_folds):
    fold_0_ix = indices_per_fold[f]
    train_ix = fold_0_ix['train']
    test_ix = fold_0_ix['test']
    dev_ix = fold_0_ix['dev']

    train_df = dedup_est_labels.loc[dedup_est_labels.index.isin(train_ix)]
    dev_df = dedup_est_labels.loc[dedup_est_labels.index.isin(dev_ix)]
    test_df = dedup_est_labels.loc[dedup_est_labels.index.isin(test_ix)]
    print(train_df.shape,dev_df.shape,test_df.shape)
    write_data('all_mturk_train_{}'.format(seed),f,train_df,test_df,dev_df)

(1472, 11) (185, 11) (185, 11)
Train/test text overlap: set()
Train/dev text overlap: set()
all_mturk_train_42_weights


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Train distribution:
disagree    1472
agree       1472
neutral     1472
Name: stance, dtype: int64
contradiction    1472
neutral          1472
entailment       1472
Name: nli_label, dtype: int64

Dev distribution:
neutral     83
agree       64
disagree    38
Name: stance, dtype: int64
neutral          83
entailment       64
contradiction    38
Name: nli_label, dtype: int64

Test distribution:
neutral     85
agree       75
disagree    25
Name: stance, dtype: int64
neutral     0.459459
agree       0.405405
disagree    0.135135
Name: stance, dtype: float64
neutral          85
entailment       75
contradiction    25
Name: nli_label, dtype: int64
Writing to save_dir: new_save/all_mturk_train_42_weights/folds/0
(1472, 11) (185, 11) (185, 11)
Train/test text overlap: set()
Train/dev text overlap: set()
all_mturk_train_42_weights
Train distribution:
disagree    1472
agree       1472
neutral     1472
Name: stance, dtype: int64
contradiction    1472
neutral          1472
entailment       1472
Nam

In [746]:
pd.read_csv('./save/all_mturk_train_42_fold_0/train.tsv',sep='\t',header=None)[1].value_counts()

1    635
0    568
2    267
Name: 1, dtype: int64

In [750]:
pd.read_csv('./save/all_mturk_train_backtrans_fr_42_fold_0/train.tsv',sep='\t',header=None)[1].value_counts()

1    1270
0    1136
2    534 
Name: 1, dtype: int64

In [751]:
pd.read_csv('./save/all_mturk_train_backtrans_fr_upsampled_42_fold_0/train.tsv',sep='\t',
            header=None)[1].value_counts()

1    635
0    568
2    534
Name: 1, dtype: int64

In [410]:
677+658+626

1961

In [752]:
pd.read_csv('./save/all_mturk_train_backtrans_fr_42_fold_0_downsampled/train.tsv',sep='\t',
            header=None)[1].value_counts()

2    534
1    534
0    534
Name: 1, dtype: int64

#### Back translation augmented train, with and without downsampling

In [108]:
#Add backtranslations of the train_ix examples to training
# for f in range(n_folds):
#     fold_0_ix = indices_per_fold[f]
#     train_ix = fold_0_ix['train']
#     test_ix = fold_0_ix['test']
#     dev_ix = fold_0_ix['dev']

#     train_df = dedup_est_labels.loc[dedup_est_labels.index.isin(train_ix)]
#     dev_df = dedup_est_labels.loc[dedup_est_labels.index.isin(dev_ix)]
#     test_df = dedup_est_labels.loc[dedup_est_labels.index.isin(test_ix)]
#     backtrans_fr_df = add_backtrans_train(train_df,'fr')
#     backtrans_zh_df = add_backtrans_train(train_df,'zh')
#     backtrans_both_df = backtrans_fr_df.append(backtrans_zh_df,ignore_index=True).drop_duplicates('guid',keep='first')
#     print(backtrans_fr_df.shape,backtrans_zh_df.shape,backtrans_both_df.shape,dev_df.shape,test_df.shape)
#     write_data('all_mturk_train_backtrans_fr_{}_fold_{}'.format(seed,f),[],backtrans_fr_df,test_df,dev_df)
#     write_data('all_mturk_train_backtrans_zh_{}_fold_{}'.format(seed,f),[],backtrans_zh_df,test_df,dev_df)
#     write_data('all_mturk_train_backtrans_both_{}_fold_{}'.format(seed,f),[],backtrans_both_df,test_df,dev_df)
#     write_data('all_mturk_train_backtrans_fr_{}_fold_{}'.format(seed,f),[],backtrans_fr_df,test_df,dev_df,do_downsample=True)
#     write_data('all_mturk_train_backtrans_zh_{}_fold_{}'.format(seed,f),[],backtrans_zh_df,test_df,dev_df,do_downsample=True)
#     write_data('all_mturk_train_backtrans_both_{}_fold_{}'.format(seed,f),[],backtrans_both_df,test_df,dev_df,do_downsample=True)

#### Back translation + upsample minority class

In [109]:
# for f in range(n_folds):
#     fold_0_ix = indices_per_fold[f]
#     train_ix = fold_0_ix['train']
#     test_ix = fold_0_ix['test']
#     dev_ix = fold_0_ix['dev']

#     train_df = dedup_est_labels.loc[dedup_est_labels.index.isin(train_ix)]
#     dev_df = dedup_est_labels.loc[dedup_est_labels.index.isin(dev_ix)]
#     test_df = dedup_est_labels.loc[dedup_est_labels.index.isin(test_ix)]
#     backtrans_fr_df = add_backtrans_train(train_df,'fr',upsample=True)
#     backtrans_zh_df = add_backtrans_train(train_df,'zh',upsample=True)
#     backtrans_both_df = backtrans_fr_df.append(backtrans_zh_df,ignore_index=True).drop_duplicates('guid',keep='first')
#     print(backtrans_fr_df.shape,backtrans_zh_df.shape,backtrans_both_df.shape,dev_df.shape,test_df.shape)
#     write_data('all_mturk_train_backtrans_fr_upsampled_{}_fold_{}'.format(seed,f),[],backtrans_fr_df,test_df,dev_df)
#     write_data('all_mturk_train_backtrans_zh_upsampled_{}_fold_{}'.format(seed,f),[],backtrans_zh_df,test_df,dev_df)
#     write_data('all_mturk_train_backtrans_both_upsampled_{}_fold_{}'.format(seed,f),[],backtrans_both_df,test_df,dev_df)

### High IAA train, eval on rest (splits differ only in dev/test distribution)

In [246]:
train_indices = mturk_df.loc[mturk_df.is_high_iaa].index

In [270]:
indices_per_fold = {}
n_folds = 10
for f in range(n_folds):
    all_indices = list(low_iaa_df.index)
    test_indices = list(np.random.choice(all_indices, size=round(len(all_indices)/2), replace=False))
    dev_indices = list(set(all_indices) - set(test_indices))
    all_indices = set(test_indices).union(set(dev_indices))#.union(set(train_indices))
    indices_per_fold[f] = {'train':train_indices,'dev':dev_indices,'test':test_indices}
    print(len(all_indices), len(test_indices) + len(dev_indices) + len(train_indices), len(test_indices), len(dev_indices), len(train_indices))

923 2042 462 461 1119
923 2042 462 461 1119
923 2042 462 461 1119
923 2042 462 461 1119
923 2042 462 461 1119
923 2042 462 461 1119
923 2042 462 461 1119
923 2042 462 461 1119
923 2042 462 461 1119
923 2042 462 461 1119


In [271]:
pickle.dump(indices_per_fold,open('high_iaa_cross_val_10_seed_42_high_iaa_indices.pkl','wb'))

In [273]:
# for f in range(0,n_folds):
#     fold_0_ix = indices_per_fold[f]
#     train_ix = fold_0_ix['train']
#     test_ix = fold_0_ix['test']
#     dev_ix = fold_0_ix['dev']

#     train_df = mturk_df.loc[mturk_df.index.isin(train_ix)]
#     dev_df = mturk_df.loc[mturk_df.index.isin(dev_ix)]
#     test_df = mturk_df.loc[mturk_df.index.isin(test_ix)]
#     print(train_df.shape,dev_df.shape,test_df.shape)
#     write_data('high_iaa_train_fold_{}'.format(f),42,[],train_df,test_df,dev_df)

#### Augment w/ back translations, with and without downsampling

In [276]:
# for f in range(n_folds):
#     fold_0_ix = indices_per_fold[f]
#     train_ix = fold_0_ix['train']
#     test_ix = fold_0_ix['test']
#     dev_ix = fold_0_ix['dev']

#     train_df = mturk_df.loc[mturk_df.index.isin(train_ix)]
#     dev_df = mturk_df.loc[mturk_df.index.isin(dev_ix)]
#     test_df = mturk_df.loc[mturk_df.index.isin(test_ix)]
#     print(train_df.shape,dev_df.shape,test_df.shape)
#     backtrans_fr_df = add_backtrans_train(train_df,'fr')
#     backtrans_zh_df = add_backtrans_train(train_df,'zh')
#     backtrans_both_df = backtrans_fr_df.append(backtrans_zh_df,ignore_index=True).drop_duplicates('guid',keep='first')
#     print(backtrans_fr_df.shape,backtrans_zh_df.shape,backtrans_both_df.shape,dev_df.shape,test_df.shape)
#     write_data('high_iaa_train_fold_{}'.format(f),seed,['backtrans_fr'],backtrans_fr_df,test_df,dev_df)
#     write_data('high_iaa_train_fold_{}'.format(f),seed,['backtrans_zh'],backtrans_zh_df,test_df,dev_df)
#     write_data('high_iaa_train_fold_{}'.format(f),seed,['backtrans_both'],backtrans_both_df,test_df,dev_df)
#     write_data('high_iaa_train_fold_{}'.format(f),seed,['backtrans_fr'],backtrans_fr_df,test_df,dev_df,do_downsample=True)
#     write_data('high_iaa_train_fold_{}'.format(f),seed,['backtrans_zh'],backtrans_zh_df,test_df,dev_df,do_downsample=True)
#     write_data('high_iaa_train_fold_{}'.format(f),seed,['backtrans_both'],backtrans_both_df,test_df,dev_df,do_downsample=True)

#### Back translation + upsample minority class

In [279]:
# for f in range(n_folds):
#     fold_0_ix = indices_per_fold[f]
#     train_ix = fold_0_ix['train']
#     test_ix = fold_0_ix['test']
#     dev_ix = fold_0_ix['dev']

#     train_df = mturk_df.loc[mturk_df.index.isin(train_ix)]
#     dev_df = mturk_df.loc[mturk_df.index.isin(dev_ix)]
#     test_df = mturk_df.loc[mturk_df.index.isin(test_ix)]
#     print(train_df.shape,dev_df.shape,test_df.shape)
#     backtrans_fr_df = add_backtrans_train(train_df,'fr',upsample=True)
#     backtrans_zh_df = add_backtrans_train(train_df,'zh',upsample=True)
#     backtrans_both_df = backtrans_fr_df.append(backtrans_zh_df,ignore_index=True).drop_duplicates('guid',keep='first')
#     print(backtrans_fr_df.shape,backtrans_zh_df.shape,backtrans_both_df.shape,dev_df.shape,test_df.shape)
#     write_data('high_iaa_train_fold_{}'.format(f),seed,['backtrans_fr_upsampled'],backtrans_fr_df,test_df,dev_df)
#     write_data('high_iaa_train_fold_{}'.format(f),seed,['backtrans_zh_upsampled'],backtrans_zh_df,test_df,dev_df)
#     write_data('high_iaa_train_fold_{}'.format(f),seed,['backtrans_both_upsampled'],backtrans_both_df,test_df,dev_df)

# SCP to cluster

In [290]:
from paramiko import SSHClient
from scp import SCPClient

ssh = SSHClient()
ssh.load_system_host_keys()
ssh.connect(hostname='jacob.stanford.edu',username='yiweil',password='yldwuaeo2699zhishao15')

# Define progress callback that prints the current percentage completed for the file
def progress(filename, size, sent):
    print("%s\'s progress: %.2f%%   \r" % (filename, float(sent)/float(size)*100) )

In [291]:
local_data_type = 'all_mturk_train_42_weights'

In [295]:
cluster_data_dir = os.path.join('/u/scr/yiweil/sci_debates/cc_stance/climate_data/climate-weight',
                                local_data_type,'folds')
local_data_base_dir = './new_save'

# SCPCLient takes a paramiko transport and progress callback as its arguments.
scp = SCPClient(ssh.get_transport(), progress=progress)

In [296]:
glob.glob(os.path.join(local_data_base_dir,local_data_type,'folds/*'))

['./new_save/all_mturk_train_42_weights/folds/0',
 './new_save/all_mturk_train_42_weights/folds/1',
 './new_save/all_mturk_train_42_weights/folds/2',
 './new_save/all_mturk_train_42_weights/folds/3',
 './new_save/all_mturk_train_42_weights/folds/4',
 './new_save/all_mturk_train_42_weights/folds/5',
 './new_save/all_mturk_train_42_weights/folds/6',
 './new_save/all_mturk_train_42_weights/folds/7',
 './new_save/all_mturk_train_42_weights/folds/8',
 './new_save/all_mturk_train_42_weights/folds/9']

In [297]:
# for file in glob.glob(local_data_dir+'/high_iaa_train_42*'):
#     scp.put(file, recursive=True, remote_path=cluster_data_dir)
    
for file in glob.glob(os.path.join(local_data_base_dir,local_data_type,'folds/*')):
    scp.put(file, recursive=True, remote_path=cluster_data_dir)

scp.close()

b'dev.tsv''s progress: 0.00%   
b'dev.tsv''s progress: 58.18%   
b'dev.tsv''s progress: 100.00%   
b'test.tsv''s progress: 0.00%   
b'test.tsv''s progress: 56.39%   
b'test.tsv''s progress: 100.00%   
b'train.tsv''s progress: 0.00%   
b'train.tsv''s progress: 2.41%   
b'train.tsv''s progress: 4.81%   
b'train.tsv''s progress: 7.22%   
b'train.tsv''s progress: 9.62%   
b'train.tsv''s progress: 12.03%   
b'train.tsv''s progress: 14.44%   
b'train.tsv''s progress: 16.84%   
b'train.tsv''s progress: 19.25%   
b'train.tsv''s progress: 21.65%   
b'train.tsv''s progress: 24.06%   
b'train.tsv''s progress: 26.47%   
b'train.tsv''s progress: 28.87%   
b'train.tsv''s progress: 31.28%   
b'train.tsv''s progress: 33.68%   
b'train.tsv''s progress: 36.09%   
b'train.tsv''s progress: 38.49%   
b'train.tsv''s progress: 40.90%   
b'train.tsv''s progress: 43.31%   
b'train.tsv''s progress: 45.71%   
b'train.tsv''s progress: 48.12%   
b'train.tsv''s progress: 50.52%   
b'train.tsv''s progress: 52.93%   

b'train.tsv''s progress: 84.45%   
b'train.tsv''s progress: 86.87%   
b'train.tsv''s progress: 89.28%   
b'train.tsv''s progress: 91.69%   
b'train.tsv''s progress: 94.11%   
b'train.tsv''s progress: 96.52%   
b'train.tsv''s progress: 98.93%   
b'train.tsv''s progress: 100.00%   
b'dev.tsv''s progress: 0.00%   
b'dev.tsv''s progress: 59.92%   
b'dev.tsv''s progress: 100.00%   
b'test.tsv''s progress: 0.00%   
b'test.tsv''s progress: 57.89%   
b'test.tsv''s progress: 100.00%   
b'train.tsv''s progress: 0.00%   
b'train.tsv''s progress: 2.39%   
b'train.tsv''s progress: 4.78%   
b'train.tsv''s progress: 7.17%   
b'train.tsv''s progress: 9.56%   
b'train.tsv''s progress: 11.94%   
b'train.tsv''s progress: 14.33%   
b'train.tsv''s progress: 16.72%   
b'train.tsv''s progress: 19.11%   
b'train.tsv''s progress: 21.50%   
b'train.tsv''s progress: 23.89%   
b'train.tsv''s progress: 26.28%   
b'train.tsv''s progress: 28.67%   
b'train.tsv''s progress: 31.05%   
b'train.tsv''s progress: 33.44%  

b'train.tsv''s progress: 84.10%   
b'train.tsv''s progress: 86.50%   
b'train.tsv''s progress: 88.90%   
b'train.tsv''s progress: 91.30%   
b'train.tsv''s progress: 93.71%   
b'train.tsv''s progress: 96.11%   
b'train.tsv''s progress: 98.51%   
b'train.tsv''s progress: 100.00%   
