In [1]:
import numpy as np
import pandas as pd

In [2]:
%store -r HUMAN

In [5]:
if HUMAN:
    diff = pd.read_csv('./HumanObserved-Dataset/HumanObserved-Features-Data/diffn_pairs.csv')
    same = pd.read_csv('./HumanObserved-Dataset/HumanObserved-Features-Data/same_pairs.csv')
else:
    diff = pd.read_csv('./GSC-Dataset/GSC-Features-Data/diffn_pairs.csv')
    same = pd.read_csv('./GSC-Dataset/GSC-Features-Data/same_pairs.csv')

In [6]:
same['writer_A'] = same.img_id_A.str.extract(r'^([0-9]+)',expand=False)
same['writer_B'] = same.img_id_B.str.extract(r'^([0-9]+)',expand=False)

diff['writer_A'] = diff.img_id_A.str.extract(r'^([0-9]+)',expand=False)
diff['writer_B'] = diff.img_id_B.str.extract(r'^([0-9]+)',expand=False)

In [7]:
test_writers = set(same.writer_A.value_counts()[:5].index)

In [8]:
def get_test_idx(df, test_writers):
    return df.img_id_A.str[:-1].isin(test_writers) |\
           df.img_id_B.str[:-1].isin(test_writers)

In [9]:
same_test = same[get_test_idx(same, test_writers)].copy()
same.drop(get_test_idx(same, test_writers),inplace=True)

In [10]:
diff_test = diff[get_test_idx(diff, test_writers)].copy()
diff.drop(get_test_idx(diff, test_writers), inplace=True)

In [11]:
full = pd.concat([same, diff])
full_test = pd.concat([same_test, diff_test])

### Train test separation

![Img](https://d1b10bmlvqabco.cloudfront.net/attach/jlbcueow34qdq/isamd3soc56z/jnj25zqee36h/Screen_Shot_20181021_at_11.59.46_AM.png)

### Scheme 1 Unseen writers

for now any pair with test writer is considered a test pair    
    for more check [link](https://piazza.com/class/jlbcueow34qdq?cid=338)

In [12]:
def partition_unseen(df, train_writers=0.6):
    A = list(set(df.writer_A))
    np.random.shuffle(A)

    wr_train = set(A[:int(len(A)*train_writers)])
    wr_test = set(A[int(len(A)*train_writers):])

    df['test'] = (df.writer_A.isin(wr_test) | df.writer_B.isin(wr_test)).astype('uint8')

    df_tr = df[df.test == 0].drop(['test'],axis=1)
    df_ts = df[df.test == 1].drop(['test'],axis=1)
    df.drop('test',axis=1,inplace=True)
    print(f"target distribution in first: {df_tr.target.mean():.5f}")
    print(f"target distribution in second: {df_ts.target.mean():.5f}")
    return df_tr, df_ts

In [13]:
u_tr, u_v = partition_unseen(full, 0.6)

target distribution in first: 0.00454
target distribution in second: 0.00171


In [14]:
folder = 'human/' if HUMAN else 'gsc/'

In [15]:
full_test.to_csv('./data/'+folder+'test.csv')

In [None]:
u_tr.to_csv('./data/' + folder + 'unseen_train.csv')
u_v.to_csv('./data/' + folder + 'unseen_valid.csv')

### Scheme 2 Shuffled

In [None]:
def partition_shuffled(df, train_writers=0.8):
    shuffled = df.sample(frac=1)
    breakpoint = int(train_writers * df.shape[0])
    df_tr = shuffled.iloc[:breakpoint]
    df_ts = shuffled.iloc[breakpoint:]
    print(f"target distribution in train: {df_tr.target.mean():.5f}")
    print(f"target distribution in test: {df_ts.target.mean():.5f}")
    return df_tr, df_ts

In [None]:
r_tr, r_v = partition_shuffled(full, 0.6)

r_tr.to_csv('./data/' + folder + 'random_train.csv')
r_v.to_csv('./data/' + folder + 'random_valid.csv')

### Scheme 3 Seen

once again not completely clear how to treat pairs of users    
let's see how many duplicated pairs we have

In [None]:
full[['writer_A','writer_B']].duplicated().mean()

that's enought to partitiob by pairs

In [None]:
def partition_seen(df, train_writers=0.8):
    df_s = df.sort_values(by=['writer_A','writer_B'])
    pairs = (df_s.writer_A + '_').str.cat(df_s.writer_B).reset_index().drop(['index'],axis=1)
    pairs.columns = ['pair']
    pairs['idx'] = pairs.groupby(by='pair').cumcount()
    pairs = pairs.merge(
    (pairs.groupby('pair')['idx'].count() * train_writers).reset_index(),
    how='left',
    on='pair', suffixes=('_cum','_thr')
    )
    df_tr = df_s[(pairs.idx_cum <= pairs.idx_thr).values]
    df_ts = df_s[(pairs.idx_cum > pairs.idx_thr).values]
    
    print(f"first size: {df_tr.shape[0] / df.shape[0]}")
    print(f"second size: {df_ts.shape[0] / df.shape[0]}")
    
    print(f"first distribution in train: {df_tr.target.mean():.5f}")
    print(f"second distribution in test: {df_ts.target.mean():.5f}")
    return df_tr, df_ts

In [None]:
s_tr, s_v = partition_seen(full, train_writers=0.6)

In [None]:
s_tr.to_csv('./data/' + folder + 'seen_train.csv')
s_v.to_csv('./data/' + folder + 'seen_valid.csv')