In [1]:
import numpy as np
import pandas as pd

In [2]:
diff = pd.read_csv('./HumanObserved-Dataset/HumanObserved-Features-Data/diffn_pairs.csv')
same = pd.read_csv('./HumanObserved-Dataset/HumanObserved-Features-Data/same_pairs.csv')
features = pd.read_csv('./HumanObserved-Dataset/HumanObserved-Features-Data/HumanObserved-Features-Data.csv', index_col=0)

In [3]:
same.head()

Unnamed: 0,img_id_A,img_id_B,target
0,0359a,0359b,1
1,0577a,0577b,1
2,0577a,0577c,1
3,1120a,1120b,1
4,1120a,1120c,1


In [4]:
test_writers = set(same.img_id_A.str[:-1].value_counts()[:5].index)

In [5]:
def get_test_idx(df, test_writers):
    return df.img_id_A.str[:-1].isin(test_writers) |\
           df.img_id_B.str[:-1].isin(test_writers)

In [6]:
same_test = same[get_test_idx(same, test_writers)].copy()
same.drop(get_test_idx(same, test_writers),inplace=True)

In [7]:
diff_test = diff[get_test_idx(diff, test_writers)].copy()
diff.drop(get_test_idx(diff, test_writers), inplace=True)

In [8]:
def concat_features(pairs, features):
    columns = ['img_id_A', 'img_id_B', 'target'] +\
                [f"f{i}_A" for i in range(1,10)] +\
                [f"f{i}_B" for i in range(1,10)]
    mask = [True,False,True,False] + [True]*19
    return pairs.merge(
    features,
    how='left',
    left_on='img_id_A',
    right_on='img_id').merge(
    features,
    how='left',
    left_on='img_id_B',
    right_on='img_id',
    suffixes=('_A','_B'))[columns].loc[:,mask]

def subtr_features(pairs, features):
    columns = [f"f{i}" for i in range(1,10)]
    A = pairs.merge(features,
           left_on='img_id_A',
           right_on='img_id',
           how='left')[columns]
    B = pairs.merge(features,
           left_on='img_id_B',
           right_on='img_id',
           how='left')[columns]
    return pd.concat([pairs,np.abs(A - B)], axis=1)

In [9]:
same_c = concat_features(same, features).dropna()
same_s = subtr_features(same, features).dropna()
same_test_c = concat_features(same_test, features).dropna()
same_test_s = subtr_features(same_test, features).dropna()


diff_c = concat_features(diff, features).dropna()
diff_s = subtr_features(diff, features).dropna()
diff_test_c = concat_features(diff_test, features).dropna()
diff_test_s = subtr_features(diff_test, features).dropna()

diff_s.head()

Unnamed: 0,img_id_A,img_id_B,target,f1,f2,f3,f4,f5,f6,f7,f8,f9
2,0359a,1120b,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0,0.0
3,0359a,1120c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0359a,1121a,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5,0359a,1121b,0.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0
6,0359a,1121c,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,2.0,0.0


In [10]:
pd.concat([diff_c,same_c]).sample(frac=1).drop(
    ['img_id_A','img_id_B'],
    axis=1).to_csv('./data/concat/test.csv',index=False)

pd.concat([diff_s,same_s]).sample(frac=1).drop(
    ['img_id_A','img_id_B'],
    axis=1).to_csv('./data/subtr/test.csv',index=False)

In [11]:
full_c = pd.concat([diff_c, same_c])
full_s = pd.concat([diff_s, same_s])

In [12]:
full_c['writer_A'] = full_c['img_id_A'].str.extract(r'^([0-9]+)',expand=False)
full_c['writer_B'] = full_c['img_id_B'].str.extract(r'^([0-9]+)',expand=False)

full_s['writer_A'] = full_s['img_id_A'].str.extract(r'^([0-9]+)',expand=False)
full_s['writer_B'] = full_s['img_id_B'].str.extract(r'^([0-9]+)',expand=False)

In [13]:
full_s.head()

Unnamed: 0,img_id_A,img_id_B,target,f1,f2,f3,f4,f5,f6,f7,f8,f9,writer_A,writer_B
2,0359a,1120b,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,1.0,0.0,359,1120
3,0359a,1120c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,359,1120
4,0359a,1121a,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,359,1121
5,0359a,1121b,0.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,359,1121
6,0359a,1121c,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,2.0,0.0,359,1121


In [14]:
full_c.head()

Unnamed: 0,img_id_A,img_id_B,target,f1_A,f2_A,f3_A,f4_A,f5_A,f6_A,f7_A,...,f2_B,f3_B,f4_B,f5_B,f6_B,f7_B,f8_B,f9_B,writer_A,writer_B
0,0359a,1120b,0,2,1,1,0,2,2,0,...,1,1,0,2,2,0,2,2,359,1120
1,0359a,1120c,0,2,1,1,0,2,2,0,...,1,1,0,2,2,0,0,2,359,1120
2,0359a,1121a,0,2,1,1,0,2,2,0,...,1,1,3,2,2,0,1,2,359,1121
3,0359a,1121b,0,2,1,1,0,2,2,0,...,1,1,0,2,2,0,3,2,359,1121
4,0359a,1121c,0,2,1,1,0,2,2,0,...,1,1,0,2,2,0,1,2,359,1121


In [15]:
full_c.drop(['img_id_A','img_id_B'],axis=1, inplace=True)
full_s.drop(['img_id_A','img_id_B'],axis=1, inplace=True)

### Train test separation

![Img](https://d1b10bmlvqabco.cloudfront.net/attach/jlbcueow34qdq/isamd3soc56z/jnj25zqee36h/Screen_Shot_20181021_at_11.59.46_AM.png)

### Scheme 1 Unseen writers

for now any pair with test writer is considered a test pair    
    for more check [link](https://piazza.com/class/jlbcueow34qdq?cid=338)

In [16]:
def partition_unseen(df, train_writers=0.6):
    A = list(set(df.writer_A))
    np.random.shuffle(A)

    wr_train = set(A[:int(len(A)*train_writers)])
    wr_test = set(A[int(len(A)*train_writers):])

    df['test'] = (df.writer_A.isin(wr_test) | df.writer_B.isin(wr_test)).astype('uint8')

    df_tr = df[df.test == 0].drop(['test'],axis=1)
    df_ts = df[df.test == 1].drop(['test'],axis=1)
    df.drop('test',axis=1,inplace=True)
    print(f"target distribution in first: {df_tr.target.mean():.5f}")
    print(f"target distribution in second: {df_ts.target.mean():.5f}")
    return df_tr.astype(np.int32), df_ts.astype(np.int32)

In [17]:
fs_u_tr, fs_u_v = partition_unseen(full_s, 0.6)

target distribution in first: 0.00472
target distribution in second: 0.00165


In [18]:
fc_u_tr, fc_u_v = partition_unseen(full_c, 0.6)

target distribution in first: 0.00437
target distribution in second: 0.00171


In [19]:
fs_u_tr.drop(['writer_A','writer_B'],axis=1).to_csv('./data/subtr/unseen_train.csv', index=False)
fs_u_v.drop(['writer_A','writer_B'],axis=1).to_csv('./data/subtr/unseen_valid.csv', index=False)

fc_u_tr.drop(['writer_A','writer_B'],axis=1).to_csv('./data/concat/unseen_train.csv', index=False)
fc_u_v.drop(['writer_A','writer_B'],axis=1).to_csv('./data/concat/unseen_valid.csv', index=False)

### Scheme 2 Shuffled

In [20]:
def partition_shuffled(df, train_writers=0.8):
    shuffled = df.sample(frac=1)
    breakpoint = int(train_writers * df.shape[0])
    df_tr = shuffled.iloc[:breakpoint]
    df_ts = shuffled.iloc[breakpoint:]
    print(f"target distribution in train: {df_tr.target.mean():.5f}")
    print(f"target distribution in test: {df_ts.target.mean():.5f}")
    return df_tr.astype(np.int32), df_ts.astype(np.int32)

In [21]:
fs_r_tr, fs_r_v = partition_shuffled(full_s, 0.6)

fc_r_tr, fc_r_v = partition_shuffled(full_c, 0.6)

target distribution in train: 0.00272
target distribution in test: 0.00262
target distribution in train: 0.00259
target distribution in test: 0.00283


In [22]:
fs_r_tr.drop(['writer_A','writer_B'],axis=1).to_csv('./data/subtr/shuffled_train.csv', index=False)
fs_r_v.drop(['writer_A','writer_B'],axis=1).to_csv('./data/subtr/shuffled_valid.csv', index=False)

fc_r_tr.drop(['writer_A','writer_B'],axis=1).to_csv('./data/concat/shuffled_train.csv', index=False)
fc_r_v.drop(['writer_A','writer_B'],axis=1).to_csv('./data/concat/shuffled_valid.csv', index=False)

### Scheme 3 Seen

once again not completely clear how to treat pairs of users    
let's see how many duplicated pairs we have

In [23]:
full_s[['writer_A','writer_B']].duplicated().mean()

0.5737998400353964

that's enought to partitiob by pairs

In [24]:
def partition_seen(df, train_writers=0.8):
    df_s = df.sort_values(by=['writer_A','writer_B'])
    pairs = (df_s.writer_A + '_').str.cat(df_s.writer_B).reset_index().drop(['index'],axis=1)
    pairs.columns = ['pair']
    pairs['idx'] = pairs.groupby(by='pair').cumcount()
    pairs = pairs.merge(
    (pairs.groupby('pair')['idx'].count() * train_writers).reset_index(),
    how='left',
    on='pair', suffixes=('_cum','_thr')
    )
    df_tr = df_s[(pairs.idx_cum <= pairs.idx_thr).values]
    df_ts = df_s[(pairs.idx_cum > pairs.idx_thr).values]
    
    print(f"first size: {df_tr.shape[0] / df.shape[0]}")
    print(f"second size: {df_ts.shape[0] / df.shape[0]}")
    
    print(f"first distribution in train: {df_tr.target.mean():.5f}")
    print(f"second distribution in test: {df_ts.target.mean():.5f}")
    return df_tr, df_ts

In [25]:
fs_s_tr, fs_s_v = partition_seen(full_s, train_writers=0.6)
print('\n')
fc_s_tr, fc_s_v = partition_seen(full_c, train_writers=0.6)

first size: 0.8149005326480949
second size: 0.1850994673519051
first distribution in train: 0.00236
second distribution in test: 0.00408
first size: 0.8148996491036999
second size: 0.18510035089630011
first distribution in train: 0.00236
second distribution in test: 0.00410


In [26]:
fs_s_tr.drop(['writer_A','writer_B'],axis=1).to_csv('./data/subtr/seen_train.csv', index=False)
fs_s_v.drop(['writer_A','writer_B'],axis=1).to_csv('./data/subtr/seen_valid.csv', index=False)

fc_s_tr.drop(['writer_A','writer_B'],axis=1).to_csv('./data/concat/seen_train.csv', index=False)
fc_s_v.drop(['writer_A','writer_B'],axis=1).to_csv('./data/concat/seen_valid.csv', index=False)