In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
diff = pd.read_csv('./HumanObserved-Dataset/HumanObserved-Features-Data/diffn_pairs.csv')
same = pd.read_csv('./HumanObserved-Dataset/HumanObserved-Features-Data/same_pairs.csv')
features = pd.read_csv('./HumanObserved-Dataset/HumanObserved-Features-Data/HumanObserved-Features-Data.csv', index_col=0)

In [3]:
same.head()

Unnamed: 0,img_id_A,img_id_B,target
0,0359a,0359b,1
1,0577a,0577b,1
2,0577a,0577c,1
3,1120a,1120b,1
4,1120a,1120c,1


In [4]:
features.head()

Unnamed: 0,img_id,f1,f2,f3,f4,f5,f6,f7,f8,f9
0,0359a,2,1,1,0,2,2,0,2,2
1,0577a,2,1,1,0,2,2,0,1,2
2,1120a,2,1,1,3,2,2,0,2,2
3,1120b,1,1,1,0,2,2,0,2,2
4,1120c,2,1,1,0,2,2,0,0,2


In [5]:
def concat_features(pairs, features):
    columns = ['img_id_A', 'img_id_B', 'target'] +\
                [f"f{i}_A" for i in range(1,10)] +\
                [f"f{i}_B" for i in range(1,10)]
    mask = [True,False,True,False] + [True]*19
    return pairs.merge(
    features,
    left_on='img_id_A',
    right_on='img_id').merge(
    features,
    left_on='img_id_B',
    right_on='img_id',
    suffixes=('_A','_B'))[columns].loc[:,mask]

def subtr_features(pairs, features):
    columns = [f"f{i}" for i in range(1,10)]
    A = pairs.merge(features,
           left_on='img_id_A',
           right_on='img_id',
           how='left')[columns]
    B = pairs.merge(features,
           left_on='img_id_B',
           right_on='img_id',
           how='left')[columns]
    return pd.concat([pairs,np.abs(A - B)], axis=1)

In [6]:
same_c = concat_features(same, features)
same_c.head()

Unnamed: 0,img_id_A,img_id_B,target,f1_A,f2_A,f3_A,f4_A,f5_A,f6_A,f7_A,...,f9_A,f1_B,f2_B,f3_B,f4_B,f5_B,f6_B,f7_B,f8_B,f9_B
0,0359a,0359b,1,2,1,1,0,2,2,0,...,2,3,2,1,0,2,2,3,0,2
1,0577a,0577b,1,2,1,1,0,2,2,0,...,2,2,1,0,3,2,2,1,2,2
2,0577a,0577c,1,2,1,1,0,2,2,0,...,2,1,1,1,1,2,3,0,0,2
3,0577b,0577c,1,2,1,0,3,2,2,1,...,2,1,1,1,1,2,3,0,0,2
4,1120a,1120b,1,2,1,1,3,2,2,0,...,2,1,1,1,0,2,2,0,2,2


In [7]:
same_s = subtr_features(same, features)
same_s.head()

Unnamed: 0,img_id_A,img_id_B,target,f1,f2,f3,f4,f5,f6,f7,f8,f9
0,0359a,0359b,1,1,1,0,0,0,0,3,2,0
1,0577a,0577b,1,0,0,1,3,0,0,1,1,0
2,0577a,0577c,1,1,0,0,1,0,1,0,1,0
3,1120a,1120b,1,1,0,0,3,0,0,0,0,0
4,1120a,1120c,1,0,0,0,3,0,0,0,2,0


In [8]:
diff_c = concat_features(diff, features)
diff_s = subtr_features(diff, features)

diff_s.head()

Unnamed: 0,img_id_A,img_id_B,target,f1,f2,f3,f4,f5,f6,f7,f8,f9
0,0359a,0577a,0,0,0,0,0,0,0,0,1,0
1,0359a,1120a,0,0,0,0,3,0,0,0,0,0
2,0359a,1120b,0,1,0,0,0,0,0,0,0,0
3,0359a,1120c,0,0,0,0,0,0,0,0,2,0
4,0359a,1121a,0,0,0,0,3,0,0,0,1,0


In [9]:
diff_c.head()

Unnamed: 0,img_id_A,img_id_B,target,f1_A,f2_A,f3_A,f4_A,f5_A,f6_A,f7_A,...,f9_A,f1_B,f2_B,f3_B,f4_B,f5_B,f6_B,f7_B,f8_B,f9_B
0,0359a,0577a,0,2,1,1,0,2,2,0,...,2,2,1,1,0,2,2,0,1,2
1,0359a,1120a,0,2,1,1,0,2,2,0,...,2,2,1,1,3,2,2,0,2,2
2,0577a,1120a,0,2,1,1,0,2,2,0,...,2,2,1,1,3,2,2,0,2,2
3,0359a,1120b,0,2,1,1,0,2,2,0,...,2,1,1,1,0,2,2,0,2,2
4,0577a,1120b,0,2,1,1,0,2,2,0,...,2,1,1,1,0,2,2,0,2,2


In [10]:
full_c = pd.concat([diff_c, same_c])
full_s = pd.concat([diff_s, same_s])

In [11]:
full_c['writer_A'] = full_c['img_id_A'].str.extract(r'^([0-9]+)',expand=False)
full_c['writer_B'] = full_c['img_id_B'].str.extract(r'^([0-9]+)',expand=False)

full_s['writer_A'] = full_s['img_id_A'].str.extract(r'^([0-9]+)',expand=False)
full_s['writer_B'] = full_s['img_id_B'].str.extract(r'^([0-9]+)',expand=False)

In [12]:
full_s.head()

Unnamed: 0,img_id_A,img_id_B,target,f1,f2,f3,f4,f5,f6,f7,f8,f9,writer_A,writer_B
0,0359a,0577a,0,0,0,0,0,0,0,0,1,0,359,577
1,0359a,1120a,0,0,0,0,3,0,0,0,0,0,359,1120
2,0359a,1120b,0,1,0,0,0,0,0,0,0,0,359,1120
3,0359a,1120c,0,0,0,0,0,0,0,0,2,0,359,1120
4,0359a,1121a,0,0,0,0,3,0,0,0,1,0,359,1121


In [13]:
full_c.head()

Unnamed: 0,img_id_A,img_id_B,target,f1_A,f2_A,f3_A,f4_A,f5_A,f6_A,f7_A,...,f2_B,f3_B,f4_B,f5_B,f6_B,f7_B,f8_B,f9_B,writer_A,writer_B
0,0359a,0577a,0,2,1,1,0,2,2,0,...,1,1,0,2,2,0,1,2,359,577
1,0359a,1120a,0,2,1,1,0,2,2,0,...,1,1,3,2,2,0,2,2,359,1120
2,0577a,1120a,0,2,1,1,0,2,2,0,...,1,1,3,2,2,0,2,2,577,1120
3,0359a,1120b,0,2,1,1,0,2,2,0,...,1,1,0,2,2,0,2,2,359,1120
4,0577a,1120b,0,2,1,1,0,2,2,0,...,1,1,0,2,2,0,2,2,577,1120


In [14]:
full_c.drop(['img_id_A','img_id_B'],axis=1, inplace=True)
full_s.drop(['img_id_A','img_id_B'],axis=1, inplace=True)

### Train test separation

![Img](https://d1b10bmlvqabco.cloudfront.net/attach/jlbcueow34qdq/isamd3soc56z/jnj25zqee36h/Screen_Shot_20181021_at_11.59.46_AM.png)

### Scheme 1 Unseen writers

for now any pair with test writer is considered a test pair    
    for more check [link](https://piazza.com/class/jlbcueow34qdq?cid=338)

In [33]:
def partition_unseen(df, train_writers=0.8):
    A = list(set(df.writer_A))

    wr_train = set(A[:int(len(A)*train_writers)])
    wr_test = set(A[int(len(A)*train_writers):])

    df['test'] = (df.writer_A.isin(wr_test) | df.writer_B.isin(wr_test)).astype('uint8')

    df_tr = df[df.test == 0].drop(['writer_A','writer_B','test'],axis=1)
    df_ts = df[df.test == 1].drop(['writer_A','writer_B','test'],axis=1)
    df.drop('test',axis=1,inplace=True)
    print(f"target distribution in train: {df_tr.target.mean():.5f}")
    print(f"target distribution in test: {df_ts.target.mean():.5f}")
    return df_tr.astype(np.int32), df_ts.astype(np.int32)

In [42]:
fs_u_tr, fs_u_ts = partition_unseen(full_s)

target distribution in train: 0.00334
target distribution in test: 0.00154


In [43]:
fc_u_tr, fc_u_ts = partition_unseen(full_c)

target distribution in train: 0.00334
target distribution in test: 0.00154


In [44]:
fs_u_tr.to_csv('./data/subtr/unseen_train.csv')
fs_u_ts.to_csv('./data/subtr/unseen_test.csv')

fc_u_tr.to_csv('./data/concat/unseen_train.csv')
fc_u_ts.to_csv('./data/concat/unseen_test.csv')

### Scheme 2 Shuffled

In [18]:
def partition_shuffled(df, train_writers=0.8):
    shuffled = df.sample(frac=1)
    breakpoint = int(train_writers * df.shape[0])
    df_tr = shuffled.iloc[:breakpoint]
    df_ts = shuffled.iloc[breakpoint:]
    print(f"target distribution in train: {df_tr.target.mean():.5f}")
    print(f"target distribution in test: {df_ts.target.mean():.5f}")
    return df_tr.astype(np.int32), df_ts.astype(np.int32)

In [19]:
fs_r_tr, fs_r_ts = partition_shuffled(full_s)

target distribution in train: 0.00263
target distribution in test: 0.00294


In [20]:
fc_r_tr, fc_r_ts = partition_shuffled(full_c)

target distribution in train: 0.00262
target distribution in test: 0.00296


In [45]:
fs_r_tr.to_csv('./data/subtr/shuffled_train.csv')
fs_r_ts.to_csv('./data/subtr/shuffled_test.csv')

fc_r_tr.to_csv('./data/concat/shuffled_train.csv')
fc_r_ts.to_csv('./data/concat/shuffled_test.csv')

### Scheme 3 Seen

once again not completely clear how to treat pairs of users    
let's see how many duplicated pairs we have

In [21]:
full_s[['writer_A','writer_B']].duplicated().mean()

0.5738012340762977

that's enought to partitiob by pairs

In [22]:
full_s = full_s.sort_values(by=['writer_A','writer_B'])

In [26]:
def partition_seen(df, train_writers=0.8):
    df_s = df.sort_values(by=['writer_A','writer_B'])
    pairs = (df_s.writer_A + '_').str.cat(df_s.writer_B).reset_index().drop(['index'],axis=1)
    pairs.columns = ['pair']
    pairs['idx'] = pairs.groupby(by='pair').cumcount()
    pairs = pairs.merge(
    (pairs.groupby('pair')['idx'].count() * train_writers).reset_index(),
    how='left',
    on='pair', suffixes=('_cum','_thr')
    )
    df_tr = df_s[(pairs.idx_cum <= pairs.idx_thr).values]
    df_ts = df_s[(pairs.idx_cum > pairs.idx_thr).values]
    
    print(f"train size: {df_tr.shape[0] / df.shape[0]}")
    print(f"test size: {df_ts.shape[0] / df.shape[0]}")
    
    print(f"target distribution in train: {df_tr.target.mean():.5f}")
    print(f"target distribution in test: {df_ts.target.mean():.5f}")
    return df_tr.astype(np.int32), df_ts.astype(np.int32)

In [27]:
fs_s_tr, fs_s_ts = partition_seen(full_s, train_writers=0.6)

train size: 0.8148919587642901
test size: 0.18510804123570992
target distribution in train: 0.00237
target distribution in test: 0.00412


In [28]:
fc_s_tr, fc_s_ts = partition_seen(full_c, train_writers=0.6)

train size: 0.8148919587642901
test size: 0.18510804123570992
target distribution in train: 0.00237
target distribution in test: 0.00412


In [48]:
fs_s_tr.to_csv('./data/subtr/seen_train.csv')
fs_s_ts.to_csv('./data/subtr/seen_test.csv')

fc_s_tr.to_csv('./data/concat/seen_train.csv')
fc_s_ts.to_csv('./data/concat/seen_test.csv')