In [1]:
# Load the autoreload extension so edits to local modules are picked up
# without restarting the kernel.
%load_ext autoreload
# Mode 1: only modules registered via %aimport are reloaded before execution.
%autoreload 1
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [4]:
import sys
import numpy as np
import pandas as pd
# Suppress scientific notation when printing small floating-point values.
np.set_printoptions(suppress=True)
# Make the local ./code directory importable; data_generator is imported from it below.
sys.path.append('./code')
# Register data_generator with the autoreload extension.
%aimport data_generator
from data_generator import generate_block_data, linear_model_generation

### 0. Synthetic Data Assumption
Assume X can be decomposed into X = {S, V} and is generated from a multivariate normal distribution $X\sim N(0,\Sigma)$. By specifying the structure of the covariance matrix $\Sigma$, we can simulate different correlation structures of X.

Specifically, we can divide $\Sigma$ into four blocks according to S and V, as follows:
$$
\Sigma = \left(\begin{array}{cc} 
                \Sigma_{SS} & \Sigma_{SV}\\ 
                \Sigma_{VS} & \Sigma_{VV}\\
         \end{array}\right).
$$

### 1. S&V data generation
In this main experiment, we assume the correlation comes only from S and V, and is therefore a spurious correlation.

In [45]:
# Base pattern of 10 coefficients, stored as a (10, 1) column vector.
beta_ = np.array([1/5, -2/5, 3/5, -4/5, 1, -1/5, 2/5, -3/5, 4/5, -1]).reshape(-1, 1)
# Stack the base pattern on top of its negation (20 rows), then repeat that
# 20-row block twice to obtain the final (40, 1) coefficient vector.
signed_pair = np.concatenate([beta_, -beta_], axis=0)
beta = np.tile(signed_pair, (2, 1))
beta

array([[ 0.2],
       [-0.4],
       [ 0.6],
       [-0.8],
       [ 1. ],
       [-0.2],
       [ 0.4],
       [-0.6],
       [ 0.8],
       [-1. ],
       [-0.2],
       [ 0.4],
       [-0.6],
       [ 0.8],
       [-1. ],
       [ 0.2],
       [-0.4],
       [ 0.6],
       [-0.8],
       [ 1. ],
       [ 0.2],
       [-0.4],
       [ 0.6],
       [-0.8],
       [ 1. ],
       [-0.2],
       [ 0.4],
       [-0.6],
       [ 0.8],
       [-1. ],
       [-0.2],
       [ 0.4],
       [-0.6],
       [ 0.8],
       [-1. ],
       [ 0.2],
       [-0.4],
       [ 0.6],
       [-0.8],
       [ 1. ]])

In [74]:
# Training Data Generation
# Simulation settings: trial_num independent trials, each with n samples and
# p features split into blocks of size s; rho, degree and magnitude are passed
# straight through to the generators from data_generator.
trial_num = 30
n, p, s, rho = 1000, 10, 2, 0.8
degree, magnitude = 5, 0.5
# (10, 1) column vector of coefficients handed to linear_model_generation.
beta = np.array([1/5, -2/5, 3/5, -4/5, 1, -1/5, 2/5, -3/5, 4/5, -1]).reshape(-1, 1)
# Number of blocks is the same for every trial, so compute it once.
block_num = int(p/s)

X_train, Y_train = [], []
for i in range(trial_num):
    # Seed the global NumPy RNG with the trial index so each trial is reproducible.
    np.random.seed(i)
    X = generate_block_data(sample_size=n, block_size=s, block_num=block_num, rho=rho)
    Y = linear_model_generation(X, beta, magnitude=magnitude, degree=degree)
    X_train += [X]
    Y_train += [Y]

In [76]:
# Testing Data Generation
# For each trial, one (X, Y) pair is generated per test environment, where an
# environment is a value passed as `rho` to generate_block_data.
trial_num = 30
X_test, Y_test = [], []
n, p, s, rho = 1000, 10, 2, 0.8
degree, magnitude = 5, 0.5
# Defined explicitly (same values as the training cell) so this cell does not
# silently depend on whichever `beta` happens to be live in the kernel.
beta = np.array([[1/5],[-2/5],[3/5],[-4/5],[1],[-1/5],[2/5],[-3/5],[4/5],[-1]])
environments = [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9]

for i in range(trial_num):
    # Offset the seed by 100 so test trials never reuse a training seed (0..29).
    np.random.seed(i+100)
    X_env, Y_env = [], []
    # Use a dedicated loop name: iterating with `rho` would overwrite the
    # `rho = 0.8` setting above and leave a stale value in the kernel.
    for env_rho in environments:
        X = generate_block_data(sample_size=n, block_size=s, block_num=int(p/s), rho=env_rho)
        Y = linear_model_generation(X, beta, magnitude=magnitude, degree=degree)
        X_env.append(X)
        Y_env.append(Y)
    # X_test[trial][environment] / Y_test[trial][environment]
    X_test.append(X_env)
    Y_test.append(Y_env)

In [77]:
# Bundle the generated train/test collections under their archive keys and
# write them to a single .npz file.
synthetic_arrays = dict(xtrain=X_train, ytrain=Y_train, xtest=X_test, ytest=Y_test)
np.savez('synthetic_data_main.npz', **synthetic_arrays)

In [34]:
# Load a previously saved synthetic-data archive.
# NOTE(review): this reads 'synthetic_data_sv.npz', while the save cell in this
# notebook writes 'synthetic_data_main.npz' — confirm which file is intended.
synthetic_data_sv = np.load('synthetic_data_sv.npz')

### 2. WeChat Ads Data

In [182]:
# Load the header file
# Each line of the description file yields one column name: surrounding
# whitespace is stripped first, then any leading/trailing commas.
# The context manager guarantees the file handle is closed after reading.
with open('./data/Ads_description.txt') as fd:
    headers = [line.strip().strip(',') for line in fd]

In [183]:
# Load the Data
# Column names are supplied from `headers` (read from the description file)
# via `names=`; the resulting (rows, columns) shape is printed as a sanity check.
ads_data = pd.read_csv('./data/Ads.data', names=headers)
print(ads_data.shape)

(324170, 67)


In [184]:
# Filter the invalid age data
# Rows are kept only when `fage` is strictly below 100.
age = ads_data['fage']
valid_age = age < 100
ads_data = ads_data[valid_age]

In [185]:
# Drop uin
# Removes the `uin` column in place, then prints the new (rows, columns) shape.
# NOTE(review): `uin` is presumably a user identifier with no predictive use — confirm.
ads_data.drop(['uin'], axis=1, inplace=True)
print(ads_data.shape)

(323609, 66)


In [186]:
# Filter the binary features without enough information
# A column is treated as binary when all of its values lie strictly between
# -0.1 and 1.1. Such a column is dropped when its mean is below 0.05 or above
# 0.95, i.e. when it is almost constant.
low_info_cols = []
for col in ads_data.columns:
    if col == 'outcome':
        # `outcome` is the prediction target used by the later save cells; it
        # must never be removed by this feature filter, whatever its base rate.
        continue
    data = ads_data[col]
    if data.max()<1.1 and data.min()>-0.1: #binary
        col_mean = data.mean()  # computed once and reused for the test and the report
        if col_mean<0.05 or col_mean>0.95:
            print(col, col_mean)
            low_info_cols.append(col)
# Drop all flagged columns in a single in-place operation.
ads_data.drop(low_info_cols, axis=1, inplace=True)
print(ads_data.shape)

fmplugin_ 0.990420538366
facebookplugin_ 0.0353945656641
feedsappplugin_ 0.995497653032
masssendplugin_ 0.960659314172
meishiplugin_ 0.00720931741701
sex_ 0.0124533001245
personalcard_ 1.0
verifyuser_ 0.987117169176
recomendfacebookfriend_ 0.00643369003952
readerapppush_ 6.18029782855e-06
meishicard_ 0.00767283975415
weixinhdimg_ 0.999252183963
weixinimg_ 0.999765148683
albumfrontcover_ 0.975535291046
snsflag_ 0.993108967921
paybank_ 0.999947467468
(323609, 50)


Up to now, we have 49 features and 1 outcome. Among the 49 features there are 11 non-binary and 38 binary features; we want to keep ~15 features in total.

In [187]:
# Hold a subset of binary features
# Keeps the first feature_num+1 columns and the last column, dropping every
# column positioned in between (selection is purely positional).
# NOTE(review): this relies on `fage` being the first column and `outcome` the
# last — confirm the column order before running.
# NOTE(review): the recorded output below still shows 50 columns, so it does
# not appear to reflect this drop — re-run to verify.
feature_num = 15 # Exclude Age
drop_cols = ads_data.columns[feature_num+1:-1]
ads_data.drop(drop_cols, axis=1, inplace=True)
ads_data.shape

(323609, 50)

In [None]:
# Split the data with Age
# Consecutive pairs of `edges` define half-open age bins [lower, upper).
edges = [0, 20, 30, 40, 50, 60, 100]
data_list = []
for lower, upper in zip(edges[:-1], edges[1:]):
    in_bin = (ads_data.fage >= lower) & (ads_data.fage < upper)
    # The binning column itself is removed from each group.
    data = ads_data[in_bin].drop(['fage'], axis=1)
    data_list.append(data)
    print(len(data))

# Save into file
# The first bin is skipped; the remaining groups are written with indices
# starting at 0.
for i, group in enumerate(data_list[1:len(edges)-1]):
    X = group.drop(['outcome'], axis=1)
    Y = group['outcome']
    np.save('data/Ads_age_5group/Ads_X'+str(i)+'.npy', np.array(X))
    np.save('data/Ads_age_5group/Ads_Y'+str(i)+'.npy', np.array(Y))

In [196]:
# Split the data with friend_count
# Consecutive pairs of `edges` define half-open bins [lower, upper) on
# `ftotal_friend_count`.
edges = [0, 6, 7, 8, 9, 10, 100]
data_list = []
for lower, upper in zip(edges[:-1], edges[1:]):
    in_bin = (ads_data.ftotal_friend_count >= lower) & (ads_data.ftotal_friend_count < upper)
    # The binning column itself is removed from each group.
    data = ads_data[in_bin].drop(['ftotal_friend_count'], axis=1)
    data_list.append(data)
    print(len(data))

# Save into file
# Every group is written: features to Ads_X<i>.npy, target to Ads_Y<i>.npy.
for i, group in enumerate(data_list):
    X = group.drop(['outcome'], axis=1)
    Y = group['outcome']
    np.save('data/Ads_friend_6group/Ads_X'+str(i)+'.npy', np.array(X))
    np.save('data/Ads_friend_6group/Ads_Y'+str(i)+'.npy', np.array(Y))

9712
50054
120522
102332
34150
6839


In [197]:
# Inspect which columns remain in the first group of data_list.
data_list[0].columns

Index(['fage', 'flogin_counts', 'fsendmsg_count', 'frecvmsg_count',
       'fsnsupload_count', 'fgamereview_count', 'fbankcard_view_count',
       'fsns_view', 'forder_num', 'fgrade', 'fis_game_user', 'device_',
       'qqmailplugin_', 'pmplugin_', 'weiboplugin_', 'medianoteplugin_',
       'qqmsgplugin_', 'bottleplugin_', 'qqsyncplugin_', 'shakeplugin_',
       'lbsplugin_', 'gmailplugin_', 'checkqqfriendplugin_',
       'readerappplugin_', 'blogplugin_', 'newsappplugin_', 'nightnovoice_',
       'alldaynovoice_', 'emailverify_', 'mobileverify_', 'qqsearch_',
       'promotetoother_', 'qqmsgnotify_', 'promoteqqfriendtome_',
       'promotetome_', 'mobilesearch_', 'addcontact_', 'apnstips_',
       'qqweixinonline_', 'uploadmobilecontact_', 'weibourl_',
       'readerapptxnewspush_', 'readerappweibopush_', 'txweiboicon_',
       'bottlehdimg_', 'newsapptxnewspush_', 'snsflagex_', 'voiceinput_',
       'outcome'],
      dtype='object')

In [198]:
# Summary statistics (count, mean, std, quantiles) for the first group of data_list.
data_list[0].describe()

Unnamed: 0,fage,flogin_counts,fsendmsg_count,frecvmsg_count,fsnsupload_count,fgamereview_count,fbankcard_view_count,fsns_view,forder_num,fgrade,...,uploadmobilecontact_,weibourl_,readerapptxnewspush_,readerappweibopush_,txweiboicon_,bottlehdimg_,newsapptxnewspush_,snsflagex_,voiceinput_,outcome
count,9712.0,9712.0,9712.0,9712.0,9712.0,9712.0,9712.0,9712.0,9712.0,9712.0,...,9712.0,9712.0,9712.0,9712.0,9712.0,9712.0,9712.0,9712.0,9712.0,9712.0
mean,27.032846,8.207702,7.172357,9.061019,2.748853,2.43874,1.851859,6.890589,1.116274,3.004839,...,0.816516,0.078357,0.020181,0.142504,0.081755,0.416186,0.304057,0.373867,0.958505,0.111306
std,9.624162,2.633818,3.321355,3.257955,2.11682,1.91325,2.024938,2.564211,1.626852,1.907574,...,0.387083,0.268746,0.140627,0.349584,0.274004,0.492951,0.460031,0.483854,0.199443,0.314526
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,21.0,7.851749,5.643856,7.971544,1.0,1.0,0.0,6.247928,0.0,2.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,26.0,8.823367,7.842347,9.670656,2.807355,2.321928,1.0,7.58871,0.0,3.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,32.0,9.707359,9.459432,11.087629,4.392317,3.70044,3.321928,8.511753,2.0,4.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
max,99.0,16.077442,15.038105,17.71993,9.763212,10.542065,11.316847,12.06777,9.33985,6.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
