In [1]:
import numpy as np
import csv
import random

In [2]:
male_data = []
with open('MALE.csv') as csvfile:
    csv_reader = csv.reader(csvfile)  
    male_header = next(csv_reader)  
    for row in csv_reader: 
        male_data.append(row)
        
female_data = []
with open('FEMALE.csv') as csvfile:
    csv_reader = csv.reader(csvfile)  
    female_header = next(csv_reader)  
    for row in csv_reader: 
        female_data.append(row)
        
mixed_data = []
with open('MIXED.csv') as csvfile:
    csv_reader = csv.reader(csvfile)  
    mixed_header = next(csv_reader)  
    for row in csv_reader: 
        mixed_data.append(row)
        
male_data = [[int(a) for a in row] for row in male_data]
female_data = [[int(b) for b in row] for row in female_data]
mixed_data = [[int(c) for c in row] for row in mixed_data]

In [3]:
# shuffle dataset
random.shuffle(male_data)
random.shuffle(female_data)
random.shuffle(mixed_data)

In [4]:
# get score(label)
male_score = []
for e in male_data:
    male_score.append(e[6])
    del e[6]
    
female_score = []
for e in female_data:
    female_score.append(e[6])
    del e[6]
    
mixed_score = []
for e in mixed_data:
    mixed_score.append(e[6])
    del e[6]

In [18]:
# divide train, dev and test set
test_male_data_x = male_data[:100]
dev_male_data_x = male_data[100:200]
train_male_data_x = male_data[200:]

test_male_data_y = male_score[:100]
dev_male_data_y = male_score[100:200]
train_male_data_y = male_score[200:]

test_female_data_x = female_data[:100]
dev_female_data_x = female_data[100:200]
train_female_data_x = female_data[200:]

test_female_data_y = female_score[:100]
dev_female_data_y = female_score[100:200]
train_female_data_y = female_score[200:]

test_mixed_data_x = mixed_data[:100]
dev_mixed_data_x = mixed_data[100:200]
train_mixed_data_x = mixed_data[200:]

test_mixed_data_y = mixed_score[:100]
dev_mixed_data_y = mixed_score[100:200]
train_mixed_data_y = mixed_score[200:]

In [6]:
import sklearn.metrics
from cvxopt import matrix, solvers

In [69]:
def kernel(ker, X1, X2, gamma):
    K = None
    if ker == 'linear':
        if X2 is not None:
            K = sklearn.metrics.pairwise.linear_kernel(np.asarray(X1), np.asarray(X2))
        else:
            K = sklearn.metrics.pairwise.linear_kernel(np.asarray(X1))
    elif ker == 'rbf':
        if X2 is not None:
            K = sklearn.metrics.pairwise.rbf_kernel(np.asarray(X1), np.asarray(X2), gamma)
        else:
            K = sklearn.metrics.pairwise.rbf_kernel(np.asarray(X1), None, gamma)
    return K

class KMM:
    def __init__(self, kernel_type='linear', gamma=1.0, B=1.0, eps=None):
        '''
        Initialization function
        :param kernel_type: 'linear' | 'rbf'
        :param gamma: kernel bandwidth for rbf kernel
        :param B: bound for beta
        :param eps: bound for sigma_beta
        '''
        self.kernel_type = kernel_type
        self.gamma = gamma
        self.B = B
        self.eps = eps

    def fit(self, Xs, Xt):
        '''
        Fit source and target using KMM (compute the coefficients)
        :param Xs: ns * dim
        :param Xt: nt * dim
        :return: Coefficients (Pt / Ps) value vector (Beta in the paper)
        '''
        ns = Xs.shape[0]
        nt = Xt.shape[0]
        if self.eps == None:
            self.eps = self.B / np.sqrt(ns)
        K = kernel(self.kernel_type, Xs, None, self.gamma)
        kappa = np.sum(kernel(self.kernel_type, Xs, Xt, self.gamma) * float(ns) / float(nt), axis=1)

        K = matrix(K)
        kappa = matrix(kappa)
        G = matrix(np.r_[np.ones((1, ns)), -np.ones((1, ns)), np.eye(ns), -np.eye(ns)])
        h = matrix(np.r_[ns * (1 + self.eps), ns * (self.eps - 1), self.B * np.ones((ns,)), np.zeros((ns,))])

        sol = solvers.qp(K, -kappa, G, h)
        beta = np.array(sol['x'])
        return beta


if __name__ == '__main__':
    Xs1 = train_male_data_x
    Xs2 = train_female_data_x
    Xt = train_mixed_data_x
    Xs1, Xt = np.asarray(Xs1), np.asarray(Xt)
    Xs2, Xt = np.asarray(Xs2), np.asarray(Xt)
    kmm = KMM(kernel_type='linear', B=10)
    beta1 = kmm.fit(Xs1, Xt)
    beta2 = kmm.fit(Xs2, Xt)
    print(beta1, beta2)

     pcost       dcost       gap    pres   dres
 0: -1.3711e+10 -1.3731e+10  2e+07  2e-02  7e-16
 1: -1.3711e+10 -1.3713e+10  2e+06  6e-04  9e-16
 2: -1.3711e+10 -1.3711e+10  3e+04  1e-05  9e-16
 3: -1.3711e+10 -1.3711e+10  3e+02  1e-07  1e-15
 4: -1.3711e+10 -1.3711e+10  3e+00  1e-09  1e-15
Optimal solution found.
     pcost       dcost       gap    pres   dres
 0: -2.0312e+10 -2.0342e+10  4e+07  2e-02  8e-16
 1: -2.0312e+10 -2.0315e+10  3e+06  9e-04  1e-15
 2: -2.0312e+10 -2.0312e+10  4e+04  7e-06  1e-15
 3: -2.0312e+10 -2.0312e+10  4e+02  7e-08  1e-15
Optimal solution found.
[[0.79068114]
 [1.50629163]
 [0.88327673]
 ...
 [0.82297187]
 [0.59212894]
 [0.82297187]] [[1.97745806]
 [0.87310003]
 [0.55896195]
 ...
 [0.86287125]
 [1.79584165]
 [0.50556145]]


In [67]:
def weighted_data(train_data_x, beta):
    new_train_data_x = []
    for index in range(len(train_data_x)):
        element = []
        for j in train_data_x[index]:
            e = j * float(beta[index])
            element.append(e)
        new_train_data_x.append(element)
    
    return new_train_data_x

In [76]:
new_train_male_data_x = []
for index in range(len(train_male_data_x)):
    element = []
    for j in train_male_data_x[index]:
        e = j * float(beta1[index])
        element.append(e)
    new_train_female_data_x.append(element)

In [77]:
new_train_female_data_x = []
for index in range(len(train_female_data_x)):
    element = []
    for j in train_mixed_data_x[index]:
        e = j * float(beta2[index])
        element.append(e)
    new_train_mixed_data_x.append(element)

In [73]:
len(beta2)

4204

In [74]:
len(new_train_mixed_data_x)

11293

In [60]:
from sklearn.neural_network import MLPRegressor
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

In [61]:
len(train_mixed_data_y)

7089

In [62]:
# mlp regression
def mlp_regression(x_train, y_train, x_test):
    regr = MLPRegressor(max_iter=800).fit(x_train, y_train)

    x_test = np.array(x_test)
    y_pred = regr.predict(x_test)
    
    return y_pred

In [63]:
# linear regression
def lr_regression(x_train, y_train, x_test):
    regr = linear_model.LinearRegression().fit(x_train, y_train)

    x_test = np.array(x_test)
    y_pred = regr.predict(x_test)
    
    return y_pred

In [64]:
# male
new_train_female_data_x = weighted_data()
src_x_train = new_train_female_data_x + new_train_mixed_data_x
src_x_train = np.array(src_x_train)
src_y_train = train_female_data_y + train_mixed_data_y
src_y_train = np.array(src_y_train)
y_test = np.array(test_male_data_y)

mlp_y_pred = mlp_regression(src_x_train, src_y_train, test_male_data_x)
lr_y_pred = lr_regression(src_x_train, src_y_train, test_male_data_x)

print(mean_squared_error(y_test, mlp_y_pred), mean_squared_error(y_test, lr_y_pred))

93.88531333361057 120.11118941052308


In [68]:
# female
new_train_male_data_x = weighted_data(train_male_data_x, beta1)
new_train_mixed_data_x = weighted_data(train_mixed_data_x, beta2)
src_x_train = new_train_male_data_x + new_train_mixed_data_x
src_x_train = np.array(src_x_train)
src_y_train = train_male_data_y + train_mixed_data_y
src_y_train = np.array(src_y_train)
y_test = np.array(test_female_data_y)

mlp_y_pred = mlp_regression(src_x_train, src_y_train, test_female_data_x)
lr_y_pred = lr_regression(src_x_train, src_y_train, test_female_data_x)

print(mean_squared_error(y_test, mlp_y_pred), mean_squared_error(y_test, lr_y_pred))

133.38367222470777 188.27332326627885


In [78]:
# mixed
new_train_male_data_x = weighted_data(train_male_data_x, beta1)
new_train_female_data_x = weighted_data(train_female_data_x, beta2)
src_x_train = new_train_male_data_x + new_train_female_data_x
src_x_train = np.array(src_x_train)
src_y_train = train_male_data_y + train_female_data_y
src_y_train = np.array(src_y_train)
y_test = np.array(test_mixed_data_y)

mlp_y_pred = mlp_regression(src_x_train, src_y_train, test_mixed_data_x)
lr_y_pred = lr_regression(src_x_train, src_y_train, test_mixed_data_x)

print(mean_squared_error(y_test, mlp_y_pred), mean_squared_error(y_test, lr_y_pred))

140.32051199262486 156.2133370910049
