In [1]:
import pandas as pd
import numpy as np
import os

# 工作路径改为/quoraDupli
os.chdir(os.path.dirname(os.getcwd()))
print os.getcwd()

/software/home/chenzh/software/jupyter/quoraDupli


In [2]:
df = pd.read_pickle(os.path.join('data', '2_word2vec_tfidf.pkl'))

In [3]:
df = df.reindex(np.random.permutation(df.index))

# set number of train and test instances
num_train = int(df.shape[0] * 0.88)
num_test = df.shape[0] - num_train                 
print("Number of training pairs: %i"%(num_train))
print("Number of testing pairs: %i"%(num_test))

# init data data arrays
X_train = np.zeros([num_train, 2, 300])
X_test  = np.zeros([num_test, 2, 300])
Y_train = np.zeros([num_train]) 
Y_test = np.zeros([num_test])

# format data 
b = [a[None,:] for a in list(df['q1_feats'].values)]
q1_feats = np.concatenate(b, axis=0)

b = [a[None,:] for a in list(df['q2_feats'].values)]
q2_feats = np.concatenate(b, axis=0)

# fill data arrays with features
X_train[:,0,:] = q1_feats[:num_train]
X_train[:,1,:] = q2_feats[:num_train]
Y_train = df[:num_train]['is_duplicate'].values
            
X_test[:,0,:] = q1_feats[num_train:]
X_test[:,1,:] = q2_feats[num_train:]
Y_test = df[num_train:]['is_duplicate'].values

# remove useless variables
del b
del q1_feats
del q2_feats

# preprocess data, unit std
X_train_norm = np.zeros_like(X_train)
d = (np.sum(X_train[:,0,:] ** 2, 1) ** (0.5))
X_train_norm[:,0,:] = (X_train[:,0,:].T / (d + 1e-8)).T
d = (np.sum(X_train[:,1,:] ** 2, 1) ** (0.5))
X_train_norm[:,1,:] = (X_train[:,1,:].T / (d + 1e-8)).T


X_test_norm = np.zeros_like(X_test)
d = (np.sum(X_test[:,0,:] ** 2, 1) ** (0.5))
X_test_norm[:,0,:] = (X_test[:,0,:].T / (d + 1e-8)).T
d = (np.sum(X_test[:,1,:] ** 2, 1) ** (0.5))
X_test_norm[:,1,:] = (X_test[:,1,:].T / (d + 1e-8)).T

Number of training pairs: 355775
Number of testing pairs: 48515


# siamese

In [13]:
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Lambda, merge, BatchNormalization, Activation, Input, Merge
from keras import backend as K


def euclidean_distance(vects):
    x, y = vects
    return K.sqrt(K.sum(K.square(x - y), axis=1, keepdims=True))

def eucl_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0], 1)

def cosine_distance(vests):
    x, y = vests
    x = K.l2_normalize(x, axis=-1)
    y = K.l2_normalize(y, axis=-1)
    return -K.mean(x * y, axis=-1, keepdims=True)

def cos_dist_output_shape(shapes):
    shape1, shape2 = shapes
    return (shape1[0],1)

def contrastive_loss(y_true, y_pred):
    '''Contrastive loss from Hadsell-et-al.'06
    http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
    '''
    margin = 1
    return K.mean(y_true * K.square(y_pred) + (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))


def create_base_network(input_dim):
    '''
    Base network for feature extraction.
    '''
    input = Input(shape=(input_dim, ))
    dense1 = Dense(128)(input)
    bn1 = BatchNormalization()(dense1)
#     bn1 = BatchNormalization(mode=2)(dense1)
    relu1 = Activation('relu')(bn1)

    dense2 = Dense(128)(relu1)
    bn2 = BatchNormalization()(dense2)
#     bn2 = BatchNormalization(mode=2)(dense2)
    res2 = merge([relu1, bn2], mode='sum')
    relu2 = Activation('relu')(res2)    

    dense3 = Dense(128)(relu2)
    bn3 = BatchNormalization()(dense3)
#     bn3 = BatchNormalization(mode=2)(dense3)
    res3 = Merge(mode='sum')([relu2, bn3])
    relu3 = Activation('relu')(res3)   
    
    feats = merge([relu3, relu2, relu1], mode='concat')
    bn4 = BatchNormalization()(feats)
#     bn4 = BatchNormalization(mode=2)(feats)

    model = Model(input=input, output=bn4)

    return model


def compute_accuracy(predictions, labels):
    '''
    Compute classification accuracy with a fixed threshold on distances.
    '''
    return labels[predictions.ravel() < 0.5].mean()

def create_network(input_dim):
    # network definition
    base_network = create_base_network(input_dim)
    
    input_a = Input(shape=(input_dim,))
    input_b = Input(shape=(input_dim,))
    
    # because we re-use the same instance `base_network`,
    # the weights of the network
    # will be shared across the two branches
    processed_a = base_network(input_a)
    processed_b = base_network(input_b)
    
    distance = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)([processed_a, processed_b])
    
    model = Model(input=[input_a, input_b], output=distance)
    return model

In [14]:
from keras.optimizers import RMSprop, SGD, Adam
net = create_network(300)

# train
#optimizer = SGD(lr=1, momentum=0.8, nesterov=True, decay=0.004)
optimizer = Adam(lr=0.001)
net.compile(loss=contrastive_loss, optimizer=optimizer)

for epoch in range(50):
    net.fit([X_train_norm[:,0,:], X_train_norm[:,1,:]], Y_train,
          validation_data=([X_test_norm[:,0,:], X_test_norm[:,1,:]], Y_test),
          batch_size=128, nb_epoch=1, shuffle=True, )
    
    # compute final accuracy on training and test sets
    pred = net.predict([X_test_norm[:,0,:], X_test_norm[:,1,:]], batch_size=128)
    te_acc = compute_accuracy(pred, Y_test)
    
#    print('* Accuracy on training set: %0.2f%%' % (100 * tr_acc))
    print('* Accuracy on test set: %0.2f%%' % (100 * te_acc))

  if sys.path[0] == '':


Train on 355775 samples, validate on 48515 samples
Epoch 1/1
* Accuracy on test set: 57.74%
Train on 355775 samples, validate on 48515 samples
Epoch 1/1
* Accuracy on test set: 64.09%
Train on 355775 samples, validate on 48515 samples
Epoch 1/1
* Accuracy on test set: 67.14%
Train on 355775 samples, validate on 48515 samples
Epoch 1/1
* Accuracy on test set: 68.41%
Train on 355775 samples, validate on 48515 samples
Epoch 1/1
* Accuracy on test set: 70.96%
Train on 355775 samples, validate on 48515 samples
Epoch 1/1
* Accuracy on test set: 71.01%
Train on 355775 samples, validate on 48515 samples
Epoch 1/1
* Accuracy on test set: 72.18%
Train on 355775 samples, validate on 48515 samples
Epoch 1/1
* Accuracy on test set: 72.47%
Train on 355775 samples, validate on 48515 samples
Epoch 1/1
* Accuracy on test set: 72.30%
Train on 355775 samples, validate on 48515 samples
Epoch 1/1
* Accuracy on test set: 73.70%
Train on 355775 samples, validate on 48515 samples
Epoch 1/1
* Accuracy on test 

* Accuracy on test set: 75.38%
Train on 355775 samples, validate on 48515 samples
Epoch 1/1
* Accuracy on test set: 74.39%
Train on 355775 samples, validate on 48515 samples
Epoch 1/1
* Accuracy on test set: 74.91%
Train on 355775 samples, validate on 48515 samples
Epoch 1/1
* Accuracy on test set: 75.05%
Train on 355775 samples, validate on 48515 samples
Epoch 1/1
* Accuracy on test set: 75.79%
