In [1]:
import numpy as np
from docopt import docopt
from scipy import sparse
import os
import subprocess
from shutil import copyfile
from operator import itemgetter

In [2]:
'''
Workflow of the verify_implementation script

Verify glove bias
1. Build the W and H matrices and the bias terms. Then multiply W and H, add the bias terms, and use the result as the question matrix.
2. Store the nnz terms and meta information of the question matrix in ./matrix_folder.
3. (This step is not important, because the weight is turned off when testing the bias term.)
   Build a weight matrix and store its nnz terms and meta information in ./count_folder.
4. Modify the code so that it outputs the bias term when it factorizes the matrix.
5. Read in the answer matrix and test whether the factorization outputs the bias term.

Verify glove weight
1. Build the W and H matrices, multiply W and H, and use the result as the question matrix.
2. Build a weight matrix, assign one position a very high weight, and store its nnz terms and meta file in ./count_folder.
3. Turn glove weight on, turn glove bias off, and factorize the matrix.
4. Read in the answer matrix and check the error at the position with the high weight.

'''

'\nWorkflow of the verify_implementation script\n\nVerify glove bias\n1. Build the W and H matrices and the bias terms. Then multiply W and H, add the bias terms, and use the result as the question matrix.\n2. Store the nnz terms and meta information of the question matrix in ./matrix_folder.\n3. (This step is not important, because the weight is turned off when testing the bias term.)\n   Build a weight matrix and store its nnz terms and meta information in ./count_folder.\n4. Modify the code so that it outputs the bias term when it factorizes the matrix.\n5. Read in the answer matrix and test whether the factorization outputs the bias term.\n\nVerify glove weight\n1. Build the W and H matrices, multiply W and H, and use the result as the question matrix.\n2. Build a weight matrix, assign one position a very high weight, and store its nnz terms and meta file in ./count_folder.\n3. Turn glove weight on, turn glove bias off, and factorize the matrix.\n4. Read in the answer matrix and check the error at the position with the high weight.\n\n'

In [66]:
def build_answer_and_question(matrix_size, embedding_rank, include_bias = 0, low_rank = False):
    '''
    Draw random factors and biases and build the "question" matrix to factorize.

    input: matrix_size = the size of the (square) question matrix
           (how many words in total)
    input: embedding_rank = number of columns of W and number of rows of H
    input: include_bias (default = 0); set it to 1 to add the bias terms to the
           question matrix. W_bias and H_bias are always drawn and returned, but
           they only contribute to question_matrix when include_bias is truthy;
           otherwise they can be ignored.
    input: low_rank (default = False)
           False: the base of the question matrix is an independent uniform
                  random matrix, so W and H are NOT a factorization of it.
           True:  the base of the question matrix is np.dot(W, H).

    output: W, H, W_bias, H_bias, question_matrix; all of them are np arrays
            W: matrix_size * embedding_rank
            H: embedding_rank * matrix_size
            W_bias: matrix_size
            H_bias: matrix_size
            question_matrix: matrix_size * matrix_size, with
                question_matrix[i][j] = base[i][j]
                                        (+ W_bias[i] + H_bias[j] if include_bias)
            where base is np.dot(W, H) when low_rank is True and a random matrix
            otherwise.
    '''
    W = np.random.random((matrix_size, embedding_rank))
    H = np.random.random((embedding_rank, matrix_size))

    W_bias = np.random.random(matrix_size)
    H_bias = np.random.random(matrix_size)

    # The random base is always drawn, even when it is replaced below, so the
    # sequence of random draws does not depend on low_rank.
    question_matrix = np.random.random((matrix_size, matrix_size))
    if low_rank:
        question_matrix = np.dot(W, H)

    if include_bias:
        # Message text (including its spelling) is kept as it was originally emitted.
        print("inlcude_bias is on")
        # Broadcasting: W_bias[i] is added along row i, then H_bias[j] along
        # column j -- the same order of additions as an explicit row/column loop.
        question_matrix = question_matrix + W_bias[:, np.newaxis] + H_bias[np.newaxis, :]

    return W, H, W_bias, H_bias, question_matrix

In [39]:
def build_weight_matrix(matrix_size, weight_for_exception_term):
    '''
    Build a square weight matrix that is 1 everywhere except at position [1][1].

    input: matrix_size, side length of the matrix_size * matrix_size weight matrix
    input: weight_for_exception_term, the weight stored at the exception
           position [1][1]

    output: a matrix_size * matrix_size np array of ones whose [1][1] entry is
            weight_for_exception_term
    '''
    shape = (matrix_size, matrix_size)
    weights = np.ones(shape)
    # Row 1 / column 1 (0-based) is the single exception position.
    weights[1, 1] = weight_for_exception_term
    return weights

In [5]:
def save_nonzero_term_fast(nparray, path_and_name):
    '''
    Write the nonzero entries of nparray to the text file path_and_name.

    input: nparray, a np array (it is converted to a scipy CSR matrix)
    input: path_and_name, the file to write (opened in 'w' mode)

    One line is written per stored entry, row by row, in the format:
    x_coordinate y_coordinate value
    Please note: in the written file x_coordinate and y_coordinate start from 1,
    not 0; 1 is added to each entry's row and column index. The value is
    formatted with six decimals.
    '''
    sparse_rows = sparse.csr_matrix(nparray)
    row_bounds = sparse_rows.indptr
    with open(path_and_name, 'w') as out_file:
        # indptr[r]:indptr[r+1] delimits the stored entries of row r.
        for row, (start, stop) in enumerate(zip(row_bounds[:-1], row_bounds[1:])):
            cols = sparse_rows.indices[start:stop]
            vals = sparse_rows.data[start:stop]
            out_file.writelines(
                "%d %d %.6f\n" % (row + 1, col + 1, val)
                for col, val in zip(cols, vals)
            )

In [6]:
def create_meta_file(nparray, path_and_name):
    '''
    Write the meta file describing nparray to path_and_name.

    input: nparray, a np array (it is converted to a scipy CSR matrix)
    input: path_and_name, the file to write (opened in 'w' mode)

    The file has the following format:
    number_of_rows number_of_columns
    nnz_number training.ratings
    nnz_number test.ratings
    for example:
    71290 71290
    1289567 training.ratings
    1289567 test.ratings

    The first line carries the real row and column counts, so a non-square
    matrix is described correctly; for a square matrix both numbers are the
    matrix size. The same nnz count is written for both ratings files. No
    newline is written after the last line.
    '''
    csrmatrix = sparse.csr_matrix(nparray)
    n_rows, n_cols = csrmatrix.shape
    nnz_number = csrmatrix.getnnz()

    with open(path_and_name, 'w') as f:
        f.write("%d %d\n" % (n_rows, n_cols))
        f.write("%d %s\n" % (nnz_number, 'training.ratings'))
        f.write("%d %s" % (nnz_number, 'test.ratings'))

In [7]:
def read_answer(path_and_name):
    '''
    Load the left and right factor matrices stored next to path_and_name.

    input: path_and_name, common prefix of the two files; this function reads
           path_and_name + '.W' and path_and_name + '.H' with np.loadtxt

    output: left matrix a and right matrix b, both np arrays
            left matrix a: the content of the '.W' file as stored
                           (expected: matrix_size * embedding_rank)
            right matrix b: the TRANSPOSE of the content of the '.H' file
                            (expected: embedding_rank * matrix_size)
    '''
    left_factor = np.loadtxt(path_and_name + '.W')
    # The '.H' file is transposed so that np.dot(a, b) can be formed directly.
    right_factor = np.loadtxt(path_and_name + '.H').T
    return left_factor, right_factor

In [92]:
# Verify glove weight
# - set include_bias to 0 when building the question matrix
# - save the nnz terms and the meta information of the question matrix
# - build the weight matrix and save the corresponding information, assigning
#   a very high weight to the [1][1] position
# - factorize the matrix with G (glove_bias) set to 0, W (glove_weight) set to 1
#   and x_max set to a very high number
# - read in and delete the answer matrix
# - look at the difference at every position between the original matrix and
#   the rebuilt matrix
#
# This cell only builds and stores the question matrix; the following cell
# stores the weight matrix, runs the factorization and prints the differences.

matrix_size = 10
rank = 3
x_max_in_setting = 100

# include_bias = 0: the bias vectors are returned but not added to question_matrix.
W, H, W_bias, H_bias, question_matrix = build_answer_and_question(matrix_size, rank, 0)

create_meta_file(question_matrix, "./matrix_folder/meta")
# The same nonzero entries are written as both the training and the test set.
for ratings_path in ("./matrix_folder/training.ratings", "./matrix_folder/test.ratings"):
    save_nonzero_term_fast(question_matrix, ratings_path)





In [93]:
# Weight ("count") matrix: 1 everywhere except [1][1], which gets weight 100.
weight_matrix = build_weight_matrix(matrix_size, 100)
create_meta_file(weight_matrix, "./count_folder/meta")
save_nonzero_term_fast(weight_matrix, "./count_folder/training.ratings")
save_nonzero_term_fast(weight_matrix, "./count_folder/test.ratings")

# Build the tools and convert both folders before training.
subprocess.check_output(["make"])
subprocess.check_output(["./converter", "./matrix_folder"])
subprocess.check_output(["./converter", "./count_folder"])
# Per the notes for this experiment: -W 1 turns glove weight on, -G 0 turns
# glove bias off, -X is x_max. NOTE(review): -k presumably is the embedding
# rank; the meaning of the remaining flags is not visible here -- confirm
# against the trainer's usage text.
print(subprocess.check_output(["./omp-pmf-train", "-s", "10", "-n", "10", "-f", "1", "-t", "1000", "-q", "1", "-p", "0", "-r", "0.015625", "-l", "0.000000", "-b", "0", "-k", str(rank), "-E", "0", "-X", str(x_max_in_setting), "-W", "1", "-G", "0", "matrix_folder", "count_folder", "test_code"]))


# Must mirror the -l / -r / -t / -W / -X / -G values passed to the trainer above.
output_file_name = 'test_code-l0.000000-r0.015625-iter1000-gweight1-xmax'+str(x_max_in_setting)+'-gbias0.final'

W_answer, H_answer = read_answer(output_file_name)
# Delete the factor files once loaded; os.remove avoids spawning an external
# `rm` process and does not depend on that command being available.
os.remove(output_file_name + '.W')
os.remove(output_file_name + '.H')

# print question_matrix
# Element-wise residual between the question matrix and its reconstruction.
print(np.subtract(question_matrix, np.dot(W_answer, H_answer)))

# print W_answer 1: 3.13123738e-02  100: 2.87885646e-02  10000: 2.86600946e-02   1000000: 2.86559836e-02   
# print H_answer

now implementing glove weight
x_max is 100
count_test_set, 1.000000
test_set, 0.031623
count_test_set, 1.000000
test_set, 0.031623
count_test_set, 1.000000
test_set, 0.031623
count_test_set, 1.000000
test_set, 0.031623
count_test_set, 1.000000
test_set, 0.031623
count_test_set, 1.000000
test_set, 0.031623
count_test_set, 1.000000
test_set, 0.031623
count_test_set, 1.000000
test_set, 0.031623
count_test_set, 1.000000
test_set, 0.031623
count_test_set, 1.000000
test_set, 0.031623
count_test_set, 1.000000
test_set, 0.031623
count_test_set, 100.000000
test_set, 1.000000
count_test_set, 1.000000
test_set, 0.031623
count_test_set, 1.000000
test_set, 0.031623
count_test_set, 1.000000
test_set, 0.031623
count_test_set, 1.000000
test_set, 0.031623
count_test_set, 1.000000
test_set, 0.031623
count_test_set, 1.000000
test_set, 0.031623
count_test_set, 1.000000
test_set, 0.031623
count_test_set, 1.000000
test_set, 0.031623
count_test_set, 1.000000
test_set, 0.031623
count_test_set, 1.000000
test_s

In [54]:
# Verify glove bias
# - set include_bias to 1 when building the question matrix
# - save the nnz terms and the meta information of the question matrix
# - build the weight matrix and save the corresponding information
#   (this step is useless here: glove weight is turned off)
# - factorize the matrix with W (glove_weight) set to 0 and glove bias on
#   NOTE(review): the original note said G (glove_bias) = 1, but the command
#   below passes "-G", "2" and the output file name contains "gbias2" -- confirm
#   which value is intended.
# - read in and delete the answer matrix
# - look at the difference at every position between the original matrix and
#   the rebuilt matrix
# - look at the difference between the original bias and the bias in the answer

# def main():
#     args = docopt("""
#     Usage: 
#         verify_inner_product.py <representation_shared_path>
#     """)

matrix_size = 10
rank = 5
W, H, W_bias, H_bias, question_matrix = build_answer_and_question(matrix_size, rank, 1)

# print question_matrix

create_meta_file(question_matrix, "./matrix_folder/meta")
save_nonzero_term_fast(question_matrix, "./matrix_folder/training.ratings")
save_nonzero_term_fast(question_matrix, "./matrix_folder/test.ratings")

# The count folder reuses the question matrix as a placeholder weight matrix.
create_meta_file(question_matrix, "./count_folder/meta")
save_nonzero_term_fast(question_matrix, "./count_folder/training.ratings")
# Fixed: this used to rewrite ./matrix_folder/test.ratings a second time, so
# ./count_folder/test.ratings was never refreshed and could be stale or missing.
save_nonzero_term_fast(question_matrix, "./count_folder/test.ratings")

subprocess.check_output(["make"])
subprocess.check_output(["./converter", "./matrix_folder"])
subprocess.check_output(["./converter", "./count_folder"])

subprocess.check_output(["./omp-pmf-train", "-s", "10", "-n", "10", "-f", "1", "-t", "1000", "-q", "1", "-p", "0", "-r", "0.015625", "-l", "0.000000", "-b", "0", "-k", str(rank), "-E", "0", "-X", "1", "-W", "0", "-G", "2", "matrix_folder", "count_folder", "test_code"])

# Must mirror the -l / -r / -t / -W / -X / -G values passed to the trainer above.
output_file_name = 'test_code-l0.000000-r0.015625-iter1000-gweight0-xmax1-gbias2.final'

W_answer, H_answer = read_answer(output_file_name)
# Delete the factor files once loaded; os.remove avoids spawning an external
# `rm` process and does not depend on that command being available.
os.remove(output_file_name + '.W')
os.remove(output_file_name + '.H')

# The learned W bias is read from column rank+1 of W_answer and the learned H
# bias from row rank of H_answer.
print(W_bias)
print(W_answer[:,rank+1])

print(H_bias)
print(H_answer[rank,:])

# (index, value) pairs sorted by value, for comparing the ordering of the
# original biases with the ordering of the learned ones.
W_bias_dict = dict(enumerate(W_bias))
sort_W_bias_dict = sorted(W_bias_dict.items(), key=itemgetter(1))
# print sort_W_bias_dict

W_answer_dict = dict(enumerate(W_answer[:,rank+1]))
sort_W_answer_dict = sorted(W_answer_dict.items(), key=itemgetter(1))
# print sort_W_answer_dict

H_bias_dict = dict(enumerate(H_bias))
sort_H_bias_dict = sorted(H_bias_dict.items(), key=itemgetter(1))
# print H_bias_dict

H_answer_dict = dict(enumerate(H_answer[rank,:]))
sort_H_answer_dict = sorted(H_answer_dict.items(), key=itemgetter(1))
# print H_answer_dict

# print np.subtract(W_bias, W_answer[:,rank+1])
# print np.subtract(H_bias, H_answer[rank,:])


inlcude_bias is on
[ 0.50483846  0.46153753  0.3041392   0.52846059  0.83894381  0.54996912
  0.54227667  0.07156276  0.08002972  0.22675783]
[ 0.015176  0.105873 -0.077735  0.065187  0.228362  0.272595 -0.049881
 -0.104416 -0.412603 -0.042557]
[ 0.55990422  0.6045665   0.58187875  0.28037113  0.93925072  0.58113221
  0.01928283  0.57727253  0.43355802  0.99357643]
[-0.103281  0.177784  0.265607 -0.242932  0.046061  0.089364 -0.294318
  0.099466 -0.134189  0.035736]


In [55]:
print W_answer

[[  2.68070000e-01   2.42996000e-01   8.24862000e-01  -7.13419100e+00
    5.20411000e-01   1.00000000e+00   1.51760000e-02]
 [  2.62737000e-01  -5.77022000e-01  -6.89594000e-01  -1.05554200e+00
   -2.93052000e-01   1.00000000e+00   1.05873000e-01]
 [  2.97821000e-01   6.60011000e-01   9.27858000e-01   4.36570000e-01
   -6.77546000e-01   1.00000000e+00  -7.77350000e-02]
 [  3.11156000e-01   1.10011000e+00  -9.89000000e-04   4.58301300e+00
   -3.63490000e-02   1.00000000e+00   6.51870000e-02]
 [  2.90982000e-01  -1.96420000e-01  -1.08883400e+00  -7.58883300e+00
   -6.56561000e-01   1.00000000e+00   2.28362000e-01]
 [  3.19048000e-01  -3.71546000e-01   1.92633000e-01   8.13412700e+00
   -2.00920000e-02   1.00000000e+00   2.72595000e-01]
 [  2.97943000e-01   1.29774000e-01  -1.02100000e+00  -1.27247900e+00
    1.35118900e+00   1.00000000e+00  -4.98810000e-02]
 [  2.87585000e-01  -9.57095000e-01   1.15519700e+00   3.21094400e+00
    1.23418000e-01   1.00000000e+00  -1.04416000e-01]
 [  3.17

In [58]:
print question_matrix

[[ 2.29833695  2.05585971  2.24916218  1.93079167  2.8221014   1.94877134
   1.55434096  1.99563405  1.79474122  2.26644398]
 [ 2.27573573  2.04145177  2.39271945  2.16560011  2.79428194  2.4099812
   1.36456677  1.86250229  2.05183639  1.99780283]
 [ 2.67933181  2.32302593  2.42705327  2.124372    2.73797878  2.07728128
   1.67517243  2.23004     1.78032925  2.30485924]
 [ 2.87799219  2.63815666  2.7810083   2.22589334  2.74879052  2.36079555
   2.05716925  2.52882167  2.17951133  2.56072519]
 [ 2.69948345  2.371978    2.58895387  2.44152727  3.07062658  2.74451658
   1.6426979   2.22075584  2.45528412  2.58597297]
 [ 3.01359168  2.70120039  3.14664821  2.81275905  3.47305469  2.84842195
   2.08512568  2.50837256  2.45208962  2.38883374]
 [ 2.31502528  2.19365991  2.58751401  2.03249818  2.82985096  2.36621069
   1.73935924  2.05663896  2.29115818  2.27647949]
 [ 2.3140093   1.97869139  2.45982537  2.29256551  3.16247964  2.16983438
   1.39860694  1.8129571   1.73336439  1.74820206]
 

In [57]:
print np.subtract(question_matrix, np.dot(W_answer, H_answer))

[[  1.34356079e-03  -1.23563989e-03  -1.17622168e-03   9.67719268e-04
   -4.21793107e-04  -7.51640562e-04   1.66699693e-03  -9.81217156e-04
    1.09916898e-03  -5.04853662e-04]
 [ -4.96376052e-03   4.06835939e-03   3.30467428e-03  -2.52318701e-03
    1.18195090e-03   2.59822676e-03  -5.00696662e-03   3.44860235e-03
   -3.74250049e-03   1.64360849e-03]
 [  1.75714376e-03  -1.20611781e-03  -6.99622192e-04   4.23309397e-04
   -2.46613301e-04  -8.32164526e-04   1.25760834e-03  -1.13666459e-03
    1.17537233e-03  -4.74005618e-04]
 [ -2.09978638e-03   1.44645063e-03   8.42118426e-04  -5.02437963e-04
    3.01554316e-04   9.99301413e-04  -1.50116886e-03   1.36308361e-03
   -1.40203619e-03   5.72339982e-04]
 [  7.55113089e-04  -1.52132567e-03  -2.34004186e-03   2.23462665e-03
   -8.45804254e-04  -7.26072312e-04   2.78947345e-03  -8.44885884e-04
    1.15308488e-03  -6.60677315e-04]
 [ -9.65652183e-04  -3.37176845e-04  -1.62933551e-03   1.79341838e-03
   -5.93169688e-04   8.68259718e-05   1.52842