In [1]:
import numpy as np
from scipy.sparse import csc_matrix
from scipy.sparse import hstack

import gc

In [1]:
from pandas import Series

In [2]:
# Build the demo set with a set literal and display it.
s = {1, 2, 3, 4, 5}
s

{1, 2, 3, 4, 5}

In [5]:
Series(list(s))

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [6]:
s2 = set([4, 5, 6, 7, 8])

In [8]:
s |= s2
s

{1, 2, 3, 4, 5, 6, 7, 8}

In [2]:
# Small CSC example matrices used throughout the notebook: all three have
# 3 rows, with 2 (A), 3 (B) and 4 (C) columns respectively.
A = csc_matrix([[1, 2], [0, 0], [4, 0]])
B = csc_matrix([[1, 2, 0], [0, 0, 3], [4, 0, 5]])
C = csc_matrix([[1, 2, 0, 2], [0, 0, 3, 3], [4, 0, 5, 4]])

In [26]:
from scipy.sparse import isspmatrix_csc

In [27]:
isspmatrix_csc(A)

True

In [29]:
# Print 'True' when A is stored in CSC format, 'False' otherwise.
print('True' if isspmatrix_csc(A) else 'False')

True


In [33]:
def concatenate_csc_matrices_by_columns(m1, m2):
    """Horizontally stack two sparse matrices by splicing their CSC arrays.

    The columns of ``m2`` are appended to the right of the columns of ``m1``
    by concatenating the underlying ``data`` / ``indices`` arrays and
    offsetting ``m2``'s column pointers by the number of entries stored in
    ``m1``.

    Parameters
    ----------
    m1, m2 : scipy sparse matrix
        Matrices with the same number of rows. Inputs that are not already in
        CSC format are converted with ``tocsc()``.

    Returns
    -------
    scipy.sparse.csc_matrix
        Matrix of shape ``(n_rows, m1.shape[1] + m2.shape[1])``.

    Raises
    ------
    ValueError
        If the two matrices do not have the same number of rows.
    """
    # The array splicing below is only valid for CSC storage; tocsc() hands
    # back the same matrix when the input is already CSC.
    m1 = m1.tocsc()
    m2 = m2.tocsc()

    n_rows = m1.shape[0]
    if m2.shape[0] != n_rows:
        raise ValueError(
            'row counts differ: %d vs %d' % (n_rows, m2.shape[0])
        )

    # indptr[-1] is the number of entries the matrix actually references;
    # slicing with it guards against spare capacity at the end of data/indices.
    nnz1 = int(m1.indptr[-1])
    nnz2 = int(m2.indptr[-1])

    data = np.concatenate((m1.data[:nnz1], m2.data[:nnz2]))
    indices = np.concatenate((m1.indices[:nnz1], m2.indices[:nnz2]))

    # Drop m2's leading 0 pointer and shift the rest past m1's entries.
    # Widen to int64 first so the offset cannot overflow 32-bit pointers.
    shifted_indptr = m2.indptr[1:].astype(np.int64) + nnz1
    indptr = np.concatenate((m1.indptr, shifted_indptr))

    # Pass the shape explicitly: without it the row count is inferred from the
    # largest stored row index, which silently drops trailing all-zero rows.
    n_cols = m1.shape[1] + m2.shape[1]
    return csc_matrix((data, indices, indptr), shape=(n_rows, n_cols))

In [30]:
def gen_interactive_term(A, B):
    """Build pairwise interaction columns between two sparse matrices.

    For every column ``i`` of ``A``, ``B`` is multiplied element-wise by that
    column (the column is broadcast across the columns of ``B``). The
    per-column products are then stacked side by side in column order.

    Pass the matrix with fewer columns as ``A``: the function iterates once
    per column of ``A``, so that ordering is cheaper.

    All-zero columns are kept in the result; filter afterwards with
    ``res[:, res.getnnz(0) > 0]`` if they are not wanted.

    Parameters
    ----------
    A, B : scipy sparse matrix
        Matrices with the same number of rows.

    Returns
    -------
    scipy.sparse.csc_matrix
        Matrix of shape ``(n_rows, A.shape[1] * B.shape[1])``.
    """
    n_cols = A.shape[1]
    if n_cols == 0:
        # Nothing to interact with: return an empty block instead of failing
        # on A[:, 0].
        return csc_matrix((B.shape[0], 0), dtype=np.result_type(A.dtype, B.dtype))

    # Collect every product first and stack once; growing the result pairwise
    # inside the loop would re-copy the accumulated arrays on every iteration.
    products = [B.multiply(A[:, i]) for i in range(n_cols)]
    return hstack(products, format='csc')

In [35]:
res = gen_interactive_term(A, B)

In [None]:
# Display the interaction-term matrix; the bare name `re` is never defined in
# this notebook and would raise NameError on a full run.
res

In [11]:
np.shape(res)

(3, 6)

In [13]:
A.toarray()

array([[1, 2],
       [0, 0],
       [4, 0]], dtype=int64)

In [14]:
B.toarray()

array([[1, 2, 0],
       [0, 0, 3],
       [4, 0, 5]], dtype=int64)

In [12]:
res.toarray()

array([[ 1,  0, 16,  2,  0,  0],
       [ 2,  0,  0,  4,  0,  0],
       [ 0,  0, 20,  0,  0,  0]], dtype=int64)

In [36]:
res.toarray()

array([[ 1,  2,  0,  2,  4,  0],
       [ 0,  0,  0,  0,  0,  0],
       [16,  0, 20,  0,  0,  0]], dtype=int64)

In [19]:
B.multiply(A[:, 0])

<3x3 sparse matrix of type '<class 'numpy.int64'>'
	with 4 stored elements in Compressed Sparse Row format>

In [8]:
concatenate_csc_matrices_by_columns(A, B)

<3x5 sparse matrix of type '<class 'numpy.int64'>'
	with 8 stored elements in Compressed Sparse Column format>

In [7]:
res

<3x3 sparse matrix of type '<class 'numpy.int64'>'
	with 6 stored elements in Compressed Sparse Column format>

In [6]:
A.toarray()

array([[1, 2],
       [0, 0],
       [4, 0]], dtype=int64)

In [7]:
B.toarray()

array([[1, 2, 0],
       [0, 0, 3],
       [4, 0, 5]], dtype=int64)

In [11]:
tmp = concatenate_csc_matrices_by_columns(A, B)

In [12]:
tmp.toarray()

array([[1, 2, 1, 2, 0],
       [0, 0, 0, 0, 3],
       [4, 0, 4, 0, 5]], dtype=int64)

In [4]:
A[:, 1]

<3x1 sparse matrix of type '<class 'numpy.int64'>'
	with 1 stored elements in Compressed Sparse Row format>

In [7]:
range(np.shape(C)[1])

range(0, 3)

(3, 4)

In [14]:
# For every column of C, collect its element-wise product with A and with B
# (one sparse matrix per column of C in each list).
C_A_list = []
C_B_list = []
for i in range(np.shape(C)[1]):
    # getcol(i) extracts column i of C as its own sparse matrix.
    col = C.getcol(i)
    C_A_list.append(col.multiply(A))
    C_B_list.append(col.multiply(B))

In [16]:
C_A = hstack(C_A_list, format='csr')

In [20]:
C_B = hstack(C_B_list, format='csr')

In [22]:
C_B.toarray()

array([[ 1,  2,  0,  2,  4,  0,  0,  0,  0,  2,  4,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  9,  0,  0,  9],
       [16,  0, 20,  0,  0,  0, 20,  0, 25, 16,  0, 20]], dtype=int64)

In [23]:
C_A_list.extend(C_B_list)
C_A_list

[<3x2 sparse matrix of type '<class 'numpy.int64'>'
 	with 3 stored elements in Compressed Sparse Row format>,
 <3x2 sparse matrix of type '<class 'numpy.int64'>'
 	with 2 stored elements in Compressed Sparse Row format>,
 <3x2 sparse matrix of type '<class 'numpy.int64'>'
 	with 1 stored elements in Compressed Sparse Row format>,
 <3x2 sparse matrix of type '<class 'numpy.int64'>'
 	with 3 stored elements in Compressed Sparse Row format>,
 <3x3 sparse matrix of type '<class 'numpy.int64'>'
 	with 4 stored elements in Compressed Sparse Row format>,
 <3x3 sparse matrix of type '<class 'numpy.int64'>'
 	with 2 stored elements in Compressed Sparse Row format>,
 <3x3 sparse matrix of type '<class 'numpy.int64'>'
 	with 3 stored elements in Compressed Sparse Row format>,
 <3x3 sparse matrix of type '<class 'numpy.int64'>'
 	with 5 stored elements in Compressed Sparse Row format>]

In [29]:
def interactive_term_3group(A, B, C):
    """Stack the pairwise interaction terms of three sparse matrices.

    The result is the horizontal concatenation, in this order, of:
    every column of ``A`` multiplied element-wise with ``B``, every column of
    ``A`` multiplied with ``C``, and every column of ``B`` multiplied with
    ``C``.

    The function iterates over the columns of ``A`` and ``B``, so pass the
    matrix with the fewest columns as ``A`` and the next fewest as ``B``.

    Returns a sparse matrix in CSR format.
    """
    from scipy.sparse import hstack

    a_columns = [A.getcol(j) for j in range(np.shape(A)[1])]
    b_columns = [B.getcol(j) for j in range(np.shape(B)[1])]

    a_times_b = [column.multiply(B) for column in a_columns]
    a_times_c = [column.multiply(C) for column in a_columns]
    b_times_c = [column.multiply(C) for column in b_columns]

    return hstack(a_times_b + a_times_c + b_times_c, format='csr')

In [31]:
interactive_term = interactive_term_3group(A, B, C)

In [32]:
interactive_term

<3x26 sparse matrix of type '<class 'numpy.int64'>'
	with 29 stored elements in Compressed Sparse Row format>

In [33]:
interactive_term.toarray()

array([[ 1,  2,  0,  2,  4,  0,  1,  2,  0,  2,  2,  4,  0,  4,  1,  2,  0,
         2,  2,  4,  0,  4,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  9,  9],
       [16,  0, 20,  0,  0,  0, 16,  0, 20, 16,  0,  0,  0,  0, 16,  0, 20,
        16,  0,  0,  0,  0, 20,  0, 25, 20]], dtype=int64)

In [1]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()
enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])  


# NOTE(review): `n_values_` and `feature_indices_` look like legacy
# OneHotEncoder attributes (believed deprecated in scikit-learn 0.20 and
# removed in 0.22) — confirm against the installed version; `categories_` is
# the likely replacement on newer releases.
enc.n_values_

enc.feature_indices_

enc.transform([[0, 1, 1]]).toarray()

array([[ 1.,  0.,  0.,  1.,  0.,  0.,  1.,  0.,  0.]])

In [2]:
res = enc.transform([[0, 1, 1]])

In [3]:
type(res)

scipy.sparse.csr.csr_matrix

In [4]:
res.tocsc()

<1x9 sparse matrix of type '<class 'numpy.float64'>'
	with 3 stored elements in Compressed Sparse Column format>

In [5]:
OneHotEncoder?