In [22]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
from numpy.linalg import norm
from collections import Counter, defaultdict
from scipy.sparse import csr_matrix
from nltk.corpus import stopwords
import string

In [23]:
df = pd.read_csv(
    filepath_or_buffer='data/train.dat', 
    header=None, 
    sep='\n')
df = pd.DataFrame(df[0].str.split('\t', 1).tolist())
statements = df[1]
classes = df[0]
print(statements)

0        Catheterization laboratory events and hospital...
1        Renal abscess in children. Three cases of rena...
2        Hyperplastic polyps seen at sigmoidoscopy are ...
3        Subclavian artery to innominate vein fistula a...
4        Effect of local inhibition of gamma-aminobutyr...
5        Infection during chronic epidural catheterizat...
6        Mediastinal tracheostomy using a pectoralis ma...
7        Tumefactive fibroinflammatory lesion of the ex...
8        Multiple representations contribute to body kn...
9        Increasing asthma prevalence in a rural New Ze...
10       Usefulness of the automatic implantable cardio...
11       Stress-related mucosal damage: review of drug ...
12       A biphasic pattern of anti-pre-S responses in ...
13       A controlled trial comparing vidarabine with a...
14       Pentostatin induces durable remissions in hair...
15       Cytologic features of poorly differentiated 'i...
16       Cutaneous manifestations of multiple myeloma. .

In [38]:
def cmer(name, c=3):
    r""" Given a name and parameter c, return the vector of c-mers associated with the name
    """
    name = name.lower()
    s=set(stopwords.words('english'))
    table = str.maketrans('', '', string.punctuation)
    aaj= list(filter(lambda w: not w in s,name.split()))
    stripped = [w.translate(table) for w in aaj]
#     print(stripped)
    v = []
    
#   find substrings of lenght "c" from every position in the string
    for i in range(0, len(stripped) - c + 1):
        v.append(stripped[i:i+c])
    
#     print(v)
    return v

In [41]:
def build_matrix(docs):
    r""" Build sparse matrix from a list of documents,
    each of which is a list of word/terms in the document.  
    """
    print(docs)
    nrows = len(docs)
    idx = {}
    tid = 0
    nnz = 0
    for d in docs:
        nnz += len(set(d))
        for w in d:
            if w not in idx:
                idx[w] = tid
                tid += 1
    ncols = len(idx)
        
    # set up memory
    ind = np.zeros(nnz, dtype=np.int)
    val = np.zeros(nnz, dtype=np.double)
    ptr = np.zeros(nrows+1, dtype=np.int)
    i = 0  # document ID / row counter
    n = 0  # non-zero counter
    # transfer values
    for d in docs:
        cnt = Counter(d)
        keys = list(k for k,_ in cnt.most_common())
        l = len(keys)
        for j,k in enumerate(keys):
            print(j)
            print(k)
            ind[j+n] = idx[k]
            val[j+n] = cnt[k]
        ptr[i+1] = ptr[i] + l
        n += l
        i += 1
            
#     print(ind[:20])        
    mat = csr_matrix((val, ind, ptr), shape=(nrows, ncols), dtype=np.double)
    mat.sort_indices()
    
    return mat

def csr_info(mat, name="", non_empy=False):
    r""" Print out info about this CSR matrix. If non_empy, 
    report number of non-empty rows and cols as well
    """
    if non_empy:
        print("%s [nrows %d (%d non-empty), ncols %d (%d non-empty), nnz %d]" % (
                name, mat.shape[0], 
                sum(1 if mat.indptr[i+1] > mat.indptr[i] else 0 
                for i in range(mat.shape[0])), 
                mat.shape[1], len(np.unique(mat.indices)), 
                len(mat.data)))
    else:
        print( "%s [nrows %d, ncols %d, nnz %d]" % (name, 
                mat.shape[0], mat.shape[1], len(mat.data)) )

def csr_l2normalize(mat, copy=False, **kargs):
    r""" Normalize the rows of a CSR matrix by their L-2 norm. 
    If copy is True, returns a copy of the normalized matrix.
    """
    if copy is True:
        mat = mat.copy()
    nrows = mat.shape[0]
    nnz = mat.nnz
    ind, val, ptr = mat.indices, mat.data, mat.indptr
    # normalize
    for i in range(nrows):
        rsum = 0.0    
        for j in range(ptr[i], ptr[i+1]):
            rsum += val[j]**2
        if rsum == 0.0:
            continue  # do not normalize empty rows
        rsum = 1.0/np.sqrt(rsum)
        for j in range(ptr[i], ptr[i+1]):
            val[j] *= rsum
            
    if copy is True:
        return mat
        
def namesToMatrix(names, c):
#     print(names)
    docs = [cmer(n, c) for n in names]
    return build_matrix(docs)


In [42]:
mat = namesToMatrix(statements,5)
csr_info(mat,'For c=' + str(5))

[[['catheterization', 'laboratory', 'events', 'hospital', 'outcome'], ['laboratory', 'events', 'hospital', 'outcome', 'direct'], ['events', 'hospital', 'outcome', 'direct', 'angioplasty'], ['hospital', 'outcome', 'direct', 'angioplasty', 'acute'], ['outcome', 'direct', 'angioplasty', 'acute', 'myocardial'], ['direct', 'angioplasty', 'acute', 'myocardial', 'infarction'], ['angioplasty', 'acute', 'myocardial', 'infarction', 'assess'], ['acute', 'myocardial', 'infarction', 'assess', 'safety'], ['myocardial', 'infarction', 'assess', 'safety', 'direct'], ['infarction', 'assess', 'safety', 'direct', 'infarct'], ['assess', 'safety', 'direct', 'infarct', 'angioplasty'], ['safety', 'direct', 'infarct', 'angioplasty', 'without'], ['direct', 'infarct', 'angioplasty', 'without', 'antecedent'], ['infarct', 'angioplasty', 'without', 'antecedent', 'thrombolytic'], ['angioplasty', 'without', 'antecedent', 'thrombolytic', 'therapy'], ['without', 'antecedent', 'thrombolytic', 'therapy', 'catheterization

TypeError: unhashable type: 'list'