In [1]:
# https://github.com/gcdart/MulticlassClassifier/blob/master/src/ml/LogisticRegression.java
# https://www.kaggle.com/c/lshtc/discussion/6911#38233 - preprocessing: multilabels comma should not have spaces
# https://www.kaggle.com/c/lshtc/discussion/14048 - dataset statistics
## reading the LWIKI, SWIKI dataset

In [2]:
import os
import numpy as np
import pandas as pd
import igraph as ig

from pathlib import Path
from tqdm import tqdm #always use this instead of `import tqdm`
# from sklearn.datasets import fetch_rcv1
from scipy.sparse import *
# np.random.seed(123)
import warnings
warnings.simplefilter('ignore')

In [3]:
import logging
from collections import OrderedDict

logging.basicConfig(level=logging.INFO, )

In [4]:
from sklearn.linear_model import SGDClassifier
from sklearn import preprocessing, metrics

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
from joblib import Memory
from sklearn.datasets import load_svmlight_file

In [7]:
mem = Memory("./mycache")
@mem.cache
def get_data(filename):
    
    fname = str(Path(filename))
    fe, ex = os.path.splitext(fname) 

    try:
        data = load_svmlight_file(fname, multilabel=True)
    except:
        # Required: if the input data isn't in the correct libsvm format
        outfile = str(Path("{}_small{}".format(fe, ex)))
#         outfile = str(Path("{}_remapped{}".format(fe, ex)))
        if not os.path.isfile(outfile):
            logging.info("Remapping data to LibSVM format...")
            f = preprocess_libsvm(fname, outfile)
        else:
            logging.info("Using already remapped data...")
            f = outfile
        data = load_svmlight_file(f, multilabel=True)
        
    return data[0], data[1]

In [8]:
def preprocess_libsvm(input_file, output_file):
    # converts file to the required libsvm format.
    # this is very brute force but can be made faster [IMPROVE]

    file = open(output_file, "w+")
    with open(input_file, "r") as f:
        head = [next(f) for x in range(500)] # retrieve only `n` docs
        for i, line in enumerate(tqdm(head)): # change to f/head depending on your needs
            instance = line.strip().split()
            labels = instance[0]
            doc_dict = OrderedDict()
            temp_dict = {}
            temp_string = ''

            for pair in instance[1:]:
                feat = pair.split(":")
                if int(feat[0]) not in temp_dict:
                    temp_dict[int(feat[0])] = int(feat[1])

            for key in sorted(temp_dict.keys()):
                doc_dict[key] = temp_dict[key]

            for feat, tf in doc_dict.items():
                temp_string = temp_string + "{}:{} ".format(feat, tf)        
            file.write("{} {}\n".format(labels, temp_string))
        file.close()

    return output_file

In [9]:
def label_extractor(labels):

    leaf_labels = set()
    labels_per_doc = []

    for i in labels:
        labels_per_doc.append(len(i))
        for j in i:
            leaf_labels.add(int(j))
    
    return leaf_labels, labels_per_doc

In [10]:
def read_hier(filename):
    
    N = set()
    pi = set()
    T = set()
    
    with open(filename, "r") as f:
        for i, line in enumerate(f):
            words = line.strip().split()
            pi.add(int(words[0])) #adding parent node
            T.add(int(words[-1])) #adding ALL leaf nodes in the hierarchy
            for w in words:
                N.add(int(w))

    return N, pi, T

In [11]:
mem = Memory("./mycache")
@mem.cache
def rr_reader(filename):
    '''
    create a dataframe from the data-label pair
    '''

    num_entries = 200000
    df = pd.DataFrame()
    
    with open(filename, "r") as f:
#         head = [next(f) for x in range(num_entries)] # retrieve only `n` docs
        for i, line in enumerate(tqdm(f)): # change to f/head depending on your needs
            instance = line.strip().split()
            labels = instance[0]
            doc_dict = OrderedDict()
            temp_dict = {}

            for pair in instance[1:]:
                feat = pair.split(":")
                if int(feat[0]) not in temp_dict:
                    temp_dict[int(feat[0])] = int(feat[1])

            for key in sorted(temp_dict.keys()):
                doc_dict[key] = temp_dict[key]
                
            temp_df = pd.DataFrame(data = [ labels, doc_dict ]).T
            df = df.append(temp_df, ignore_index=True)
    
    df.columns = ["labels", "feat_tf"]
    df["labels"] = df["labels"].apply( lambda x: list(map(int, x.split(",")))  )
    return df

In [12]:
df = rr_reader("swiki/data/train.txt")

In [13]:
dta, labls = get_data("swiki/data/train_remapped.txt")

In [14]:
small_df = df.sample(100)

In [15]:
dta[0]

<1x2085164 sparse matrix of type '<class 'numpy.float64'>'
	with 114 stored elements in Compressed Sparse Row format>

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
# todo: fucking hell- represent documents in an embedding space -.-
# after that we can take `each x_i` -.-
def data_parser(dataset, dta):
    # embeds the doc to a D-dim space
    #create doc_vector for each instance
#     tfidfer = TfidfVectorizer()
    idx = list(dataset.index)
    df_x = []
#     for ix in tqdm(idx):
#         twoDarray = list(dataset.loc[ix, "feat_tf"].items())        
#         doc_items = []
#         for item in twoDarray:
#             if item[0] not in doc_items:
#                 if item[1] > 1:
#                     for _ in range(item[1]):
#                         doc_items.append(str(item[0]))
#                 else:
#                     doc_items.append(str(item[0]))

#         tfidfer.fit(doc_items)
#         df_x.append(tfidfer.transform(doc_items))
#             df_x.append(doc_items)
    for ix in tqdm(idx):
        df_x.append(dta[ix].todense())
    dg_x = np.stack(df_x)
    return dg_x

In [18]:
ff = data_parser(small_df, dta)

100%|██████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 4155.33it/s]


In [19]:
ff.shape

(100, 2085164)

In [20]:
fff = csr_matrix(ff, shape = (ff.shape[0], 64), dtype = np.float32)

In [21]:
rows = np.sum(fff, axis = 1) #row wise sum

In [26]:
fff.shape

(100, 64)

In [27]:
gg = np.random.randn(fff.shape[0], fff.shape[1])

In [22]:
np.where(any(fff[:,4]) != 0)

(array([], dtype=int64),)

In [23]:
print(fff[63,:].toarray())

[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [24]:
#todo better R^d representation

In [28]:
Nn, pii, Ti = read_hier("swiki/data/cat_hier.txt")

In [29]:
def largest_feat_n(df):
    return max(df["feat_tf"].apply(lambda x: len(x)))

In [30]:
d = small_df

In [28]:
# whatiwant = [14661, 71999, 292915, 188756, 131368, 130762, 352578, 395447, 27512, 157031, 33692, 13402, 393382, 390846, 395447, 276114]
whatiwant = [14661, 52361, 401434,316934, 369064]

In [29]:
for j in whatiwant:
    d[str(j)] = 0

In [30]:
d.head()

Unnamed: 0,labels,feat_tf,14661,52361,401434,316934,369064
264656,"[258850, 78599]","{173875: 1, 177591: 1, 184602: 1, 297238: 1, 3...",0,0,0,0,0
247765,[170514],"{170340: 3, 235307: 1, 923907: 1, 1012717: 1, ...",0,0,0,0,0
194584,"[156147, 93043]","{4288: 1, 5995: 1, 8032: 1, 40818: 1, 68505: 1...",0,0,0,0,0
281678,[288225],"{170340: 2, 339085: 1, 341434: 1, 345419: 1, 3...",0,0,0,0,0
393537,[87256],"{3744: 1, 69606: 1, 78034: 1, 150969: 2, 17387...",0,0,0,0,0


In [31]:
for i in d.index:
    for j in whatiwant:
        if j in d.loc[i, "labels"]:
            d.loc[i, str(j)] = 1

In [32]:
# num of docs per label (id)
for j in whatiwant:
    print(j, sum(d[str(j)]))

14661 1
52361 0
401434 0
316934 0
369064 0


In [33]:
subset = pd.DataFrame()

In [34]:
# subset = d[d["14661"]==1]
subset = d

In [35]:
subset.sample()

Unnamed: 0,labels,feat_tf,14661,52361,401434,316934,369064
275764,[140871],"{32372: 3, 42120: 2, 42881: 1, 367780: 1, 7763...",0,0,0,0,0


In [36]:
train_data, _ = get_data("swiki/data/train_remapped.txt")

In [37]:
max(list(subset.index))

454867

In [38]:
ll = []
for i in list(subset.index):
    try:
        ll.append(train_data[int(i)].toarray())
    except:
        print(i)

In [39]:
niu = np.concatenate( ll, axis=0 )

In [None]:
niu.shape

(100, 2085164)

In [None]:
train_x, train_y, test_x, test_y = train_test_split(niu, subset, random_state=42, test_size=0.30, shuffle=False)

In [None]:
train_x.shape, train_y.shape, test_x.shape, test_y.shape

((70, 2085164), (30, 2085164), (70, 7), (30, 7))

In [None]:
x_train = train_x
y_train = test_x.drop(labels = ["labels", "feat_tf"], axis = 1)

x_test = train_y
y_test = test_y.drop(labels = ["labels", "feat_tf"], axis = 1)

In [None]:
y_train.sample()

Unnamed: 0,14661,52361,401434,316934,369064
103069,0,0,0,0,0


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.multiclass import OneVsRestClassifier

In [None]:
clf = SGDClassifier()
# lb = preprocessing.LabelBinarizer(sparse_output=True)
# mb = preprocessing.MultiLabelBinarizer(sparse_output=True)
# le = preprocessing.LabelEncoder()

In [None]:
clf

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [31]:
print(len(Nn), len(Ti)) # there is one node without any parent
print(Nn.difference(Ti)) # this is the node: this is probably the root node i guess

50312 50311
{2143406}


In [32]:
len(pii)

13808

In [33]:
 def lookup_table(filename, subset):
        
    p2c_table = {}
    c2p_table = {}
    node2id = OrderedDict()
    id2node = OrderedDict()
    i = 0
    
    with open(filename, "r") as f:
        if not subset:
            head = f
        elif isinstance(subset, int):
            head = [next(f) for x in range(subset)] # retrieve only `n` docs
        else:
            raise ValueError("Incorrect subset type. Enter only False (boolean) or int. Encountered {} type.".format(type(subset)))
        for _, line in enumerate(tqdm(head)):
            split_line = line.strip().split()
            parent_node = int(split_line[0])
            child_node = list(map(int, split_line[1:]))
            
            # map to the respective dicts -> parent:child relationship
            # parent2child lookup table
            if parent_node not in p2c_table:
                p2c_table[parent_node] = [child_node[0]]
            else:
                p2c_table[parent_node].append(child_node[0])
                
            #child2parent lookup table
            if child_node[0] not in c2p_table:
                c2p_table[child_node[0]] = [parent_node]
            else:
                c2p_table[child_node[0]].append(parent_node)
                
            # map parent/child node to a node<->id
            if parent_node not in node2id:
                p_id = i
                node2id[parent_node] = p_id
                id2node[p_id] = parent_node
                i+=1
            else:
                p_id = node2id[parent_node]
                
            if child_node[0] not in node2id:
                c_id = i
                node2id[child_node[0]] = c_id
                id2node[c_id] = child_node[0]      
                i+=1
            else:
                c_id = node2id[child_node[0]]

    pi_parents = set(p2c_table.keys())        
    T_leaves = (c2p_table.keys() - p2c_table.keys()) 
    N_all_nodes = pi_parents.union(T_leaves)
    
    return p2c_table, c2p_table, node2id, id2node, list(pi_parents), list(T_leaves), list(N_all_nodes)

In [34]:
def hierarchy2graph(p2c_table, node2id):

    edges = []
    for parent, children in p2c_table.items():
        p_id = node2id[parent]
        for child in children:
            c_id = node2id[child]
            edges.append((p_id, c_id))
    vertices = [k for k, v in node2id.items()]
    g = ig.Graph(n=len(node2id), edges=edges, directed=True, vertex_attrs={"name": vertices})
    return g

In [35]:
def weightparameter(N_all_nodes, size_n):
    
    # randomly initialize weights for all nodes instead of taking user input [assumption]
    w_all_n = {}
    for n in N_all_nodes:
        temp_rand = np.random.randn(size_n,1)
        if n not in w_all_n:
            w_all_n[n] = temp_rand
            
#         do reverse mapping?
    return w_all_n

<img src = "image.png">

In [319]:
def function_gradient(w_node, x, data):
    
    w_n = w_all_n[w_node]
    w_pi_n = w_all_n[c2p[w_node][0]]
    
    C = 1 # by default
    one = w_n - w_pi_n
    norm_w = np.linalg.norm(one, 2, keepdims = False)
    g = []
    # g = g.reshape((-1,1))
    y_in = -1
    
    for i, j in enumerate(data.index):
        if w_node in data.loc[j, "labels"]:
            y_in = 1
            print("yes", w_node, data.loc[j, "labels"])
    
        # eqn 3.9
        xx =  x[i].reshape((-1, 1))
        y_w_x = y_in * np.dot(np.transpose(w_n), xx)

        y_x = y_in * xx

        two = C*(1/(1+np.exp(y_w_x)))*y_x

        assert(one.shape == two.shape)
        g.append(one - two)

        # eqn 3.8
        obj = C*np.log(1+np.exp(-y_w_x))

        min_wn = norm_w + obj
    
    return min_wn      

In [345]:
function_gradient

<function __main__.function_gradient(w_node, x, g, data, lmbda)>

In [320]:
from scipy import optimize

In [336]:
def objective_lr(data, w_node, lmbda, eps, maxfn, fff):
    m = 5
    f = 0
    xtol = 1e-30
    maxfn = 0
    w_n = w_all_n[w_node]
    init_x = [2, 2]
    largest_n = largest_feat_n(data)
    n = max(len(w_n), largest_n)
    
    x = fff
    g = np.zeros(x.shape)

    while(maxfn < 10):
        
        w_n = w_all_n[w_node]
        w_pi_n = w_all_n[c2p[w_node][0]]
        
        f = function_gradient(w_node, x, g, data, lmbda)
#         try:
        optimize.minimize(function_gradient, init_x, args=(w_n), method='L-BFGS-B', tol=xtol, options={'disp': True, 'eps': eps })
#         except:
#             print("didn't converge")
            
        maxfn +=1

In [341]:
p2c, c2p, n2i, i2n, pi, T, N = lookup_table("swiki/data/cat_hier.txt", subset = False)
p2c_s, c2p_s, n2i_s, i2n_s, pi_s, T_s, N_s = lookup_table("swiki/data/cat_hier.txt", subset = 9)
w_all_n = weightparameter(N, 64)

65333it [00:00, 276100.81it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<?, ?it/s]


In [342]:
large_n = largest_feat_n(small_df)

In [343]:
gg[1].shape

(64,)

In [344]:
# result_w = np.zeros((len(N_s),1))
# while(1):
result_w = {}

for n in N_s:    
    print(n)
    # check if the node has a parent
    if n not in c2p:
        w_pi_n = 0 # if the node has no parent then it's the root node. assign it w=0
    else:
        w_pi_n = w_all_n[n] 
    
    # if n is not leaf node
    if n not in T_s:
        # update w_n using eqn 3.3 or 4
        mod_C_n = len(p2c[n]) # |C_n|
        sum_w_c = 0 
        for c in p2c[n]:
            sum_w_c += w_all_n[c]
            
        if n not in result_w:
            result_w[n] = 1/(mod_C_n + 1) * (w_pi_n + sum_w_c)
    # else: n is a leaf node
    else:
        #optimize using lbfgs eqn 3.8, 3.9 or 8
        lmbda = 1
        eps = 1e-4
        maxfn = 10
        w_n = w_all_n[n]
        lr_function = objective_lr(small_df, n, lmbda, eps, maxfn, gg) ##
#         g = function_g() ##
        print("over")
    print("ove")
print("ov")

2255744


TypeError: 'numpy.ndarray' object is not callable

In [None]:
len(pi), len(T), len(N)

In [None]:
result_w