In [None]:
# https://github.com/gcdart/MulticlassClassifier/blob/master/src/ml/LogisticRegression.java
# https://www.kaggle.com/c/lshtc/discussion/6911#38233 - preprocessing: multilabels comma should not have spaces
# https://www.kaggle.com/c/lshtc/discussion/14048 - dataset statistics
## reading the LWIKI, SWIKI dataset

In [1]:
import os
import numpy as np
import pandas as pd
import igraph as ig

from pathlib import Path
from tqdm import tqdm #always use this instead of `import tqdm`
# from sklearn.datasets import fetch_rcv1

# np.random.seed(123)
import warnings
warnings.simplefilter('ignore')

In [2]:
import logging
from collections import OrderedDict

logging.basicConfig(level=logging.INFO, )

In [3]:
from sklearn.linear_model import SGDClassifier
from sklearn import preprocessing, metrics

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
from joblib import Memory
from sklearn.datasets import load_svmlight_file

In [6]:
mem = Memory("./mycache")
@mem.cache
def get_data(filename):
    
    fname = str(Path(filename))
    fe, ex = os.path.splitext(fname) 

    try:
        data = load_svmlight_file(fname, multilabel=True)
    except:
        # Required: if the input data isn't in the correct libsvm format
        outfile = str(Path("{}_small{}".format(fe, ex)))
#         outfile = str(Path("{}_remapped{}".format(fe, ex)))
        if not os.path.isfile(outfile):
            logging.info("Remapping data to LibSVM format...")
            f = preprocess_libsvm(fname, outfile)
        else:
            logging.info("Using already remapped data...")
            f = outfile
        data = load_svmlight_file(f, multilabel=True)
        
    return data[0], data[1]

In [7]:
def preprocess_libsvm(input_file, output_file):
    # converts file to the required libsvm format.
    # this is very brute force but can be made faster [IMPROVE]

    file = open(output_file, "w+")
    with open(input_file, "r") as f:
        head = [next(f) for x in range(500)] # retrieve only `n` docs
        for i, line in enumerate(tqdm(head)): # change to f/head depending on your needs
            instance = line.strip().split()
            labels = instance[0]
            doc_dict = OrderedDict()
            temp_dict = {}
            temp_string = ''

            for pair in instance[1:]:
                feat = pair.split(":")
                if int(feat[0]) not in temp_dict:
                    temp_dict[int(feat[0])] = int(feat[1])

            for key in sorted(temp_dict.keys()):
                doc_dict[key] = temp_dict[key]

            for feat, tf in doc_dict.items():
                temp_string = temp_string + "{}:{} ".format(feat, tf)        
            file.write("{} {}\n".format(labels, temp_string))
        file.close()

    return output_file

In [8]:
def label_extractor(labels):

    leaf_labels = set()
    labels_per_doc = []

    for i in labels:
        labels_per_doc.append(len(i))
        for j in i:
            leaf_labels.add(int(j))
    
    return leaf_labels, labels_per_doc

In [9]:
def read_hier(filename):
    
    N = set()
    pi = set()
    T = set()
    
    with open(filename, "r") as f:
        for i, line in enumerate(f):
            words = line.strip().split()
            pi.add(int(words[0])) #adding parent node
            T.add(int(words[-1])) #adding ALL leaf nodes in the hierarchy
            for w in words:
                N.add(int(w))

    return N, pi, T

In [10]:
mem = Memory("./mycache")
@mem.cache
def rr_reader(filename):
    '''
    create a dataframe from the data-label pair
    '''

    num_entries = 200000
    df = pd.DataFrame()
    
    with open(filename, "r") as f:
#         head = [next(f) for x in range(num_entries)] # retrieve only `n` docs
        for i, line in enumerate(tqdm(f)): # change to f/head depending on your needs
            instance = line.strip().split()
            labels = instance[0]
            doc_dict = OrderedDict()
            temp_dict = {}

            for pair in instance[1:]:
                feat = pair.split(":")
                if int(feat[0]) not in temp_dict:
                    temp_dict[int(feat[0])] = int(feat[1])

            for key in sorted(temp_dict.keys()):
                doc_dict[key] = temp_dict[key]
                
            temp_df = pd.DataFrame(data = [ labels, doc_dict ]).T
            df = df.append(temp_df, ignore_index=True)
    
    df.columns = ["labels", "feat_tf"]
    df["labels"] = df["labels"].apply( lambda x: list(map(int, x.split(",")))  )
    return df

In [12]:
small_df = rr_reader("swiki/data/train_small.txt")

In [13]:
Nn, pii, Ti = read_hier("swiki/data/cat_hier.txt")

In [14]:
def largest_feat_n(df):
    return max(df["feat_tf"].apply(lambda x: len(x)))

In [17]:
d = small_df

In [18]:
# whatiwant = [14661, 71999, 292915, 188756, 131368, 130762, 352578, 395447, 27512, 157031, 33692, 13402, 393382, 390846, 395447, 276114]
whatiwant = [14661, 71999, 292915, 188756, 131368]

In [19]:
for j in whatiwant:
    d[str(j)] = 0

In [20]:
d.head()

Unnamed: 0,labels,feat_tf,14661,71999,292915,188756,131368
0,"[33692, 13402, 393382]","{624: 3, 4288: 1, 14403: 1, 54278: 1, 62619: 1...",0,0,0,0,0
1,[130762],"{120505: 1, 173442: 1, 554009: 1, 634374: 1, 6...",0,0,0,0,0
2,"[352578, 395447, 27512, 157031]","{62483: 1, 73429: 1, 138155: 1, 160218: 1, 165...",0,0,0,0,0
3,"[390846, 395447, 276114]","{200187: 1, 207596: 1, 343448: 1, 359544: 1, 4...",0,0,0,0,0
4,"[14661, 71999, 292915, 188756, 131368]","{2906: 1, 4288: 1, 17471: 3, 56146: 1, 94588: ...",0,0,0,0,0


In [21]:
for i in range(len(d)):
    for j in whatiwant:
        if j in d.loc[i, "labels"]:
            d.loc[i, str(j)] = 1

In [22]:
# num of docs per label (id)
for j in whatiwant:
    print(j, sum(d[str(j)]))

14661 3
71999 2
292915 1
188756 1
131368 1


In [23]:
subset = pd.DataFrame()

In [24]:
subset = d[d["14661"]==1]

In [26]:
subset.sample()

Unnamed: 0,labels,feat_tf,14661,71999,292915,188756,131368
47,"[392501, 14661, 347803]","{14403: 1, 15132: 2, 58638: 1, 85149: 1, 11593...",1,0,0,0,0


In [27]:
train_data, _ = get_data("swiki/data/train_remapped.txt")

In [30]:
max(list(subset.index))

115

In [31]:
ll = []
for i in list(subset.index):
    try:
        ll.append(train_data[int(i)].toarray())
    except:
        print(i)

In [32]:
niu = np.concatenate( ll, axis=0 )

In [33]:
niu.shape

(3, 2085164)

In [34]:
train_x, train_y, test_x, test_y = train_test_split(niu, subset, random_state=42, test_size=0.30, shuffle=False)

In [35]:
train_x.shape, train_y.shape, test_x.shape, test_y.shape

((2, 2085164), (1, 2085164), (2, 7), (1, 7))

In [36]:
x_train = train_x
y_train = test_x.drop(labels = ["labels", "feat_tf"], axis = 1)

x_test = train_y
y_test = test_y.drop(labels = ["labels", "feat_tf"], axis = 1)

In [38]:
y_train.sample()

Unnamed: 0,14661,71999,292915,188756,131368
47,1,0,0,0,0


In [39]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.multiclass import OneVsRestClassifier

In [40]:
clf = SGDClassifier()
# lb = preprocessing.LabelBinarizer(sparse_output=True)
# mb = preprocessing.MultiLabelBinarizer(sparse_output=True)
# le = preprocessing.LabelEncoder()

In [41]:
clf

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
       l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=None,
       n_iter=None, n_iter_no_change=5, n_jobs=None, penalty='l2',
       power_t=0.5, random_state=None, shuffle=True, tol=None,
       validation_fraction=0.1, verbose=0, warm_start=False)

In [42]:
# SGD Classifier
for category in whatiwant[1:]:
    print('**Processing {} tag...**'.format(category))
    
    # Training sgd model on train data
    clf.fit(x_train, y_train[str(category)])
    
    # calculating test accuracy
    prediction = clf.predict(x_test)
    print('Test set Micro F1 is {}'.format(f1_score(y_test[str(category)], prediction, average="micro")))
    print('Test set Precision is {}'.format(precision_score(y_test[str(category)], prediction, average="macro")))
    print('Test set Recall is {}'.format(recall_score(y_test[str(category)], prediction, average="micro")))

    print("\n")

**Processing 71999 tag...**
Test set Micro F1 is 1.0
Test set Precision is 1.0
Test set Recall is 1.0


**Processing 292915 tag...**
Test set Micro F1 is 1.0
Test set Precision is 1.0
Test set Recall is 1.0


**Processing 188756 tag...**
Test set Micro F1 is 1.0
Test set Precision is 1.0
Test set Recall is 1.0


**Processing 131368 tag...**
Test set Micro F1 is 1.0
Test set Precision is 1.0
Test set Recall is 1.0




In [44]:
print(len(Nn), len(Ti)) # there is one node without any parent
print(Nn.difference(Ti)) # this is the node: this is probably the root node i guess

50312 50311
{2143406}


In [45]:
len(pii)

13808

In [46]:
 def lookup_table(filename, subset):
        
    p2c_table = {}
    c2p_table = {}
    node2id = OrderedDict()
    id2node = OrderedDict()
    i = 0
    
    with open(filename, "r") as f:
        if not subset:
            head = f
        elif isinstance(subset, int):
            head = [next(f) for x in range(subset)] # retrieve only `n` docs
        else:
            raise ValueError("Incorrect subset type. Enter only False (boolean) or int. Encountered {} type.".format(type(subset)))
        for _, line in enumerate(tqdm(head)):
            split_line = line.strip().split()
            parent_node = int(split_line[0])
            child_node = list(map(int, split_line[1:]))
            
            # map to the respective dicts -> parent:child relationship
            # parent2child lookup table
            if parent_node not in p2c_table:
                p2c_table[parent_node] = [child_node[0]]
            else:
                p2c_table[parent_node].append(child_node[0])
                
            #child2parent lookup table
            if child_node[0] not in c2p_table:
                c2p_table[child_node[0]] = [parent_node]
            else:
                c2p_table[child_node[0]].append(parent_node)
                
            # map parent/child node to a node<->id
            if parent_node not in node2id:
                p_id = i
                node2id[parent_node] = p_id
                id2node[p_id] = parent_node
                i+=1
            else:
                p_id = node2id[parent_node]
                
            if child_node[0] not in node2id:
                c_id = i
                node2id[child_node[0]] = c_id
                id2node[c_id] = child_node[0]      
                i+=1
            else:
                c_id = node2id[child_node[0]]

    pi_parents = set(p2c_table.keys())        
    T_leaves = (c2p_table.keys() - p2c_table.keys()) 
    N_all_nodes = pi_parents.union(T_leaves)
    
    return p2c_table, c2p_table, node2id, id2node, list(pi_parents), list(T_leaves), list(N_all_nodes)

In [47]:
def hierarchy2graph(p2c_table, node2id):

    edges = []
    for parent, children in p2c_table.items():
        p_id = node2id[parent]
        for child in children:
            c_id = node2id[child]
            edges.append((p_id, c_id))
    vertices = [k for k, v in node2id.items()]
    g = ig.Graph(n=len(node2id), edges=edges, directed=True, vertex_attrs={"name": vertices})
    return g

In [48]:
def weightparameter(N_all_nodes, size_n):
    
    # randomly initialize weights for all nodes instead of taking user input [assumption]
    w_all_n = {}
    for n in N_all_nodes:
        temp_rand = np.random.randn(size_n,)
        if n not in w_all_n:
            w_all_n[n] = temp_rand
    return w_all_n

<img src = "image.png">

In [94]:
def function_gradient(w_node, x, g, data, lmbda):
    
    w_n = w_all_n[w_node]
    print(c2p_s[w_node][0])
    w_pi_n = w_all_n[c2p_s[w_node][0]]
    
    C = 1 # by default
    y_in = np.ones((len(data),))*(-1)
    
    for i in range(len(data)):
        if w_node in data.loc[i, "labels"]:
            y_in[i] = 1
            print(w_node, data.loc[i, "labels"])
    
    # eqn 3.9
    one = w_n - w_pi_n
       
    print(y_in.shape, w_n.shape)
    assert y_in.shape == x.shape
    y_w_x = y_in * w_n.T * x
    y_x = y_in * x
    
    two = C*(1/(1+np.exp(y_w_x)))*y_x
    assert(w_n.shape == two.shape)
    g = one - two
    
    # eqn 3.8
    norm_w = np.linalg.norm(one, order = 2, keepdims = False)
    obj = C*np.log(1+np.exp(-y_w_x))
    min_wn = norm_w + obj
    
    return min_wn       

In [82]:
def objective_lr(data, w_node, lmbda, eps, maxfn):
    m = 5
    f = 0
    xtol = 1e-30
    iprint = [0, 1]
    iflag = [0]
    
    w_n = w_all_n[w_node]
    largest_n = largest_feat_n(data)
    n = max(len(w_n), largest_n)
    
    x = np.stack(w_n)
    g = np.zeros(x.shape)
    
    while(maxfn > 0):
        f = function_gradient(w_node, x, g, data, lmbda)
        # do lbfgs

In [57]:
p2c, c2p, n2i, i2n, pi, T, N = lookup_table("swiki/data/cat_hier.txt", subset = False)
p2c_s, c2p_s, n2i_s, i2n_s, pi_s, T_s, N_s = lookup_table("swiki/data/cat_hier.txt", subset = 10)
w_all_n = weightparameter(N_s, 10)

65333it [00:00, 254526.43it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<?, ?it/s]


In [58]:
len(pi), len(T), len(N)

(13808, 36504, 50312)

In [59]:
large_n = largest_feat_n(small_df)

In [95]:
result_w = np.zeros([len(N_s),])
# while(1):
result_w = {}

for n in N_s:    
    # check if the node has a parent
    if n not in c2p_s:
        w_pi_n = 0 # if the node has no parent then it's the root node. assign it w=0
    else:
        w_pi_n = w_all_n[n] 
    
    # if n is not leaf node
    if n not in T_s:
        # update w_n using eqn 3.3 or 4
        mod_C_n = len(p2c_s[n]) # |C_n|
        sum_w_c = 0 
        for c in p2c_s[n]:
            sum_w_c += w_all_n[c]
            
        print(n,":", mod_C_n, w_pi_n, sum_w_c)
        if n not in result_w:
            result_w[n] = 1/(mod_C_n + 1) * (w_pi_n + sum_w_c)
    # else: n is a leaf node
    else:
        #optimize using lbfgs eqn 3.8, 3.9 or 8
        lmbda = 1
        eps = 1e-4
        maxfn = 1000
        w_n = w_all_n[n]
        print(len(w_n))
        lr_function = objective_lr(small_df, n, lmbda, eps, maxfn) ##
#         g = function_g() ##
        print("over")
    print("ove")
print("ov")

10
[ 0.01262107 -0.96049248  0.80851146 -0.7210829   0.91202167 -0.27584985
  0.69628285  0.23157     1.17538187  0.90821097]
[ 0.01262107 -0.96049248  0.80851146 -0.7210829   0.91202167 -0.27584985
  0.69628285  0.23157     1.17538187  0.90821097]
2143406
(500,) (10,)


AssertionError: 

In [None]:
result_w