In [1]:
# References
# - Logistic-regression reference implementation (MulticlassClassifier): https://github.com/gcdart/MulticlassClassifier/blob/master/src/ml/LogisticRegression.java
# - Preprocessing note (multi-label lists must be comma-separated with no spaces): https://www.kaggle.com/c/lshtc/discussion/6911#38233
# - Dataset statistics: https://www.kaggle.com/c/lshtc/discussion/14048
## Reading the LWIKI and SWIKI datasets

In [2]:
import os
import numpy as np
import pandas as pd

from pathlib import Path
from tqdm import tqdm #always use this instead of `import tqdm`
# from sklearn.datasets import fetch_rcv1

# np.random.seed(123)

In [3]:
import logging
from collections import OrderedDict

logging.basicConfig(level=logging.INFO)

In [4]:
from sklearn.linear_model import SGDClassifier
from sklearn import preprocessing, metrics

In [5]:
from joblib import Memory
from sklearn.datasets import load_svmlight_file

In [6]:
mem = Memory("./mycache")
@mem.cache
def get_data(filename):
    """Load a multilabel svmlight/libsvm file, remapping it first if it cannot be parsed.

    The file is first handed directly to ``load_svmlight_file``. If parsing
    fails, a remapped copy named ``<stem>_small<ext>`` is produced with
    ``preprocess_libsvm`` (or reused when that file already exists) and loaded
    instead. The result is memoised on disk through ``mem.cache``.

    Parameters
    ----------
    filename : str or path-like
        Path of the data file to load.

    Returns
    -------
    tuple
        The first two items returned by
        ``load_svmlight_file(..., multilabel=True)``.

    Raises
    ------
    OSError
        If ``filename`` cannot be opened; remapping cannot repair a missing or
        unreadable input, so the error is propagated instead of being masked.
    """
    fname = str(Path(filename))
    stem, ext = os.path.splitext(fname)

    try:
        data = load_svmlight_file(fname, multilabel=True)
    except OSError:
        # Missing/unreadable input: falling back would only create an empty
        # remapped file that later runs would silently reuse.
        raise
    except Exception as exc:
        # Deliberately not a bare ``except``: KeyboardInterrupt/SystemExit must
        # not start a remapping pass.
        logging.info("Direct load of %s failed (%s)", fname, exc)
        # Required: if the input data isn't in the correct libsvm format
        outfile = str(Path("{}_small{}".format(stem, ext)))
        if not os.path.isfile(outfile):
            logging.info("Remapping data to LibSVM format...")
            remapped = preprocess_libsvm(fname, outfile)
        else:
            logging.info("Using already remapped data...")
            remapped = outfile
        data = load_svmlight_file(remapped, multilabel=True)

    return data[0], data[1]

In [7]:
def preprocess_libsvm(input_file, output_file, max_docs=500):
    """Rewrite a sparse ``labels idx:val ...`` file with sorted, unique feature ids.

    For each processed line the first whitespace-separated token (the label
    field) is copied through unchanged. The remaining ``index:value`` pairs are
    parsed as integers, de-duplicated (the first occurrence of an index wins)
    and written back in ascending index order. Blank lines are skipped.

    Parameters
    ----------
    input_file : str
        Path of the file to read.
    output_file : str
        Path of the file to write; an existing file is overwritten.
    max_docs : int or None, optional
        Only the first ``max_docs`` lines of ``input_file`` are read. Defaults
        to 500, the limit this function has always used; pass ``None`` to
        convert the whole file. Files shorter than the limit are converted in
        full.

    Returns
    -------
    str
        ``output_file``.
    """
    from itertools import islice

    # The input is opened first so a missing input never leaves an empty output
    # file behind; both handles are closed even if parsing raises.
    with open(input_file, "r") as src, open(output_file, "w") as dst:
        # islice stops quietly at end of file instead of raising StopIteration.
        for line in tqdm(islice(src, max_docs), total=max_docs):
            tokens = line.split()
            if not tokens:
                continue

            labels = tokens[0]
            features = {}
            for pair in tokens[1:]:
                parts = pair.split(":")
                index = int(parts[0])
                if index not in features:
                    features[index] = int(parts[1])

            # Every pair keeps its trailing space, matching the historical output.
            rendered = "".join(
                "{}:{} ".format(index, features[index]) for index in sorted(features)
            )
            dst.write("{} {}\n".format(labels, rendered))

    return output_file

In [8]:
def label_extractor(labels):
    """Summarise a collection of per-document label sequences.

    Parameters
    ----------
    labels : iterable of sized iterables
        One sequence of label ids per document; each id must be convertible
        with ``int``.

    Returns
    -------
    tuple
        ``(leaf_labels, labels_per_doc)`` where ``leaf_labels`` is the set of
        all distinct label ids (as ``int``) and ``labels_per_doc`` lists the
        number of labels of each document, in input order.
    """
    distinct_ids = set()
    counts = []

    # Single pass so one-shot iterables are handled as well.
    for doc_labels in labels:
        counts.append(len(doc_labels))
        distinct_ids.update(map(int, doc_labels))

    return distinct_ids, counts

In [25]:
def read_hier(filename):
    """Collect node-id sets from a whitespace-separated hierarchy file.

    Every non-blank line is split on whitespace and each token is parsed as an
    integer node id. Blank lines are ignored.

    Parameters
    ----------
    filename : str
        Path of the hierarchy file.

    Returns
    -------
    N : set of int
        Every id that appears anywhere in the file.
    pi : set of int
        The first id of each line (the parent side of the relation).
    T : set of int
        The last id of each line (the child side of the relation). Note that
        this is every node that has a parent, not only the leaves of the tree.
    """
    N = set()
    pi = set()
    T = set()

    with open(filename, "r") as f:
        for line in f:
            ids = [int(token) for token in line.split()]
            if not ids:
                # Blank line: nothing to record (previously an IndexError).
                continue
            pi.add(ids[0])
            T.add(ids[-1])
            N.update(ids)

    return N, pi, T

In [10]:
mem = Memory("./mycache")
@mem.cache
def rr_reader(filename, num_entries=100000):
    """Read a sparse ``labels idx:val ...`` file into a DataFrame.

    Each non-blank line yields one row. The first whitespace-separated token
    is split on ``,`` and parsed into a list of integer labels; the remaining
    ``index:value`` pairs are parsed as integers, de-duplicated (the first
    occurrence of an index wins) and stored as an ``OrderedDict`` sorted by
    index.

    Parameters
    ----------
    filename : str
        Path of the file to read.
    num_entries : int or None, optional
        Only the first ``num_entries`` lines are read. Defaults to 100000, the
        limit this function has always used; pass ``None`` to read everything.
        Files shorter than the limit are read in full.

    Returns
    -------
    pandas.DataFrame
        Columns ``"labels"`` (list of int) and ``"feat_tf"`` (OrderedDict
        mapping feature index to value), indexed ``0..n-1``.
    """
    from itertools import islice

    # Rows are gathered in a list and the frame is built once:
    # DataFrame.append is quadratic in a loop and was removed in pandas 2.0.
    records = []

    with open(filename, "r") as f:
        # islice stops quietly at end of file instead of raising StopIteration.
        for line in tqdm(islice(f, num_entries), total=num_entries):
            instance = line.split()
            if not instance:
                continue

            temp_dict = {}
            for pair in instance[1:]:
                feat = pair.split(":")
                index = int(feat[0])
                if index not in temp_dict:
                    temp_dict[index] = int(feat[1])

            labels = list(map(int, instance[0].split(",")))
            records.append((labels, OrderedDict(sorted(temp_dict.items()))))

    # Passing the column names explicitly keeps an empty input valid.
    return pd.DataFrame(records, columns=["labels", "feat_tf"])

In [11]:
# Parse the SWIKI training file into a DataFrame with "labels" and "feat_tf" columns.
# rr_reader is wrapped in mem.cache, so its result is cached under ./mycache.
df = rr_reader("swiki/data/train.txt")

In [12]:
df.head(5)

Unnamed: 0,labels,feat_tf
0,"[33692, 13402, 393382]","{624: 3, 4288: 1, 14403: 1, 54278: 1, 62619: 1..."
1,[130762],"{120505: 1, 173442: 1, 554009: 1, 634374: 1, 6..."
2,"[352578, 395447, 27512, 157031]","{62483: 1, 73429: 1, 138155: 1, 160218: 1, 165..."
3,"[390846, 395447, 276114]","{200187: 1, 207596: 1, 343448: 1, 359544: 1, 4..."
4,"[14661, 71999, 292915, 188756, 131368]","{2906: 1, 4288: 1, 17471: 3, 56146: 1, 94588: ..."


In [13]:
# leaf: distinct label ids over all documents; label_per_doc: number of labels on each document.
leaf, label_per_doc = label_extractor(df["labels"])

In [26]:
# N: every node id in the hierarchy file; pi: ids in the first column; T: ids in the last column.
N, pi, T = read_hier("swiki/data/cat_hier.txt")

In [27]:
len(leaf)

24076

In [28]:
sum(label_per_doc)

220943

In [29]:
# T holds the last id of every hierarchy line (nodes that have a parent),
# so N - T is the set of nodes that never appear on the child side.
print(len(N), len(T)) # the sizes differ by one: a single node has no parent
print(N.difference(T)) # that node -- presumably the root of the hierarchy (TODO confirm)

50312 50311
{2143406}


In [30]:
len(pi)

13808

In [85]:
 def lookup_table(filename, max_lines=10):
     """Build parent->children and child->parents lookup tables from a hierarchy file.

     Each non-blank line is split on whitespace into integer ids; the first id
     is the parent and the remaining ids are its children. Lines with fewer
     than two ids are skipped.

     Parameters
     ----------
     filename : str
         Path of the hierarchy file.
     max_lines : int or None, optional
         Only the first ``max_lines`` lines are read. Defaults to 10, the limit
         this function has always used; pass ``None`` to read the whole file.
         Files shorter than the limit are read in full.

     Returns
     -------
     p2c_table : dict
         Maps a parent id to a list holding one child-id list per line on
         which that parent appears (e.g. ``{1: [[2], [3]]}``).
     c2p_table : dict
         Maps every child id to the list of its parent ids.
     """
     from itertools import islice

     p2c_table = {}
     c2p_table = {}

     with open(filename, "r") as f:
         # islice stops quietly at end of file instead of raising StopIteration.
         for line in tqdm(islice(f, max_lines), total=max_lines):
             ids = [int(token) for token in line.split()]
             if len(ids) < 2:
                 continue
             parent_node, child_nodes = ids[0], ids[1:]

             # parent2child lookup table (one entry per line, as before)
             p2c_table.setdefault(parent_node, []).append(child_nodes)

             # child2parent lookup table: register every child on the line,
             # not just the first one
             for child in child_nodes:
                 c2p_table.setdefault(child, []).append(parent_node)

     return p2c_table, c2p_table

In [86]:
# p: parent -> children lookup table; c: child -> parents lookup table.
p, c = lookup_table("swiki/data/cat_hier.txt")

100%|███████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 19906.52it/s]


In [87]:
c

{2156813: [2143406],
 2322682: [2143406],
 143406: [2143406],
 2255744: [2143406],
 2235965: [2143406],
 2440809: [2156813],
 2159645: [2156813],
 2267844: [2156813],
 2271677: [2156813],
 2152343: [2156813]}

In [19]:
# Load the small train/test subsets; get_data returns the first two items of
# load_svmlight_file(..., multilabel=True) and caches them under ./mycache.
train_data, train_labels = get_data("swiki/data/train_small.txt")
test_data, test_labels = get_data("swiki/data/test_small.txt")

In [20]:
train_data.shape

(500, 2085161)

In [21]:
# Scratch check: build a one-row frame from mixed Python objects (int, list, dict)
# to see how each cell is stored after transposing.
trmp = pd.DataFrame(data = [3, ["4", 5, 8], {"goo":5}]).T

In [22]:
type(trmp[0][0])

int

In [31]:
ll = [3, 4, 5, 6]

In [33]:
ll[1:]

[4, 5, 6]