In [1]:
%pylab inline
import pandas as pd
import os
from artm_experiments import OptimizationTopicsFilter
from optimize_methods import cosine_metric, jac_cosine_metric
from artm import *
%load_ext autoreload
%autoreload 2

Populating the interactive namespace from numpy and matplotlib




In [2]:
def read_phi(phi_name, topics_name, n_topics=400):
    """Load a pickled phi matrix and the class label of each of its topics.

    Parameters
    ----------
    phi_name : str
        File name inside the ``phi_ethnic`` directory; loaded with ``pd.read_pickle``.
    topics_name : str
        File name inside the ``ethnic_top_tokens`` directory. Each non-blank line
        must start with a class marker (``++``, ``+-``, ``-+`` or ``--``) followed
        by the topic name; the last character of the topic name token is dropped
        (presumably a trailing ``:`` -- confirm against the files).
    n_topics : int, optional
        Maximum number of leading lines of the topics file to read (default 400).

    Returns
    -------
    (phi, topics) : tuple
        The unpickled object and a dict mapping topic name -> class index 0..3.

    Raises
    ------
    KeyError
        If a line starts with an unknown class marker.
    IndexError
        If a non-blank line has fewer than two whitespace-separated tokens.
    """
    # NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
    phi = pd.read_pickle(os.path.join('phi_ethnic', phi_name))
    to_class = {'++': 0, '+-': 1, '-+': 2, '--': 3}
    topics = dict()
    with open(os.path.join('ethnic_top_tokens', topics_name)) as f:
        for line_no, raw_line in enumerate(f):
            if line_no >= n_topics:
                break
            fields = raw_line.split()
            if not fields:
                # Blank line (e.g. a trailing newline): nothing to parse.
                continue
            topics[fields[1][:-1]] = to_class[fields[0]]
    return phi, topics

In [3]:
# Load three topic models together with the class label of every topic.
# phi_hull / topics_hull are the labelled reference topics that later cells measure
# distances against; the topics of phi_train and phi_test are the samples that get
# turned into feature rows.
phi_hull, topics_hull = read_phi('4_phi_smooth_decor', 'smooth_decor.txt')
phi_train, topics_train = read_phi('5_phi_full', 'full.txt')
phi_test, topics_test = read_phi('2_phi_lda', 'lda.txt')

In [4]:
# Two distance-to-hull solvers: one with the filter's default metric and one with
# the cosine metric and its Jacobian.
hull_euc = OptimizationTopicsFilter(eps=1e-5)
hull_cos = OptimizationTopicsFilter(eps=1e-5, metric=cosine_metric, jac_metric=jac_cosine_metric)

# Group the reference (hull) topic names by their class index 0..3.
topics_classes = {cl: [] for cl in range(4)}
for topic in phi_hull.columns:
    topic_class = topics_hull[topic]
    topics_classes[topic_class].append(topic)

In [7]:
from sklearn.metrics.pairwise import cosine_distances
from scipy.stats import entropy
import time

def cos_metric(u, v):
    """Return the cosine distance between vectors ``u`` and ``v`` as a scalar."""
    # cosine_distances works on 2-D inputs, so each vector is wrapped as a single row
    # and the only entry of the resulting 1x1 matrix is extracted.
    pairwise = cosine_distances([u], [v])
    return pairwise[0, 0]
                                      
def kl_metric(u, v):
    """Symmetrised KL divergence of ``u`` and ``v`` on their common support.

    Only coordinates where both vectors are strictly positive are compared
    (``scipy.stats.entropy`` renormalises each slice to sum to one). The result
    is capped at 1e3.

    If the supports do not overlap at all the cap is returned: the previous code
    evaluated the divergence of two empty slices, which reports such vectors as
    identical (distance 0) although they share nothing.
    """
    cap = 1e3
    common = (u > 0.0) & (v > 0.0)
    if not np.any(common):
        return cap
    u_nz = u[common]
    v_nz = v[common]
    res = 0.5 * entropy(u_nz, v_nz) + 0.5 * entropy(v_nz, u_nz)
    return min(res, cap)
                        
def jac_metric(u, v):
    """Jaccard distance between the supports (strictly positive entries) of ``u`` and ``v``.

    Returns ``1 - |intersection| / |union|``. When neither vector has a positive
    entry the union is empty; 0.0 is returned in that case instead of the NaN
    produced by dividing 0 by 0.
    """
    u_support = u > 0.0
    v_support = v > 0.0
    union_size = np.sum(u_support | v_support)
    if union_size == 0:
        return 0.0
    inter_size = np.sum(u_support & v_support)
    return 1.0 - 1.0 * inter_size / union_size

def find_closest(topic_vector, metric):
    """Distance from ``topic_vector`` to the nearest reference topic of each class.

    Parameters
    ----------
    topic_vector : array-like
        Vector passed as the first argument to ``metric``.
    metric : callable
        ``metric(u, v)`` returning a comparable distance.

    Returns
    -------
    list
        Four values, one per class 0..3: the smallest ``metric`` value over the
        columns of the global ``phi_hull`` listed in ``topics_classes[cl]``.
        A class with no topics (or only distances >= 1e5 / NaN) yields the
        sentinel 1e5.
    """
    res = []
    for cl in range(4):
        min_dist = 1e5
        for topic in topics_classes[cl]:
            # .values replaces as_matrix(), which was removed from pandas.
            dist = metric(topic_vector, phi_hull[topic].values.ravel())
            if dist < min_dist:
                min_dist = dist
        res.append(min_dist)
    return res
        

def gen_features(topic_vector):
    """Build the feature row for one topic vector.

    Layout of the returned 1-D array (44 values):

    * 4 values: ``hull_euc.get_dist(...).fun`` against the topics of each class;
    * 4 values: the same with ``hull_cos``;
    * 36 values: for every cut size (20, 100, 10**6) and every metric
      (cos, KL, Jaccard), the distance to the closest reference topic of each
      of the 4 classes, as returned by ``find_closest``.

    A "cut" keeps only the ``cut_size`` largest entries of the vector and zeroes
    the rest; a cut size at least as large as the vector leaves it unchanged.
    """
    features = []
    for hull in (hull_euc, hull_cos):
        for cl in range(4):
            features.append(hull.get_dist(topic_vector, phi_hull, topics_classes[cl]).fun)

    for cut_size in [20, 100, 10 ** 6]:
        # The truncated vector does not depend on the metric, so build it once per cut size.
        cutted_vector = topic_vector.copy()
        cutted_vector[np.argsort(cutted_vector)[:-cut_size]] = 0.0
        for metric in [cos_metric, kl_metric, jac_metric]:
            features += find_closest(cutted_vector, metric)
    return np.array(features)


# Build the design matrix: one feature row per topic of phi_train and phi_test,
# labelled with that topic's class. This calls gen_features once per topic.
X = []
y = []
for phi, topics in [(phi_train, topics_train), (phi_test, topics_test)]:
    for topic in phi.columns:
        print(topic)  # progress indicator
        # .values replaces as_matrix(), which was removed from pandas.
        X.append(gen_features(phi[topic].values.ravel()))
        y.append(topics[topic])

X = np.array(X)
y = np.array(y)

e_topic_0
e_topic_1
e_topic_2
e_topic_3
e_topic_4
e_topic_5
e_topic_6
e_topic_7
e_topic_8
e_topic_9
e_topic_10
e_topic_11
e_topic_12
e_topic_13
e_topic_14
e_topic_15
e_topic_16
e_topic_17
e_topic_18
e_topic_19
e_topic_20
e_topic_21
e_topic_22
e_topic_23
e_topic_24
e_topic_25
e_topic_26
e_topic_27
e_topic_28
e_topic_29
e_topic_30
e_topic_31
e_topic_32
e_topic_33
e_topic_34
e_topic_35
e_topic_36
e_topic_37
e_topic_38
e_topic_39
e_topic_40
e_topic_41
e_topic_42
e_topic_43
e_topic_44
e_topic_45
e_topic_46
e_topic_47
e_topic_48
e_topic_49
e_topic_50
e_topic_51
e_topic_52
e_topic_53
e_topic_54
e_topic_55
e_topic_56
e_topic_57
e_topic_58
e_topic_59
e_topic_60
e_topic_61
e_topic_62
e_topic_63
e_topic_64
e_topic_65
e_topic_66
e_topic_67
e_topic_68
e_topic_69
e_topic_70
e_topic_71
e_topic_72
e_topic_73
e_topic_74
e_topic_75
e_topic_76
e_topic_77
e_topic_78
e_topic_79
e_topic_80
e_topic_81
e_topic_82
e_topic_83
e_topic_84
e_topic_85
e_topic_86
e_topic_87
e_topic_88
e_topic_89
e_topic_90
e_topic_9

In [3]:
# Human-readable names for the feature columns, in the order gen_features emits them:
# hull distances (Euclidean, then cosine) per class, followed by nearest-topic
# distances for every (cut size, metric, class) combination.
features_names = (
    ['HullEuc{}'.format(cl) for cl in range(4)] +
    ['HullCos{}'.format(cl) for cl in range(4)] +
    ['Dist{}{}{}'.format(cut_size, metric, cl)
     for cut_size in ['20', '100', 'Inf']
     for metric in ['Cos', 'Kl', 'Jac']
     for cl in range(4)]
)
print('\n'.join(features_names))

HullEuc0
HullEuc1
HullEuc2
HullEuc3
HullCos0
HullCos1
HullCos2
HullCos3
Dist20Cos0
Dist20Cos1
Dist20Cos2
Dist20Cos3
Dist20Kl0
Dist20Kl1
Dist20Kl2
Dist20Kl3
Dist20Jac0
Dist20Jac1
Dist20Jac2
Dist20Jac3
Dist100Cos0
Dist100Cos1
Dist100Cos2
Dist100Cos3
Dist100Kl0
Dist100Kl1
Dist100Kl2
Dist100Kl3
Dist100Jac0
Dist100Jac1
Dist100Jac2
Dist100Jac3
DistInfCos0
DistInfCos1
DistInfCos2
DistInfCos3
DistInfKl0
DistInfKl1
DistInfKl2
DistInfKl3
DistInfJac0
DistInfJac1
DistInfJac2
DistInfJac3


In [8]:
# Cache the computed features and labels on disk as raw binary dumps.
# tofile() stores neither shape nor dtype, so the loading cell has to supply both.
X.tofile('x')
#X.tofile('x3')
y.tofile('y')
#y.tofile('y3')

In [4]:
# Reload the cached arrays. The raw dump carries no metadata, so X is read with
# fromfile's default dtype and reshaped to one row per sample with
# len(features_names) columns; y is read back as integers.
X = np.fromfile('x').reshape((-1, len(features_names)))
y = np.fromfile('y', dtype=int)

In [10]:
# Sanity check: show the feature row of a single sample.
X[4]

array([ 0.00713671,  0.00708345,  0.0059144 ,  0.00498849,  0.89644015,
        0.94914104,  0.74842787,  0.68537855,  0.97981455,  0.97940704,
        0.85976436,  0.67433329,  0.17353196,  0.28884853,  0.23016986,
        0.30721865,  0.9997902 ,  0.99974812,  0.99970959,  0.99970874,
        0.92156368,  0.94726462,  0.81031363,  0.65638586,  0.34728988,
        0.31856274,  0.256797  ,  0.29608205,  0.99935249,  0.99898183,
        0.99884123,  0.99885444,  0.91238312,  0.90632709,  0.78555263,
        0.62621462,  0.29389346,  0.34628165,  0.20968898,  0.27431126,
        0.        ,  0.        ,  0.        ,  0.        ])

In [6]:
# Sanity check: show the class labels (values 0..3) of all samples.
y

array([3, 3, 3, 3, 3, 3, 2, 0, 3, 3, 1, 2, 2, 3, 2, 3, 3, 3, 3, 3, 2, 3, 3,
       3, 3, 3, 3, 1, 3, 0, 3, 3, 3, 0, 3, 3, 3, 1, 3, 3, 2, 1, 3, 1, 3, 3,
       3, 3, 3, 3, 3, 3, 2, 2, 1, 3, 2, 3, 0, 1, 3, 3, 0, 1, 3, 3, 1, 1, 3,
       1, 3, 1, 1, 3, 1, 3, 3, 3, 2, 2, 2, 3, 1, 3, 3, 3, 3, 3, 3, 3, 3, 1,
       1, 2, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 1, 3, 3, 0, 3, 3, 2, 1, 2, 3, 3,
       0, 3, 3, 3, 3, 3, 3, 0, 0, 1, 3, 3, 1, 3, 3, 3, 3, 1, 3, 3, 3, 3, 0,
       2, 0, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 0, 2, 3, 3, 1, 3, 3, 0, 3, 3, 3,
       3, 2, 0, 1, 3, 3, 3, 0, 3, 3, 3, 3, 3, 2, 3, 1, 2, 3, 3, 0, 1, 3, 3,
       2, 0, 0, 3, 3, 0, 3, 3, 1, 3, 3, 3, 3, 3, 3, 1, 1, 3, 1, 1, 2, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 0, 3, 3, 1, 3, 3, 0, 3, 0,
       3, 3, 3, 2, 3, 0, 1, 3, 1, 1, 3, 3, 3, 3, 0, 3, 2, 3, 0, 3, 3, 3, 3,
       3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3,
       3, 3,

In [7]:
# Divide every feature column by its maximum (in place).
col_max = np.max(X, axis=0)
# A column whose maximum is 0 would become NaN (0/0); divide it by 1 instead so it
# stays unchanged.
col_max[col_max == 0.0] = 1.0
X /= col_max

In [9]:
# Shuffle the samples with a fixed seed so the order is reproducible.
np.random.seed(56789)
rand_perm = np.random.permutation(X.shape[0])
# Apply the same permutation to features and labels to keep the rows aligned.
X = X[rand_perm]
y = y[rand_perm]

In [15]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import StratifiedKFold

def test_model(class_model, params, measure_l1=False, feature_mask="all", n_folds=3):
    """Cross-validated one-vs-rest ROC AUC of a classifier on the global ``X`` / ``y``.

    Parameters
    ----------
    class_model : callable
        Estimator class; instantiated as ``class_model(**params)`` for every fold.
        The instance must provide ``fit`` and ``predict_proba``.
    params : dict
        Keyword arguments for ``class_model``.
    measure_l1 : bool, optional
        If true, also return the number of non-zero entries in ``model.coef_[0]``
        of the model fitted on the last fold.
    feature_mask : "all" or boolean array, optional
        Columns of ``X`` to use; the string ``"all"`` selects every column.
    n_folds : int, optional
        Number of stratified folds (default 3).

    Returns
    -------
    numpy.ndarray or (numpy.ndarray, int)
        Mean ROC AUC per class 0..3 over the folds; with ``measure_l1`` a tuple
        ``(scores, count_f)``.
    """
    n_classes = 4
    # Only compare with the sentinel when a string was passed: `ndarray == "all"`
    # is an element-wise comparison whose truth value is ambiguous.
    if isinstance(feature_mask, str) and feature_mask == "all":
        feature_mask = np.ones(X.shape[1], dtype=bool)

    # Legacy sklearn.cross_validation signature: StratifiedKFold(labels, n_folds).
    skf = StratifiedKFold(y, n_folds)
    scores = np.zeros(n_classes)
    count = 0
    count_f = None
    for train, test in skf:
        X_train = X[train][:, feature_mask]
        X_test = X[test][:, feature_mask]
        y_train, y_test = y[train], y[test]

        model = class_model(**params)
        model.fit(X_train, y_train)
        if measure_l1:
            count_f = np.sum(model.coef_[0] != 0.0)

        ans = model.predict_proba(X_test)
        for cl in range(n_classes):
            # One-vs-rest: class `cl` is the positive label, everything else negative.
            binary_test = (y_test == cl).astype(int)
            scores[cl] += roc_auc_score(binary_test, ans[:, cl])
        count += 1

    # Average over the folds actually evaluated rather than a hard-coded 3.
    mean_scores = scores / count
    if measure_l1:
        return mean_scores, count_f
    else:
        return mean_scores

# Earlier hyper-parameter sweeps (disabled):
#   * XGBClassifier over max_depth in [2, 10, 1000] and n_estimators in [10, 200, 500, 1000];
#   * L1 LogisticRegression over C in np.logspace(-2, 3) with measure_l1=True,
#     printing C, the non-zero coefficient count and the class-0 score.

# Column subsets, expressed as boolean masks over the 44 feature columns
# (see features_names for the column order).

# HullCos0-3 plus all 36 Dist* columns, i.e. everything except HullEuc0-3.
hull_features = np.zeros(X.shape[1], dtype=bool)
hull_features[4:8] = True
hull_features[-36:] = True

# Only the 36 Dist* (nearest-topic distance) columns.
dist_features = np.zeros(X.shape[1], dtype=bool)
dist_features[-36:] = True

# HullCos0-3 plus the Kl and Jac distance columns of every cut size (no Dist*Cos*).
# NOTE(review): despite the name, HullCos0-3 is still included -- confirm this is intended.
without_cos_features = np.zeros(X.shape[1], dtype=bool)
for start, stop in [(4, 8), (12, 20), (24, 32), (36, 44)]:
    without_cos_features[start:stop] = True

# Classifiers compared on every feature subset: (printed label, class, constructor kwargs).
experiments = [
    ("xgboost", XGBClassifier, {"n_estimators": 500, "max_depth": 1000}),
    ("log res", LogisticRegression, {"penalty": 'l1', "C": 1.1}),
]

for mask in ["all", hull_features, dist_features, without_cos_features]:
    for label, class_model, model_params in experiments:
        print(label)
        print(test_model(class_model, model_params, feature_mask=mask))
    print("")

xgboost
[ 0.91581239  0.78841807  0.79192655  0.88823564]
log res
[ 0.90653872  0.79199755  0.76288594  0.88054069]

xgboost
[ 0.91576627  0.79500341  0.79402917  0.88749197]
log res
[ 0.90662139  0.79252049  0.76401378  0.88075304]

xgboost
[ 0.9164757   0.78335229  0.78612837  0.88575645]
log res
[ 0.91031359  0.79257711  0.76522446  0.88242627]

xgboost
[ 0.91770239  0.76059809  0.78836893  0.87301022]
log res
[ 0.90383599  0.77659541  0.76090294  0.87348536]





In [17]:
# Fit the L1-regularised logistic regression on all samples, using only the columns
# selected by dist_features, so that its coefficients can be inspected below.
model = LogisticRegression(penalty='l1', C=1.1).fit(X[:, dist_features], y)

def get_importances(model, n_features):
    """Return one importance value per feature for a fitted model.

    Parameters
    ----------
    model : object
        Either a linear model exposing ``coef_`` or an xgboost-style model exposing
        ``get_booster()`` (or the older ``booster()``) whose booster provides
        ``get_fscore()``.
    n_features : int
        Length of the returned array for booster models. Ignored for linear models.

    Returns
    -------
    numpy.ndarray
        For linear models: the first row of ``coef_`` (for a multi-class model this
        is the coefficient vector of the first class only).
        For booster models: f-scores placed at the index parsed from keys of the
        form ``'f<index>'``; features absent from the f-score dict stay 0.
    """
    if hasattr(model, 'coef_'):
        return model.coef_[0]

    # Newer xgboost releases expose get_booster(); older ones only had booster().
    get_booster = getattr(model, 'get_booster', None) or model.booster
    fscores = get_booster().get_fscore()

    importances = np.zeros(n_features)
    for key, score in fscores.items():
        # Keys look like 'f12': strip the leading letter to get the column index.
        importances[int(key[1:])] = score
    return importances
    
# Pair every Dist* feature name (features_names[8:]) with the corresponding value
# from get_importances and keep only the non-zero ones.
all_importances = get_importances(model, X.shape[1])
nonzero_importances = {
    feature: weight
    for feature, weight in zip(features_names[8:], all_importances)
    if weight != 0.0
}
# Re-list them in the canonical features_names order as (name, value) pairs.
features_importances = [(name, nonzero_importances[name])
                        for name in features_names
                        if name in nonzero_importances]

# First the bare names, one per line with a trailing comma ...
for fi in features_importances:
    print('{},'.format(fi[0]))

# ... then "(value,name)" pairs.
for fi in features_importances:
    print('({},{})'.format(fi[1], fi[0]))
    

Dist20Cos0,
Dist20Cos2,
Dist20Kl3,
Dist20Jac0,
Dist20Jac1,
Dist20Jac2,
Dist20Jac3,
Dist100Jac0,
Dist100Jac1,
Dist100Jac2,
Dist100Jac3,
DistInfCos1,
DistInfCos3,
DistInfKl0,
DistInfJac1,
(-1.51708495115,Dist20Cos0)
(-1.29188329339,Dist20Cos2)
(1.53750528571,Dist20Kl3)
(0.13269859521,Dist20Jac0)
(0.137601913789,Dist20Jac1)
(0.114574753066,Dist20Jac2)
(0.137608623722,Dist20Jac3)
(0.0504267261387,Dist100Jac0)
(0.0870468754272,Dist100Jac1)
(0.430541513447,Dist100Jac2)
(0.34512426982,Dist100Jac3)
(-3.03241884272,DistInfCos1)
(2.10779418723,DistInfCos3)
(-4.0389550597,DistInfKl0)
(1.45687375076,DistInfJac1)
