## Plan

1. Load data organised into K folds (pickle file) [DONE]
2. Obtain histograms for classes and object labels [DONE]
3. Generate one-hot encodings for object labels
4. Label encoding for output classes
5. label encoding for multi-label training
6. Split into k folds of similar output class distribution

### Import system and set paths

In [1]:
import sys
sys.path.append('/Volumes/GoogleDrive/My Drive/my_SR_Research/Code/SR_Lib')
# Office old machine
sys.path.append('/Users/muskata/Google Drive File Stream/My Drive/my_SR_Research/Code/SR_Lib')

In [1]:
import sys
import numpy as np
import pickle
import copy
import json
from sklearn import preprocessing
#from sVOC2k_lib_feat import compute_geometrical_features
#from sVOC2k_lib_util import Object
#from sVOC2k_lib_util import cleanObjLabel
#from sVOC2k_lib_util import get_csv_string
#from sVOC2k_lib_lang import getVOCembeddings

In [2]:
from sklearn import metrics
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler


## Load data organised into K folds

In [11]:
#Python 2 implementation
#pkl_file = open('dataset_shuffle_1971_4.pkl', 'rb')
#pkl_file = open('dataset_shuffle_1971_5.pkl', 'rb')

#pkl_file = open('pickel_spatialvock/dataset_shuffle_1971_2017 .pkl', 'rb')
#pkl_file = open('dataset_shuffle_2000_2017.pkl', 'rb')

#data = pickle.load(pkl_file)
#pkl_file.close()

#Python 3 implementation 
# NOTE(review): unpickling can execute arbitrary code - only load dataset
# files that come from a trusted source.
# encoding='latin1' is presumably needed because the pickle was written by
# Python 2 (see the commented-out Python 2 loader above) - confirm.
with open('pickel_spatialvock/dataset_shuffle_1971_2017 .pkl', 'rb') as f:
    # Public equivalent of building a pickle._Unpickler and setting .encoding;
    # the "with" block closes the file, so no explicit close() is required.
    p = pickle.load(f, encoding='latin1')

# "p" is kept as an alias because the original cell left it defined.
data = p

In [13]:
# Show the top-level keys of the loaded dataset and the shape of the model
# matrix of one fold for both fold families.
data_keys = data.keys()
print (data_keys)
print (data['X_best_fold']['fold_3']['X_model'].shape)
print (data['X_all_fold']['fold_3']['X_model'].shape)

# Module-level handles on the fold dictionaries; concat_folds() and
# setup_TDT_sets() read these names.
X_best_fold = data['X_best_fold']
Y_best_fold = data['Y_best_fold']
X_all_fold = data['X_all_fold']
Y_all_fold = data['Y_all_fold']

# Pull out the encoder objects stored alongside the folds and print each one.
print ("Extract rel_pos_one_hot ...")
rel_pos_one_hot = data['rel_pos_one_hot']
print(rel_pos_one_hot)
print ("DONE")
#
print ("Extract obj_cls_one_hot ...")
obj_cls_one_hot = data['obj_cls_one_hot']
print(obj_cls_one_hot)
print ("DONE")
#
# prep_label_enc is used later to map class indices back to preposition names.
print ("Extract prep_label_enc ...")
prep_label_enc = data['prep_label_enc']
print(prep_label_enc)
print ("DONE")
#
print ("Extract prep_one_hot ...")
prep_one_hot = data['prep_one_hot']
print(prep_one_hot)
print ("DONE")
#

dict_keys(['obj_cls_one_hot', 'prep_label_enc', 'prep_one_hot', 'rel_pos_one_hot', 'X_best_fold', 'Y_all_fold', 'X_all_fold', 'Y_best_fold'])
(1058, 762)
(2292, 762)
Extract rel_pos_one_hot ...
LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)
DONE
Extract obj_cls_one_hot ...
LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)
DONE
Extract prep_label_enc ...
LabelEncoder()
DONE
Extract prep_one_hot ...
LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)
DONE


In [14]:
# Interactive inspection of one fold: only the value of the last expression
# is displayed by the notebook.
#data['X_best_fold']['fold_0']['meta-info']['geo_feat'].index('objDiagonalRatioTL')
data['X_best_fold']['fold_0'].keys()
# Maps each feature-group name to its (start, stop) column range in X_model.
data['X_best_fold']['fold_0']['X_headings']



{'obj_glove': (62, 162),
 'obj_code': (0, 2),
 'rel_position': (55, 59),
 'obj_word2vec': (162, 762),
 'geometric': (42, 55),
 'obj_one_hot': (2, 42),
 'depth_human': (59, 62)}

## Concatenate Folds

In [15]:
def concat_folds(X_folds, Y_folds, fold_list):
    """
    Concatenate the selected folds into one X set and one Y set.

    X_folds   : all the X folds, keyed 'fold_<n>', i.e X_best_fold or X_all_fold
    Y_folds   : all the Y folds, keyed 'fold_<n>', i.e Y_best_fold or Y_all_fold
    fold_list : non-empty list of fold integers to concatenate, in output order

    Returns (X_set, Y_set):
      X_set has 'X_headings' (taken by reference from the first listed fold),
            'X_model' (arrays stacked along axis 0) and 'obj_label' (list).
      Y_set has 'Y_headings' (by reference from the first listed fold),
            'Y_model' (array) and 'best_label', 'all_label', 'all_code' (lists).
    Array and list components are fresh copies, so modifying the result does
    not modify the folds.

    Raises AssertionError if fold_list is empty or names a fold that is not in
    X_folds (same contract as before; note asserts are skipped under -O).
    """
    # check that all numbers in fold_list are valid
    assert(len(fold_list) > 0), "fold _list is empty!"
    fold_keys = ['fold_'+str(f) for f in fold_list]
    for f_query in fold_keys:
        assert(f_query in X_folds), f_query + " not in X_folds"
    #
    # Define components to concatenate
    Y_comp = [('Y_model','array'), ('best_label', 'list'), ('all_label', 'list'), ('all_code', 'list') ]
    X_comp = [('X_model','array'), ('obj_label', 'list')]

    def _merge(folds, components, heading):
        # Build one output dictionary: heading first, then every component.
        merged = {heading: folds[fold_keys[0]][heading]}
        for name, kind in components:
            parts = [folds[key][name] for key in fold_keys]
            if kind == 'array':
                # A single concatenate call always allocates a new array and
                # avoids re-copying the growing result once per fold.
                merged[name] = np.concatenate(parts, axis=0)
            if kind == 'list':
                # Deep-copy fold by fold, exactly like the former "+=" chain.
                merged[name] = [item for part in parts
                                for item in copy.deepcopy(part)]
        return merged

    X_set = _merge(X_folds, X_comp, 'X_headings')
    Y_set = _merge(Y_folds, Y_comp, 'Y_headings')
    return X_set, Y_set

In [16]:

def setup_TDT_sets(train_folds, dev_folds, test_folds):
    """
    Build the train / dev / test sets as module-level globals.

    train_folds = e.g [0,1,2]
    dev_folds   = e.g [3]
    test_folds  = e.g [4]

    For both fold families ("b" = X_best_fold/Y_best_fold, "a" =
    X_all_fold/Y_all_fold) the upper-case globals hold the dictionaries
    returned by concat_folds and the lower-case globals hold the model
    inputs: the feat_list columns of 'X_model' and column 0 of 'Y_model'
    flattened to 1-D.  Always returns 0.
    """
    global X_b_train, Y_b_train, X_b_dev, Y_b_dev, X_b_test, Y_b_test
    global x_b_train, y_b_train, x_b_dev, y_b_dev, x_b_test, y_b_test
    #
    global X_a_train, Y_a_train, X_a_dev, Y_a_dev, X_a_test, Y_a_test
    global x_a_train, y_a_train, x_a_dev, y_a_dev, x_a_test, y_a_test

    def _inputs_and_targets(X_set, Y_set):
        # Selected feature columns and the first target column, both as copies.
        selected = np.copy(X_set['X_model'][:, feat_list])
        target = np.copy(Y_set['Y_model'][:, 0].ravel())
        return selected, target

    # best_prep sets
    X_b_train, Y_b_train = concat_folds(X_best_fold, Y_best_fold, train_folds)
    X_b_dev, Y_b_dev = concat_folds(X_best_fold, Y_best_fold, dev_folds)
    X_b_test, Y_b_test = concat_folds(X_best_fold, Y_best_fold, test_folds)
    x_b_train, y_b_train = _inputs_and_targets(X_b_train, Y_b_train)
    x_b_dev, y_b_dev = _inputs_and_targets(X_b_dev, Y_b_dev)
    x_b_test, y_b_test = _inputs_and_targets(X_b_test, Y_b_test)

    # all_prep sets
    X_a_train, Y_a_train = concat_folds(X_all_fold, Y_all_fold, train_folds)
    X_a_dev, Y_a_dev = concat_folds(X_all_fold, Y_all_fold, dev_folds)
    X_a_test, Y_a_test = concat_folds(X_all_fold, Y_all_fold, test_folds)
    x_a_train, y_a_train = _inputs_and_targets(X_a_train, Y_a_train)
    x_a_dev, y_a_dev = _inputs_and_targets(X_a_dev, Y_a_dev)
    x_a_test, y_a_test = _inputs_and_targets(X_a_test, Y_a_test)

    return 0


In [10]:
def get_pred_rank(a):
    """
    Rank the classes of one example by predicted probability.

    a : probability vector, one entry per class (preposition)

    Returns a structured array with fields 'prob' and 'idx' (the class
    index), sorted by ascending 'prob'; the most probable class is last.
    """
    n_classes = len(a)
    ranked = np.empty(n_classes, dtype=[('prob', float), ('idx', int)])
    ranked['prob'] = a
    ranked['idx'] = np.arange(n_classes)
    return np.sort(ranked, order='prob')

def get_pred_vector(R):
    """
    Pick the highest ranked class for every example.

    R : predicted probabilities, shape (number of test examples, number of classes (prepositions))

    Returns a float array of shape (number of test examples, 1) holding the
    index of the last (most probable) entry of get_pred_rank for each row.
    """
    n_examples = R.shape[0]
    best = np.zeros([n_examples, 1], dtype=float)
    for row in range(n_examples):
        top_entry = get_pred_rank(R[row])[-1]
        best[row, 0] = float(top_entry['idx'])

    return best

def get_recall_at_k(y_pred,y_test,k):
    """
    Fraction of examples whose expected class is among the k most probable
    predicted classes.

    k      : compute recall @ k
    y_pred : predicted probabilities from model, shape (size of y_test, number of classes)
    y_test : expected output class per example (converted with int())
    """
    n_examples = len(y_test)
    hits = 0
    for row in range(n_examples):
        ranking = get_pred_rank(y_pred[row])
        # get_pred_rank sorts ascending, so reverse to get best-first order
        best_first = np.asarray(ranking['idx'], dtype=int)[::-1]
        if int(y_test[row]) in best_first[:k]:
            hits += 1
    return float(hits)/float(n_examples)

def get_acc_all(Y_pred,y_test_all):
    """
    Fraction of predictions that are contained in the matching entry of
    y_test_all (the i-th prediction is looked up in y_test_all[i]).
    """
    n_pred = len(Y_pred)
    hits = sum(1 for i in range(n_pred) if Y_pred[i] in y_test_all[i])
    return float(hits)/n_pred

def get_recall_for_all_at_k(y_pred,y_all_test,k):
    """
    Fraction of examples for which at least one of the k most probable
    predicted classes is among the accepted classes of that example.

    k          : compute recall @ k
    y_pred     : predicted probabilities from model, shape (size of y_all_test, number of classes)
    y_all_test : for each example, the collection of expected output classes

    return:
    a single float in [0, 1]

    Raises ZeroDivisionError when y_all_test is empty.
    """
    m = len(y_all_test)
    correct=0
    for i in range(m):
        # Rank the rows of the y_pred argument (previously the global y_p was
        # read by mistake).  Class indices by descending probability, ties
        # resolved with the higher index first - the same order that flipping
        # get_pred_rank(...)['idx'] produces.
        y_rank = np.argsort(y_pred[i], kind='stable')[::-1]
        if any(p in y_all_test[i] for p in y_rank[:k]):
            correct+=1
    #
    return float(correct)/float(m)

def get_recall_per_prep_at_k(y_pred,y_test,k):
    """
    Recall @ k broken down per class (preposition).

    k      : compute recall @ k
    y_pred : predicted probabilities from model, shape (size of y_test, number of classes)
    y_test : expected output class per example, shape (m, 1) or (m,)

    Returns (recall_per_prep, recall_average, recall_weighted_average,
             prep_in_test, prep_correct):
      prep_in_test    : per class, number of test examples with that class
      prep_correct    : per class, how many of those had the class in the top k
      recall_per_prep : prep_correct/prep_in_test, NaN for classes that do not
                        occur in y_test
      recall_average  : mean of recall_per_prep over the classes that occur
      recall_weighted_average : sum(prep_correct)/sum(prep_in_test)

    The per-class arrays have one entry per column of y_pred (or more if
    y_test holds a larger label), so a class that is missing from a fold no
    longer causes an IndexError or a shorter result.
    """
    probs = np.asarray(y_pred, dtype=float)
    labels = np.asarray(y_test, dtype=int).reshape(-1)
    n_classes = probs.shape[1] if probs.ndim == 2 else 0
    if labels.size:
        n_classes = max(n_classes, int(labels.max()) + 1)
    #
    # histogram of the expected classes
    prep_in_test = np.zeros(n_classes, dtype=float)
    np.add.at(prep_in_test, labels, 1)
    #
    prep_correct = np.zeros(n_classes, dtype=float)
    if labels.size:
        # Per row: class indices by descending probability, ties resolved
        # with the higher index first (same order as flipping get_pred_rank).
        ranked = np.argsort(probs[:labels.size], axis=1, kind='stable')[:, ::-1]
        hit = (ranked[:, :k] == labels[:, np.newaxis]).any(axis=1)
        np.add.at(prep_correct, labels[hit], 1)
    #
    tested = prep_in_test > 0
    recall_per_prep = np.full(n_classes, np.nan)
    recall_per_prep[tested] = prep_correct[tested]/prep_in_test[tested]
    if tested.any():
        recall_weighted_average = np.sum(prep_correct)/np.sum(prep_in_test)
        recall_average = np.mean(recall_per_prep[tested])
    else:
        # nothing to evaluate
        recall_weighted_average = np.nan
        recall_average = np.nan
    #
    return recall_per_prep, recall_average, recall_weighted_average, prep_in_test, prep_correct

def get_precision_at_k(y_pred,y_test,k):
    """
    Fraction of examples for which one of the k most probable predicted
    classes matches an expected class of that example.

    k      : number of top-ranked classes to consider
    y_pred : predicted probabilities from model, shape (size of y_test, number of classes)
    y_test : expected output classes; each entry is a class index or a
             collection of class indices
             NOTE(review): entries are treated as class codes, as in
             get_recall_at_k / get_precision_per_prep - confirm.

    The previous test, "if (<generator expression>):", was always true, so
    the function returned 1.0 for every input.

    Raises ZeroDivisionError when y_test is empty.
    """
    m = len(y_test)
    correct=0
    for i in range(m):
        # class indices by descending probability, ties -> higher index first
        # (same order as flipping get_pred_rank(...)['idx'])
        y_rank = np.argsort(y_pred[i], kind='stable')[::-1]
        expected = np.atleast_1d(np.asarray(y_test[i])).astype(int)
        if np.isin(y_rank[:k], expected).any():
            correct+=1
    return float(correct)/float(m)

def get_precision_per_prep(y_pred,y_test):
    """
    Precision of the top-1 prediction, broken down per class (preposition).

    inputs:
        y_pred : predicted probabilities per preposition size(m,number of unique prepositions)
        y_test : gold annotations: could be either single or multi-label
                 (y_test[i] is a class index or a collection of class indices)
    returns:
        acc_p(0) : the precision accuracy per preposition or per class
                   (-1) means that the preposition was never predicted
        global_mean   : the mean over all classes, not just predicted ones
                        (never-predicted classes count as 0)
        global_wt_mean : the mean over all classes weighted by number of predicted over all classes
        acc_p_mean    : the mean accuracy among the predicted classes
        acc_p_wt_mean : the sum of class accuracies weighted by the number predicted as a ratio of total
                  predicted over all classes
        count_correct   : per class, number of correct top-1 predictions
        count_predicted : per class, number of times it was the top-1 prediction
    """
    y_pred = np.asarray(y_pred, dtype=float)
    N = y_pred.shape[1]  #Number of prepositions
    m = len(y_test)  #Number of test instances
    count_predicted = np.zeros((N),dtype=float)  # times a preposition is predicted
    count_correct = np.zeros((N),dtype=float)    # times that prediction is correct
    #
    for i in range(m):
        # Rank the y_pred argument (previously the global y_p was read by
        # mistake).  Last entry of a stable ascending sort = most probable
        # class, ties resolved to the higher index like get_pred_rank.
        top = int(np.argsort(y_pred[i], kind='stable')[-1])
        count_predicted[top] += 1
        if top in np.atleast_1d(np.asarray(y_test[i],dtype=int)):
            count_correct[top] += 1
    #
    was_predicted = count_predicted > 0
    acc_p = np.full((N), -1., dtype=float)
    acc_p[was_predicted] = count_correct[was_predicted]/count_predicted[was_predicted]
    #
    # Compute global means
    # replace the "-1s" by 0 so that they do not distort the means
    zeroed_acc_p = np.where(was_predicted, acc_p, 0.)
    #
    assert(int(count_predicted.sum()) == m), "Wrong number of predicted prepositions"
    global_mean = np.mean(zeroed_acc_p)
    global_wt_mean = np.sum(zeroed_acc_p*(count_predicted/count_predicted.sum()))
    #
    # Means restricted to the classes that were predicted at least once
    acc_p_array = acc_p[was_predicted]
    count_array = count_predicted[was_predicted]
    acc_p_mean = np.mean(acc_p_array)
    acc_p_wt_mean = np.sum(acc_p_array*count_array/np.sum(count_array))
    #
    return acc_p, global_mean, global_wt_mean, acc_p_mean, acc_p_wt_mean, count_correct, count_predicted


In [3]:
def prediction_labels(a,p):
    """
    Zero out the scores that fall below a percentage cut-off.

    a : sequence of scores (fractions; compared after multiplying by 100)
    p : percentage cut off point e.g 50

    Returns (pred_l, h_count): pred_l has the same length as a, with every
    score below the cut-off replaced by 0; h_count is how many were kept.
    """
    pred_l = [0 if score*100 < p else score for score in a]
    h_count = sum(1 for score in a if not (score*100 < p))
    return pred_l,h_count

def get_pred_rank1(a):
    """
    Rank the classes of one example by predicted probability.

    a : probability vector, one entry per class (preposition)

    Returns a structured array with fields 'prob' and 'idx' (the class
    index), sorted by ascending 'prob'; the most probable class is last.
    """
    n_classes = len(a)
    ranked = np.empty(n_classes, dtype=[('prob', float), ('idx', int)])
    ranked['prob'] = a
    ranked['idx'] = np.arange(n_classes)
    return np.sort(ranked, order='prob')

def ordered_intersection(y, h):
    """
    Compare two indicator sequences position by position.

    Returns (common, y_count, h_count): common holds y[i] for every position
    where both y[i] and h[i] equal 1; y_count / h_count are the number of
    entries equal to 1 in y and in h (over the length of y).
    """
    common = []
    ones_in_y = 0
    ones_in_h = 0
    for pos, y_val in enumerate(y):
        h_val = h[pos]
        y_is_one = (y_val == 1)
        if y_is_one:
            ones_in_y += 1
        if h_val == 1:
            ones_in_h += 1
        if y_is_one and y_val == h_val:
            common.append(y_val)
    return common, ones_in_y, ones_in_h

def get_precision_at_k1(y_pred,y_test,k):
    """
    Fraction of examples that have at least one expected class among the k
    most probable predicted classes.

    k      : number of top-ranked classes to consider
    y_pred : predicted probabilities from model, shape (size of y_test, number of classes)
    y_test : per example an indicator vector; entry p equal to 1 marks class p as expected
    """
    n_examples = len(y_test)
    hits = 0
    for row in range(n_examples):
        ranking = get_pred_rank1(y_pred[row])
        # ascending ranking flipped -> most probable class first
        top_k = np.flip(np.asarray(ranking['idx'],dtype=int),0)[:k]
        if any(y_test[row][p] == 1 for p in top_k):
            hits += 1
    return float(hits)/float(n_examples)

# Toy check of get_precision_at_k1: 7 examples x 9 classes, scores are 0/1.
predictions = [[1,1,0,0,0,0,0,0,0],
               [0,0,1,1,0,0,0,0,0],
               [0,0,0,0,1,1,0,0,0],
               [0,0,0,0,0,0,1,1,0],
               [0,0,0,0,0,0,0,1,1],
               [0,0,0,0,0,0,1,1,0],
               [0,0,0,0,0,0,1,1,0]]

# Gold labels identical to the predictions (shallow copy of the outer list).
Y_test = predictions.copy()

#Y_test[0] = [1,1,0,0,1,0,0,0,0]
pres = get_precision_at_k1(predictions,Y_test,3)
print(pres)

1.0


In [38]:
def y_count(a):
    """Return how many entries of the sequence a are equal to 1."""
    return sum(1 for pos in range(len(a)) if a[pos] == 1)

def get_recall_at_k1(y_pred,y_test,k):
    """
    Mean over the examples of (expected classes found in the top k) /
    (number of expected classes).

    k      : compute recall @ k
    y_pred : predicted probabilities from model, shape (size of y_test, number of classes)
    y_test : per example an indicator vector; entry p equal to 1 marks class p as expected
    """
    n_examples = len(y_test)

    def _example_recall(row):
        # recall of a single example
        n_expected = y_count(y_test[row])
        ranking = get_pred_rank1(y_pred[row])
        top_k = np.flip(np.asarray(ranking['idx'],dtype=int),0)[:k]
        found = sum(1 for p in top_k if y_test[row][p] == 1)
        return float(found/n_expected)

    recalls = [_example_recall(row) for row in range(n_examples)]
    return float(sum(recalls))/float(n_examples)

    
# Toy check of get_recall_at_k1: 7 examples x 9 classes, scores are 0/1.
predictions = [[1,1,0,0,0,0,0,0,0],
               [0,0,1,1,0,0,0,0,0],
               [0,0,0,0,1,1,0,0,0],
               [0,0,0,0,0,0,1,1,0],
               [0,0,0,0,0,0,0,1,1],
               [0,0,0,0,0,0,1,1,0],
               [0,0,0,0,0,0,1,1,0]]

# Shallow copy of the outer list; replacing element 0 below therefore leaves
# predictions[0] untouched.
Y_test = predictions.copy()
# Give the first example an extra expected class (3), which has score 0.
Y_test[0] = [1,1,0,1,0,0,0,0,0]
pres = get_recall_at_k1(predictions,Y_test,3)
print(pres)

0.9523809523809523


In [39]:
from collections import Counter

def y_count_1(a):
    """Return the positions (in increasing order) at which a equals 1."""
    return [pos for pos in range(len(a)) if a[pos] == 1]

def get_pred_rank1(a):
    """
    Rank the classes of one example by predicted probability.

    a : probability vector, one entry per class (preposition)

    Returns a structured array with fields 'prob' and 'idx' (the class
    index), sorted by ascending 'prob'; the most probable class is last.
    """
    n_classes = len(a)
    ranked = np.empty(n_classes, dtype=[('prob', float), ('idx', int)])
    ranked['prob'] = a
    ranked['idx'] = np.arange(n_classes)
    return np.sort(ranked, order='prob')

def TP_FN(y_pred,y_test,k):
    """
    Collect, over all examples, the expected classes that were found (TP) or
    missed (FN) by the k most probable predicted classes.

    y_pred : predicted probabilities, one row per example
    y_test : per example an indicator vector; entry p equal to 1 marks class p as expected
    k      : number of top-ranked classes to consider

    Returns (TP, FN): two lists of class indices, one entry per
    (example, expected class) pair.
    """
    TP, FN = [], []
    for row in range(len(y_pred)):
        ranking = get_pred_rank1(y_pred[row])
        expected = y_count_1(y_test[row])
        # ascending ranking flipped -> most probable class first
        top_k = np.flip(np.asarray(ranking['idx'],dtype=int),0)[:k]
        for p in expected:
            bucket = TP if p in top_k else FN
            bucket.append(p)

    return TP,FN

def get_per_label(TP,FN,n):
    """
    Recall per label, TP/(TP+FN), for the labels 0 .. n-1.

    TP, FN : mappings (anything with .get) from label to count; a missing
             label counts as 0
    n      : number of labels

    Returns a list of n recalls; a label with TP+FN == 0 gets recall 0.
    """
    def _count(table, label):
        value = table.get(label)
        return 0 if value is None else value

    Recalls = []
    for label in range(n):
        found = _count(TP, label)
        missed = _count(FN, label)
        support = found + missed
        Recalls.append(found/support if support != 0 else 0)

    return Recalls

#Recall => TP/TP+FN
#def Micro_Averaging(y_pred,y_test,k):
    
# Toy check of TP_FN / get_per_label: 7 examples x 9 classes, scores are 0/1.
predictions = [[1,1,0,0,0,0,0,0,0],
               [0,0,1,1,0,0,0,0,0],
               [0,0,0,0,1,1,0,0,0],
               [0,0,0,0,0,0,1,1,0],
               [0,0,0,0,0,0,0,1,1],
               [0,0,0,0,0,0,1,1,0],
               [0,0,0,0,0,0,1,1,0]]

# Shallow copy of the outer list; replacing element 0 below therefore leaves
# predictions[0] untouched.
Y_test = predictions.copy()
# Give the first example an extra expected class (3), which has score 0.
Y_test[0] = [1,1,0,1,0,0,0,0,0]
TP,FN  = TP_FN(predictions,Y_test,3)
# Count how often each class index appears among the hits and the misses.
TP_Counter = Counter(TP)
FN_Counter = Counter(FN)
Recalls = get_per_label(TP_Counter,FN_Counter,len(Y_test[0]))
print(Recalls)
# Unweighted mean of the per-class recalls.
print(sum(Recalls)/len(Recalls))

[1.0, 1.0, 1.0, 0.5, 1.0, 1.0, 1.0, 1.0, 1.0]
0.9444444444444444


In [39]:
def train_model(model_type, x_train, y_train, HP_dict):
    """
    Build the requested scikit-learn classifier and fit it.

    model_type: which model to train: LR, RF, DT, NN, kNN, SVM
    x_train   : training inputs, passed unchanged to fit()
    y_train   : training targets, passed unchanged to fit()
    HP_dict   : hyper-parameters; the keys read depend on model_type:
                  LR  : 'C', 'tol'
                  DT  : 'tree_depth', 'min_samples_split', 'min_samples_leaf'
                  RF  : 'n_estimators', 'max_features', 'max_depth',
                        'min_samples_split', 'min_samples_leaf'
                  NN  : 'alpha', 'tol', 'H1', 'H2'
                  SVM : 'tol', 'C', 'gamma'
                  kNN : 'n_neighbors'
    Returns: fitted instance of model
    Raises : ValueError for an unknown model_type (previously this surfaced
             as an UnboundLocalError at the return statement),
             KeyError when a required hyper-parameter is missing.
    """
    if model_type == "LR":
        # NOTE(review): multi_class is deprecated in recent scikit-learn
        # releases; kept because dropping it would change the fitted model.
        this_model = LogisticRegression(C=HP_dict['C'], penalty='l2', tol = HP_dict['tol'],
                                        multi_class='ovr', max_iter=500, random_state=1971)
    elif model_type == "DT":
        this_model = tree.DecisionTreeClassifier(max_depth=HP_dict['tree_depth'],
                                                 min_samples_split = HP_dict['min_samples_split'],
                                                 min_samples_leaf=HP_dict['min_samples_leaf'],
                                                 random_state=1971)
    elif model_type == 'RF':
        this_model = RandomForestClassifier(n_estimators = HP_dict['n_estimators'],  #40
                                            max_features = HP_dict['max_features'],  #40
                                            max_depth = HP_dict['max_depth'],    #9
                                            min_samples_split = HP_dict['min_samples_split'],
                                            min_samples_leaf = HP_dict['min_samples_leaf'],
                                            random_state=1971,
                                            )
    elif model_type=='NN':
        this_model = MLPClassifier(solver='lbfgs',  # lbfgs
                                   alpha = HP_dict['alpha'], #1e-5,
                                   #max_iter=500,
                                   tol=HP_dict['tol'], #0.0001
                                   hidden_layer_sizes=(HP_dict['H1'],HP_dict['H2']), # 20,10
                                   random_state=1971,
                                   )
    elif model_type=='SVM':
        this_model = svm.SVC(      kernel='rbf',  # rbf
                                   probability = True, # True
                                   # The string 'none' is not an accepted value
                                   # ('ovo' or 'ovr'); 'ovo' leaves the
                                   # one-vs-one decision values untransformed.
                                   decision_function_shape = 'ovo',
                                   tol   = HP_dict['tol'], # 0.01
                                   C     = HP_dict['C'], # 1.0
                                   gamma = HP_dict['gamma'], # 0.01
                                   random_state=1971,
                                   )
    elif model_type=='kNN':
        this_model = KNeighborsClassifier(
                                   n_neighbors=HP_dict['n_neighbors']
                                   )
    else:
        raise ValueError("Unknown model_type: " + repr(model_type))
    #
    this_model.fit(x_train, y_train)
    return this_model

In [20]:
# Number of cross-validation folds = number of entries in X_best_fold.
K_cv = len(X_best_fold)
print(K_cv)

5


# SELECT features here

In [20]:
# compute feature selection array 
# List the feature groups and their (start, stop) column ranges in X_model.
# (print() calls instead of Python 2 print statements: the data loading cell
# requires Python 3, where the statement form is a SyntaxError.)
print("Features available:")
for hh in X_best_fold['fold_0']['X_headings']:
    print("    ", hh, ":", X_best_fold['fold_0']['X_headings'][hh])
# Feature groups to use as model input - uncomment exactly one selection.
feat_head = []
#feat_head = ['obj_glove']
#feat_head = ['obj_word2vec']
#feat_head = ['obj_one_hot','geometric', 'rel_position','depth_human']
#feat_head = ['geometric','depth_human']
feat_head = ['geometric', 'rel_position']
#feat_head = ['obj_one_hot', 'geometric']
#feat_head = ['obj_one_hot', 'geometric', 'rel_position']
#feat_head = ['obj_code', 'geometric', 'rel_position']
#feat_head = ['obj_code', 'geometric']
#feat_head = ['obj_one_hot']
#feat_head = ['obj_one_hot','depth_human']
#
# compute feature headings
# feat_list     : X_model column indices of the selected groups, in order
# feat_head_col : per group, its [start, stop) range inside the reduced matrix
feat_head_col={}
feat_list = []
col_prev=0
for fh in feat_head:
    col_0, col_1 = X_best_fold['fold_0']['X_headings'][fh]
    col_last = col_prev + (col_1-col_0)
    feat_head_col[fh] = [col_prev, col_last]
    col_prev = col_last
    print("Copying feature indices :", fh, ':columns:', col_0, 'to', col_1-1)
    feat_list.extend(range(col_0, col_1))
#============================================================================
# Remove diagonal feature
#feat_list.remove(53)
#============================================================================
print(feat_list)
print(feat_head_col)

Features available:
     obj_glove : (62, 162)
     obj_code : (0, 2)
     rel_position : (55, 59)
     obj_word2vec : (162, 762)
     geometric : (42, 55)
     obj_one_hot : (2, 42)
     depth_human : (59, 62)
Copying feature indices : geometric :columns: 42 to 54
Copying feature indices : rel_position :columns: 55 to 58
[42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58]
{'geometric': [0, 13], 'rel_position': [13, 17]}


In [21]:
# print() calls: the Python 2 print statement is a SyntaxError on Python 3.
print("Features : " + str(feat_head))
print(len(feat_list))

Features : ['geometric', 'rel_position']
17


In [14]:
# Example on training a model
# setup_TDT_sets([0,1,2,3], [4], [4])
# model_lr = train_model('LR', x_b_train, y_b_train, {'C':10, 'tol':.001})

In [15]:
# print() calls: the Python 2 print statement is a SyntaxError on Python 3.
# list(...) keeps the plain list output instead of "dict_keys([...])".
print(list(X_best_fold['fold_0'].keys()))
print(X_best_fold['fold_0']['meta-info']['geo_feat'])

['obj_glove', 'meta-data', 'X_headings', 'obj_one_hot', 'obj_word2vec', 'obj_label', 'obj_code', 'X_model', 'geometric', 'rel_position', 'depth_human', 'meta-info']
['AreaObj1_Norm_wt_Image', 'AreaObj2_Norm_wt_Image', 'AreaOverlap_Norm_wt_Min', 'AspectRatioObj_1', 'AspectRatioObj_2', 'DistBtCentr_Norm_wt_ImageDiag', 'DistanceSizeRatio', 'InvFeatXmaxXmin', 'InvFeatXminXmin', 'InvFeatYmaxYmin', 'InvFeatYminYmin', 'objAreaRatioTrajLand', 'relativePosition']


# Test Loop

In [33]:
print "FEATURE SET : ", feat_head
print "                    train, dev, test,  tr_sngl  tr_mlti  ts_sngl  ts_mlti   wt_m_1  wt_m_2"
results = np.zeros((K_cv+1,8),dtype=float)
results_acc_p = np.zeros((K_cv+1,17),dtype=float)   # 17 should be a variable (number of unique preps)
res_prep_predict = np.zeros((K_cv+1,17),dtype=float)   # 17 should be a variable (number of unique preps)
res_prep_correct = np.zeros((K_cv+1,17),dtype=float)   # 17 should be a variable (number of unique preps)
res_recall = np.zeros((K_cv+1,4),dtype=float)
res_recall_all = np.zeros((K_cv+1,4),dtype=float)
res_conf_mat = np.zeros((K_cv+1,len(prep_label_enc.classes_),len(prep_label_enc.classes_)),dtype=float)
res_recall_tested = np.zeros((4,K_cv+1,17),dtype=float)   # 17 should be a variable (number of unique preps)
res_recall_correct = np.zeros((4,K_cv+1,17),dtype=float)   # 17 should be a variable (number of unique preps)


for j_test in range(K_cv):
    # select folds
    all_folds = range(K_cv)
    all_folds.remove(j_test)
    train_folds = list(all_folds)
    dev_folds = [j_test] # Dummy
    test_folds = [j_test]
    print "TEST fold:%4s"%str(test_folds),":",
    print "%20s"%(str(train_folds)+ str(dev_folds)+ str(test_folds)),
    setup_TDT_sets(train_folds, dev_folds, test_folds )
    #
    # ================================================================================================
    scale_feat = True
    if 'geometric' in feat_head and scale_feat:
        scaler = StandardScaler()
        col_0, col_1 = feat_head_col['geometric']
        scaler.fit(x_b_train[:,col_0:col_1])  # training with best
        x_b_train[:,col_0:col_1] = scaler.transform(x_b_train[:,col_0:col_1])  
        x_b_dev[:,col_0:col_1] = scaler.transform(x_b_dev[:,col_0:col_1])
        x_b_test[:,col_0:col_1] = scaler.transform(x_b_test[:,col_0:col_1])
        x_a_train[:,col_0:col_1] = scaler.transform(x_a_train[:,col_0:col_1])  
        x_a_dev[:,col_0:col_1] = scaler.transform(x_a_dev[:,col_0:col_1])
        x_a_test[:,col_0:col_1] = scaler.transform(x_a_test[:,col_0:col_1])
    # ================================================================================================
    #
    # ================================================================================================
    model = 'RF'
    data_set = 'expanded'
    #data_set = 'best'
    #
    if data_set == 'expanded': 
        x_train = x_a_train
        y_train = y_a_train
        Y_train = Y_a_train
        x_test = x_b_test
        yy_test = y_b_test
        Y_test = Y_b_test        

     
#         x_test = x_a_test
#         yy_test = y_a_test
#         Y_test = Y_a_test
    if data_set == 'best':
        x_train = x_b_train
        y_train = y_b_train
        Y_train = Y_b_train
        
        x_test = x_b_test
        yy_test = y_b_test
        Y_test = Y_b_test        
    #
    
    h_param = {}
    h_param['LR'] = {'C':1, 'tol':.001}
    h_param['DT'] = {'tree_depth':8, 'min_samples_split':4, 'min_samples_leaf':2}
    h_param['RF'] = {'n_estimators':100, 'max_features':17,'max_depth':9, 'min_samples_leaf':2, 'min_samples_split':10}
    #h_param['NN'] = {'alpha':1e-4, 'H1':12,  'H2':10, 'tol':0.001}
    h_param['SVM'] = {'C':10.0, 'gamma':0.01, 'tol':0.01}
    h_param['kNN'] = {'n_neighbors':17}
    #
    model_exp = train_model(model, x_train, y_train, h_param[model]) 
    
    #
    #if model == 'DT':
    #    model_exp = train_model(model, x_train, y_train, {'tree_depth':6}) # 6 or 7
    #
    # ================================================================================================
    #
    #
    # test on BEST_TRAIN_SINGLE
    y_p = model_exp.predict_proba(x_train)
    y_test = y_train
    y_all_test = Y_train['all_code']

    #y_p = model_exp.predict_proba(x_a_train)
    #y_test = y_a_train
    #y_all_test = Y_a_train['all_code']
    results[j_test,0] = get_recall_at_k(y_p, y_test, 1)
    results[j_test,1] = get_recall_for_all_at_k(y_p, y_all_test, 1)
    print " %5.3f    %5.3f   "%(results[j_test,0], results[j_test,1]),

    # test on BEST_TEST_SINGLE and BEST_TEST_MULTI
    y_p = model_exp.predict_proba(x_test)
    y_test = yy_test
    y_all_test = Y_test['all_code']   
    
    #y_p = model_exp.predict_proba(x_a_test)
    #y_test = y_a_test
    #y_all_test = Y_a_test['all_code']   
    results[j_test,2] = get_recall_at_k(y_p, y_test, 1)
    for r in range(4):
        res_recall[j_test,r] = get_recall_at_k(y_p, y_test, r+1)
        res_recall_all[j_test,r] = get_recall_for_all_at_k(y_p, y_all_test, r+1)
    results[j_test,3] = get_recall_for_all_at_k(y_p, y_all_test, 1)
    #Y_pred=get_pred_vector(y_p)
    #res_conf_mat[j_test] = np.copy(confusion_matrix(y_test, Y_pred))
    print "%5.3f   %5.3f   "%(results[j_test,2], results[j_test,3]),
    acc_p, global_mean, global_wt_mean, acc_p_mean, acc_p_wt_mean, count_correct, count_predicted =\
                                                            get_precision_per_prep(y_p, y_all_test)
    results[j_test,4] = global_mean
    results[j_test,5] = global_wt_mean
    results_acc_p[j_test] = np.copy(acc_p)
    res_prep_predict[j_test] = np.copy(count_predicted)
    res_prep_correct[j_test] = np.copy(count_correct)
    #results[j_test,6] = acc_p_mean
    #results[j_test,7] = acc_p_wt_mean
    #
    print "%5.3f   %5.3f   "%(results[j_test,4], results[j_test,5] ),
    print "%5.3f   %5.3f   "%(results[j_test,6], results[j_test,7] )
    # Compute recall per preposition
    for r in range(4):
        recall_per_prep, recall_average, recall_weighted_average, prep_in_test, prep_correct =\
                                    get_recall_per_prep_at_k(y_p, y_test.reshape((y_test.shape[0],1)), r+1)
        res_recall_tested[r,j_test,:] = np.copy(prep_in_test)
        res_recall_correct[r,j_test,:] = np.copy(prep_correct)

    
    
print "=========================================================================================="
results[-1,:] = np.mean(results[0:-1,:],axis=0)
print "                                       %5.3f    %5.3f    %5.3f   %5.3f    %5.3f   %5.3f    %5.3f   %5.3f"\
                                                    %(results[-1,0],\
                                                    results[-1,1], results[-1,2], results[-1,3],\
                                                    results[-1,4], results[-1,5],\
                                                    results[-1,6], results[-1,7])
#
# standard recall@k
print"b_BEST_TEST: recall@k results: "
print"================================"
res_recall[-1,:] = np.mean(res_recall[0:-1,:],axis=0)
for j in range(K_cv+1):
    for r in range(4):
        print "%5.3f  "%(res_recall[j,r]),
    print
    if j==K_cv-1:
        print"================================"
print"================================"
print
#
# system level accuracy
print"b_ALL_TEST: recall@k results: "
print"================================"
res_recall_all[-1,:] = np.mean(res_recall_all[0:-1,:],axis=0)
for j in range(K_cv+1):
    for r in range(4):
        print "%5.3f  "%(res_recall_all[j,r]),
    print
    if j==K_cv-1:
        print"================================"
print"================================"

# Compute precison per preposition (acc_p(0))
p_predict = np.sum(res_prep_predict, axis=0)
p_correct = np.sum(res_prep_correct, axis=0)
N=p_predict.shape[0]
acc_p_0 = np.zeros((N),dtype=float)
for n in range(N):
    if p_predict[n] > 0:
        acc_p_0[n] = float(p_correct[n])/float(p_predict[n])
    else:
        acc_p_0[n] = -1.
results_acc_p[-1,:] = np.copy(acc_p_0)
acc_p_0_wt_mean = np.sum(acc_p_0*p_predict/p_predict.sum())
print "precision mean (acc_p(0)) = ",acc_p_0_wt_mean
print '%2d %-25s %5s  %5s  %5s  %5s  %5s  %5s '%(0,'Preposition', "F1","F2","F3","F4","F5","wt_mean")
for j in range(N):
    print '%2d %-25s '%(j,prep_label_enc.inverse_transform(j)+'('+str(p_predict[j])+')'),
    #print '%2d %-14s '%(j,''),

    for i in range(results_acc_p.shape[0]):
        if results_acc_p[i,j]==-1:
            print "%5s "%" - ",
        else:
            print "%5.3f "%results_acc_p[i,j],
    print
print"================================"

#acc_p_0_wt_mean = np.sum(acc_p_0*p_predict/p_predict.sum())
#print acc_p_0_wt_mean
#
# Recall@k per preposition
# The last slice along axis 1 of both count arrays is filled with the sum of
# all preceding slices (presumably one slice per fold -- confirm).
res_recall_tested[:,-1,:] = np.sum(res_recall_tested[:,0:-1,:], axis=1)
res_recall_correct[:,-1,:] = np.sum(res_recall_correct[:,0:-1,:], axis=1)
# NOTE(review): an entry with a zero tested count makes this an element-wise
# division by zero (nan/inf), which would also propagate into the weighted
# mean below -- confirm every preposition is tested at least once.
recall_per_p_at_k = np.copy(res_recall_correct[:,-1,:]/res_recall_tested[:,-1,:])
# per-row mean over the last axis, weighted by each entry's share of the tested count
recall_per_p_wt_mean = np.copy(np.sum(recall_per_p_at_k*res_recall_tested[:,-1,:]/
                               np.sum(res_recall_tested[:,-1,:], axis=1, keepdims=True), axis=1))
#
print '%2d %25s %5s  %5s  %5s  %5s  %5s'%(0,'Preposition', "k=1","k=2","k=3","k=4","wt_mean")
for j in range(N):
    # row label: decoded preposition name followed by its tested count (taken from row 0)
    print '%2d %-25s '%(j,prep_label_enc.inverse_transform(j)+'('+str(res_recall_tested[0,-1,j])+')'),
    for i in range(recall_per_p_at_k.shape[0]):    
        print "%5.3f "%recall_per_p_at_k[i,j],
    print    
print"================================================================"
# summary row: weighted means for the first four rows of recall_per_p_wt_mean
print '%2d %-25s %5.3f  %5.3f  %5.3f  %5.3f'%(0,'',recall_per_p_wt_mean[0],\
                                      recall_per_p_wt_mean[1],\
                                      recall_per_p_wt_mean[2],\
                                      recall_per_p_wt_mean[3])
print"================================================================"
#
#
# Confusion matrix
#
# Save to file
## Save to file
"""
1. Model type: LR, RF, NN, SVM, DT [done]
2. Features used at input [done]
3. acc(k) : System level accuracy, k=1..4 [done]
4. acc_p(0) : per preposition precision weighted average  [done]
5. acc_p(prep) : per preposition precision [done]
EXTRA RESULTS
6. standard recall@k on single ground truth preposition [done]
7. standard recall@k per preposition on single ground truth preposition [done]
"""


 FEATURE SET :  ['geometric', 'rel_position']
                    train, dev, test,  tr_sngl  tr_mlti  ts_sngl  ts_mlti   wt_m_1  wt_m_2
TEST fold: [0] :   [1, 2, 3, 4][0][0]  0.403    0.893    0.359   0.734    0.500   0.734    0.000   0.000   
TEST fold: [1] :   [0, 2, 3, 4][1][1]  0.405    0.896    0.355   0.712    0.457   0.712    0.000   0.000   
TEST fold: [2] :   [0, 1, 3, 4][2][2]  0.402    0.890    0.356   0.744    0.517   0.744    0.000   0.000   
TEST fold: [3] :   [0, 1, 2, 4][3][3]  0.403    0.892    0.348   0.718    0.443   0.718    0.000   0.000   
TEST fold: [4] :   [0, 1, 2, 3][4][4]  0.404    0.892    0.331   0.723    0.531   0.723    0.000   0.000   
                                       0.403    0.893    0.350   0.726    0.490   0.726    0.000   0.000
b_BEST_TEST: recall@k results: 
0.359   0.601   0.722   0.820  
0.355   0.575   0.710   0.805  
0.356   0.599   0.741   0.835  
0.348   0.586   0.708   0.806  
0.331   0.582   0.711   0.817  
0.350   0.589   0.718   0.

'\n1. Model type: LR, RF, NN, SVM, DT [done]\n2. Features used at input [done]\n3. acc(k) : System level accuracy, k=1..4 [done]\n4. acc_p(0) : per preposition precision weighted average  [done]\n5. acc_p(prep) : per preposition precision [done]\nEXTRA RESULTS\n6. standard recall@k on single ground truth preposition [done]\n7. standard recall@k per preposition on single ground truth preposition [done]\n'

In [35]:
# Append a "/START RUN ... /END RUN" report for the current experiment to the
# results log. The file is opened with a context manager so the handle is
# closed even if one of the formatting/write calls raises part-way through.
save_flag = True
if save_flag:
    with open('results_2000_2017_BB_for_MULTI.txt','a') as out_file:
        out_file.write('\n/START RUN =========================================================================\n\n')
        out_file.write('Model : ' + str(model)+'\n\n')
        out_file.write('HP :'+str(h_param[model])+'\n\n')
        out_file.write('Dataset : ' + str(data_set)+'\n\n')
        out_file.write('Features : ' + str(feat_head)+'\n\n')
        # train/test ratios are taken from the last (mean) row of `results`
        out_file.write('Ratio of train/test - Single label: train=%4.3f : test=%4.3f : ratio=%4.3f\n'%\
                                                          (results[-1,0], results[-1,2],
                                                          results[-1,2]/results[-1,0]))
        out_file.write('Ratio of train/test -  Multi label: train=%4.3f : test=%4.3f : ratio=%4.3f\n'%\
                                                          (results[-1,1], results[-1,3],
                                                          results[-1,3]/results[-1,1]))
        out_file.write('\n')
        #
        # system-level accuracy, last (mean) row of res_recall_all
        out_file.write('     k :  1      2      3      4\n')
        out_file.write('acc(k) : ')
        for r in range(4):
            out_file.write("%5.3f  "%(res_recall_all[-1,r]))
        out_file.write('\n\n')
        #
        # per-preposition precision table followed by its weighted mean
        out_file.write("precision weighted mean (acc_p(0)) = %5.3f\n"%acc_p_0_wt_mean)
        out_file.write('%2s  %20s  %8s  %8s\n'%('','Preposition',"Count", "acc_p(0)"))
        for j in range(N):
            out_file.write('%2d %20s  %8.1f  %5.3f\n'%(j,prep_label_enc.inverse_transform(j),\
                                                     p_predict[j], results_acc_p[-1,j]))
        out_file.write("============================================\n")
        out_file.write('%2s %20s  %8.1f  %5.3f\n'%('','weighted mean',\
                                                     p_predict.sum(), acc_p_0_wt_mean))
        out_file.write("============================================\n")
        #
        out_file.write("\n========= EXTRA RESULTS ============\n\n")
        #
        # standard recall@k, last (mean) row of res_recall
        out_file.write('       k :  1      2      3      4\n')
        out_file.write('recall@k : ')
        for r in range(4):
            out_file.write("%5.3f  "%(res_recall[-1,r]))
        out_file.write('\n\n')
        #
        # per-preposition recall@k table followed by its weighted mean
        out_file.write('%2s %20s %8s  %5s  %5s  %5s  %5s\n'%('','Preposition',"Count", "k=1","k=2","k=3","k=4"))
        for j in range(N):
            out_file.write('%2d %20s %8.1f  '%(j,prep_label_enc.inverse_transform(j), res_recall_tested[0,-1,j]))
            for i in range(recall_per_p_at_k.shape[0]):
                out_file.write("%5.3f  "%recall_per_p_at_k[i,j])
            out_file.write('\n')
        out_file.write("================================================================\n")
        out_file.write('%2s %20s %8.1f  %5.3f  %5.3f  %5.3f  %5.3f\n'%('','weighted mean',\
                                              res_recall_tested[0,-1,:].sum(),\
                                              recall_per_p_wt_mean[0],\
                                              recall_per_p_wt_mean[1],\
                                              recall_per_p_wt_mean[2],\
                                              recall_per_p_wt_mean[3]))
        out_file.write("================================================================\n\n")
        #
        out_file.write('/END RUN =========================================================================\n\n')

In [None]:
# Stand-alone MLP experiment: L-BFGS solver, two hidden layers of 10 units,
# fixed seed; fitted on x_train / y_train.
# (max_iter is left at the library default; 500 was tried earlier: #max_iter=500)
model_nn_1 = MLPClassifier(
    hidden_layer_sizes=(10,10),
    solver='lbfgs',
    alpha=1e-4,
    tol=0.01,
    random_state=1971,
)
model_nn_1.fit(x_train, y_train)



In [None]:
# Manual single training run with hand-picked NN hyper-parameters.
# NOTE(review): only the 'NN' entry is set here while the lookup below uses
# h_param[model]; presumably model == 'NN' when this cell is run -- confirm.
h_param['NN'] = {'alpha':1e-4, 'H1':10,  'H2':10, 'tol':0.001}
model_exp = train_model(model, x_train, y_train, h_param[model]) 

In [None]:
model_exp

In [None]:
# Read the results log and keep every non-blank line as a list of its
# whitespace-separated tokens.
with open('results.txt', 'r') as f:
    res_file_raw = f.readlines()
#
res_file = [tokens for tokens in (line.split() for line in res_file_raw) if tokens]


# Summarize results

In [None]:
def get_model(res_batch):
    """
    return the third token of the last row whose first token is 'Model',
    or -1 when the batch has no such row
    """
    found = [row[2] for row in res_batch if row[0] == 'Model']
    return found[-1] if found else -1
#
#
def get_acc_k(res_batch):
    """
    return the tokens from index 2 onwards of the last row whose first token
    is 'acc(k)', or -1 when the batch has no such row
    """
    found = [row[2:] for row in res_batch if row[0] == 'acc(k)']
    return found[-1] if found else -1
#
#
def get_acc_p_prep_data(res_batch, n_prep=17):
    """
    return prep labels, counts, acc_p(0)

    Looks for the last row (excluding the final row of the batch) whose first
    token is 'precision'. The table is taken to start two rows after it, i.e.
    after the 'precision ...' line and one header line.

    res_batch : list of token lists, one per non-empty line of a run.
    n_prep    : number of per-preposition rows in the table. The default of 17
                reproduces the previously hard-coded table size.

    Returns the n_prep table rows followed by the row that sits one row past
    the end of the table (one separator row is skipped), or -1 when no
    'precision' row is found. Raises IndexError when the batch is too short
    to contain that trailing row.
    """
    table_start = -1
    for i,item in enumerate(res_batch[:-1]):
        if item[0] == 'precision':
            #and res_batch[i+1]=='Preposition':
            table_start = i+2
    if table_start < 0:
        return -1
    #
    table_end = table_start + n_prep
    # res_batch[table_end] is skipped (separator row); the row after it is kept
    return res_batch[table_start:table_end] + [res_batch[table_end+1]]
#
#
def get_acc_p_wt_average(res_batch):
    """
    Placeholder: ignores res_batch and always returns 0.
    """
    return 0


# Build and print two summary tables (acc_k and acc_p(0)) from every
# /START ... /END run recorded in results_2017.txt.
with open('results_2017.txt', 'r') as f:
    res_file_raw = f.readlines()
#
# keep only non-empty lines, each split into whitespace-separated tokens
res_file=[]
for item in res_file_raw:
    item_list = item.split()
    if len(item_list)>0:
        res_file.append(item_list)


# Find start and ends
# (row indices, into res_file, of the '/START' and '/END' marker lines)
start_list = []
end_list = []
for row,item in enumerate(res_file):
    if item[0] == '/START':
        start_list.append(row)
    if item[0] == '/END':
        end_list.append(row)
#
print start_list
print end_list
assert(len(start_list)==len(end_list)), "start and end list not of same size"
no_batches = len(start_list)
#
# collect acc(k) results
acc_k = [['','k=1','k=2','k=3','k=4']]
# The first batch seeds the row labels (index, preposition) of the acc_p table.
# NOTE(review): start_list[0] raises IndexError when the file has no runs, and
# get_acc_p_prep_data returns -1 when it finds no table, which would make the
# loop below fail -- confirm the input always contains at least one full run.
i=0
batch = list(res_file[start_list[i]: end_list[i]+1])
acc_p = [['','Model:']]
acc_p.append(['n','Preposition'])
prep_table = get_acc_p_prep_data(batch)
for item in prep_table:
    acc_p.append([item[0],item[1]])
#
# one pass per run: add a row to acc_k and two columns (count, acc_p) to acc_p
for i in range(no_batches):
    batch = list(res_file[start_list[i]: end_list[i]+1])
    batch_model = get_model(batch)
    # get acc_k
    acc_k.append([batch_model] + get_acc_k(batch))
    #
    # get acc_p
    acc_p[0]+=[batch_model, batch_model]
    acc_p[1]+=['count','acc_p']
    batch_table = get_acc_p_prep_data(batch)
    for j,item in enumerate(batch_table):
        # every run must list the same labels in the same order as the first run
        assert(acc_p[j+2][1] == item[1]), "wrong preposition label"
        if item[3] == '-1.000':
            # '-1.000' is shown as '-' in the summary table
            item[3]='-'
        acc_p[j+2] += [str(int(float(item[2]))),item[3]]
#
# print table
print "acc_k table"
for row in acc_k:
    for col in row:
        print "%6s"%col,
    print
print
# relabel the last row of the table as the weighted-mean summary row
acc_p[-1][0]=''
acc_p[-1][1]='wt_mean'
print "acc_p(0) table"
for row in acc_p: 
    print "%3s %18s "%(row[0],row[1]),
    for col in row[2:]:
        print "%6s"%col,
    print

# Write the acc_k and acc_p(0) summary tables to summary_2017.txt (overwriting
# any previous content). The file is opened with a context manager so the
# handle is closed even if a write or a format operation raises.
save_flag = True
if save_flag:
    with open('summary_2017.txt','w') as out_file:
        out_file.write("=========================================================================\n")
        out_file.write("acc_k table\n")
        for row in acc_k:
            for col in row:
                out_file.write("%6s"%col)
            out_file.write('\n')
        out_file.write('\n')
        #
        # relabel the last row of the table as the weighted-mean summary row
        acc_p[-1][0]=''
        acc_p[-1][1]='wt_mean'
        out_file.write("acc_p(0) table\n")
        for row in acc_p:
            gap = 0
            out_file.write("%3s %18s "%(row[0],row[1]))
            for col in row[2:]:
                out_file.write("%6s"%col)
                gap += 1
                # extra space after every second column
                if gap%2==0:
                    out_file.write(' ')
            out_file.write('\n')
            # rule after the second header row and after the second-to-last row
            if row==acc_p[-2] or row==acc_p[1]:
                out_file.write("=========================================================================\n")


        out_file.write("=========================================================================\n")
        out_file.write("\n")
        #

    

In [None]:
len(feat_list)

## HPO loop

In [None]:
# version_2
#
#================================================================================================
# Experiment selection: which classifier to tune and which data set to use
model = 'kNN'
data_set = 'expanded'
#data_set = 'best'
#================================================================================================
#
#================================================================================================
# set hyper parameter grid
# hp_set ends up as a list with one entry (a list of values) per candidate
# setting; the order of the values in each entry is fixed per model.

if model == 'kNN':
    # entry: [n_neighbours]
    my_k = range(1,30,1)
    hp_set = [[K] for K in my_k]
#
elif model == 'LR':
    # entry: [C, tol]
    my_C = [.1,  1, 10, 100]
    my_tol = [.01, 0.001, .0001]
    hp_set = [[C, tol] for C in my_C for tol in my_tol]
#
elif model == 'SVM':
    # entry: [C, gamma, tol]
    my_C = [.1,  1, 10]
    my_gamma = [.1, .01]
    my_tol = [.01, 0.001,]
    hp_set = [[C, g, tol] for C in my_C for g in my_gamma for tol in my_tol]
#
elif model == 'DT':
    # entry: [depth, min_samples_split, min_samples_leaf]
    # wider grid used earlier: depth 5..9, leaf [1,2,4,6], split [2,4,6]
    Tree_depth = [7, 8]
    Min_samples_leaf = [1, 2, 3]
    Min_samples_split = [2, 3, 4]
    hp_set = [[D, S, L]
              for D in Tree_depth
              for L in Min_samples_leaf
              for S in Min_samples_split]
#
elif model == 'RF':
    # entry: [n_estimators, max_features, max_depth, min_samples_leaf, min_samples_split]
    #n_estimators = [50, 60, 70, 90]
    n_estimators = [100, 110]
    #max_features =  [20, 30, 40, 50]
    max_features = [40]
    #max_depth    = [ 7, 8, 9]
    max_depth = [9]
    min_samples_leaf = [1, 2, 3]
    min_samples_split = [8, 10, 12]
    hp_set = [[n, mf, D, msl, mss]
              for n in n_estimators
              for mf in max_features
              for D in max_depth
              for msl in min_samples_leaf
              for mss in min_samples_split]
#
elif model == 'NN':
    # entry: [alpha, H1, H2, tol]
    alpha = [1e-3, 1e-4, 1e-5]
    H1 = [10, 17, 25]
    H2 = [10, 17, 25]
    tol = [.01, .001, .0001]
    hp_set = [[a, h1, h2, t]
              for a in alpha
              for h1 in H1
              for h2 in H2
              for t in tol]
#
#
#================================================================================================
#
#print "train, dev, test, tr_Best  tr_All  Ts_Best   Ts_ALL"
# Hyper-parameter search: for every fold used as the held-out (dev) fold, train
# one model per entry of hp_set and record recall@1 on the training data and on
# the dev data, for both the single-label and the multi-label ground truth.
results = np.zeros((K_cv+1,4),dtype=float)  # the last row is for computing the average
#res_fold ={}
# cres_fold maps str(test_folds) (e.g. '[0]') to a structured array with one
# record per hyper-parameter setting
cres_fold ={}

for j_dev in range(K_cv):
    #================================================================================================
    # select folds
    # (relies on Python 2 range() returning a list so that .remove() works)
    all_folds = range(K_cv)
    j_test = (j_dev + 1) % K_cv
    all_folds.remove(j_test)
    all_folds.remove(j_dev)
    train_folds = list(all_folds)  # all_folds -dev -test
    dev_folds = [j_dev]
    test_folds = [j_test]
    # NOTE: test_folds is overridden with the dev fold, so the records below are
    # keyed by the dev fold. Fold j_test stays excluded from train_folds, and the
    # evaluation further down reads the *_dev arrays.
    test_folds = dev_folds
    print "TEST fold = ",test_folds,
    #res_fold[str(test_folds)]=np.zeros((len(hp_set),6),dtype=float)
    cres_fold[str(test_folds)] = np.zeros((len(hp_set)),dtype=[('hp_val','U50'),('tr_sngl',float),
                                                                             ('tr_mlti',float),
                                                                             ('ts_sngl',float),
                                                                             ('ts_mlti',float)])

    # presumably (re)populates the x_*/y_*/Y_* train/dev/test arrays used below -- confirm
    setup_TDT_sets(train_folds, dev_folds, test_folds)
    #================================================================================================
    #
    # ================================================================================================
    # Standardise the 'geometric' feature columns in place; the scaler is fitted
    # on the "best" training split only and then applied to every split.
    scale_feat = True
    if 'geometric' in feat_head and scale_feat:
        scaler = StandardScaler()
        col_0, col_1 = feat_head_col['geometric']
        scaler.fit(x_b_train[:,col_0:col_1])  # training with best
        x_b_train[:,col_0:col_1] = scaler.transform(x_b_train[:,col_0:col_1])  
        x_b_dev[:,col_0:col_1] = scaler.transform(x_b_dev[:,col_0:col_1])
        x_b_test[:,col_0:col_1] = scaler.transform(x_b_test[:,col_0:col_1])
        x_a_train[:,col_0:col_1] = scaler.transform(x_a_train[:,col_0:col_1])  
        x_a_dev[:,col_0:col_1] = scaler.transform(x_a_dev[:,col_0:col_1])
        x_a_test[:,col_0:col_1] = scaler.transform(x_a_test[:,col_0:col_1])
    # ================================================================================================
    #
    # ================================================================================================
    #
    # pick the training / evaluation arrays; evaluation uses the dev split
    if data_set == 'expanded': 
        x_train = x_a_train
        y_train = y_a_train
        Y_train = Y_a_train
        x_test = x_a_dev
        yy_test = y_a_dev
        Y_test = Y_a_dev
    if data_set == 'best':
        x_train = x_b_train
        y_train = y_b_train
        Y_train = Y_b_train
        x_test = x_b_dev
        yy_test = y_b_dev
        Y_test = Y_b_dev        
    #
# ================================================================================================
#


    print "                     TR_sngl  TR_exp  TS_sngl   TS_EXP"
    for i_hp,hp in enumerate(hp_set):
        # learn model
        # ff: fixed-width string encoding of this setting; stored as 'hp_val'
        # so the same setting can be matched across folds
        ff=""
        for hh in hp:
            ff += "%10.5f" % (hh)
        #ff = "%10.5f%10.5f" % (hp[0], hp[1])
        cres_fold[str(test_folds)][i_hp]['hp_val'] = ff 
        print "hp = %30s :" % str(hp),
        #my_C=hp[0]
        #my_tol=hp[1]
        #model_lr_1 = train_model('LR', x_b_train, y_b_train, {'C':hp[0], 'tol':hp[1]})
        #model_lr_1 = train_model('LR', x_train, y_train, {'C':hp[0], 'tol':hp[1]})

        # map the positional values in hp onto the named parameters passed to train_model
        h_param = {}
        if model == 'LR':
            h_param['LR'] = {'C':hp[0], 'tol':hp[1]}
        if model == 'DT':
            h_param['DT'] = {'tree_depth':hp[0], 'min_samples_split':hp[1], 'min_samples_leaf':hp[2]}
        if model == 'RF':
            h_param['RF'] = {'n_estimators':hp[0], 'max_features':hp[1],'max_depth':hp[2],
                             'min_samples_leaf':hp[3], 'min_samples_split':hp[4]}
        if model == 'SVM':
            h_param['SVM'] = {'C':hp[0], 'gamma':hp[1], 'tol':hp[2]}
        if model == 'kNN':
            h_param['kNN'] = {'n_neighbors':hp[0]}
        if model == 'NN':
            h_param['NN'] = {'alpha':hp[0], 'H1':hp[1], 'H2':hp[2], 'tol':hp[3]}
        #
        #
        model_exp = train_model(model, x_train, y_train, h_param[model]) 


#         if model == 'LR':
#             #model_exp = train_model('LR', x_a_train, y_a_train, {'C':10, 'tol':.001}) 
#             model_exp = train_model('LR', x_train, y_train, {'C':hp[0], 'tol':hp[1]}) 

#         if model == 'DT':
#             model_exp = train_model(model, x_train, y_train, {'tree_depth':hp[0]}) 
        
        #print model_lr_1
        #
        #
        #res_fold[str(test_folds)][i_hp,0]=hp[0]
        #res_fold[str(test_folds)][i_hp,1]=hp[1]
        # --- scores on the training data (k = 1) ---
        y_p = model_exp.predict_proba(x_train)
        y_test = y_train
        y_all_test = Y_train['all_code']
        #Y_pred=get_pred_vector(y_p)
        #print "shape Y_pred :", Y_pred.shape
        #res_fold[str(test_folds)][i_hp,2] = get_recall_at_k(Y_pred, y_test, 1)
        cres_fold[str(test_folds)][i_hp]['tr_sngl'] = get_recall_at_k(y_p, y_test, 1)
        #results[j_dev,0] = get_recall_at_k(Y_pred, y_test, 1)
        #res_fold[str(test_folds)][i_hp,3] = get_acc_all(Y_pred,y_all_test)
        cres_fold[str(test_folds)][i_hp]['tr_mlti'] = get_recall_for_all_at_k(y_p, y_all_test, 1)
        #cres_fold[str(test_folds)][i_hp]['tr_mlti'] = get_acc_all(Y_pred,y_all_test)
        #results[j_dev,1] = get_acc_all(Y_pred,y_all_test)
        #print " %5.3f    %5.3f   "%(get_recall_at_k(Y_pred, y_test, 1), get_acc_all(Y_pred,y_all_test)),

        print " %5.3f    %5.3f   "%(cres_fold[str(test_folds)][i_hp]['tr_sngl'],\
                                    cres_fold[str(test_folds)][i_hp]['tr_mlti']),
        #print
        # --- scores on the held-out (dev) data (k = 1) ---
        y_p = model_exp.predict_proba(x_test)
        y_test = yy_test
        y_all_test = Y_test['all_code']
        #Y_pred=get_pred_vector(y_p)
        #res_fold[str(test_folds)][i_hp,4] = get_recall_at_k(Y_pred, y_test, 1)
        cres_fold[str(test_folds)][i_hp]['ts_sngl'] = get_recall_at_k(y_p, y_test, 1)

        #results[j_dev,2] = get_recall_at_k(Y_pred, y_test, 1)
        #res_fold[str(test_folds)][i_hp,5] = get_acc_all(Y_pred,y_all_test)
        cres_fold[str(test_folds)][i_hp]['ts_mlti'] = get_recall_for_all_at_k(y_p, y_all_test, 1)

        #results[j_dev,3] = get_acc_all(Y_pred,y_all_test)
        #print "%5.3f   %5.3f   "%(get_recall_at_k(Y_pred, y_test, 1), get_acc_all(Y_pred,y_all_test))
        print "%5.3f   %5.3f   "%(cres_fold[str(test_folds)][i_hp]['ts_sngl'],
                                  cres_fold[str(test_folds)][i_hp]['ts_mlti']),
        # last column: ratio of held-out to training multi-label score
        print "%5.3f"%(cres_fold[str(test_folds)][i_hp]['ts_mlti']/cres_fold[str(test_folds)][i_hp]['tr_mlti'])

#    print "====================================================="
#    results[-1,:] = np.mean(results[0:-1,:],axis=0)
#    print "                   %5.3f    %5.3f   %5.3f   %5.3f   "%(results[-1,0], results[-1,1], results[-1,2], results[-1,3])
print "HPO...COMPLETED"
    

In [None]:
#version 2:
# Rank HP sets
# Within each fold the settings are sorted by 'ts_mlti' (ascending) and each
# setting receives its normalised position i/N in that order; the positions are
# summed over the folds, so a larger total means the setting scored higher more
# consistently. The final table is printed in ascending order of that total.
# for idx in range(K_cv):
#     print "fold :",idx
#     cres_fold['['+str(idx)+']'].sort(order='ts_sngl')
#     #print cres_fold['['+str(idx)+']']
    
N=cres_fold['[0]'].shape[0]  # number of unique parameter sets
print "Number of unique HP parameter sets =",N
# hp_wt: 'hp_val' string -> list of per-fold normalised positions
hp_wt = {}
for idx in range(K_cv):  # for each fold
    #print "fold :",idx
    # in-place sort; keys have the form '[<fold index>]'
    cres_fold['['+str(idx)+']'].sort(order='ts_mlti')  # choose metric for ranking
    for i in range(N):        
        ff =  cres_fold['['+str(idx)+']'][i]['hp_val']
        if ff not in hp_wt:
            hp_wt[ff]=[]
        hp_wt[ff].append(float(i)/float(N))
#
#f_rank = np.zeros((N,3),dtype=float)
# NOTE(review): f_rank has N records, so this assumes every fold produced the
# same N distinct 'hp_val' strings -- confirm.
f_rank = np.zeros((N),dtype=[('hp_val','U50'),('weight',float)])
for i,ff in enumerate(hp_wt.keys()):
    #print "%30s:   %5.3f"%(ff, sum(hp_wt[ff]))
    f_rank[i]['hp_val'] = ff
    f_rank[i]['weight']=sum(hp_wt[ff])
f_rank.sort(order='weight')
print

for i in range(f_rank.shape[0]):
    print "%50s:   %5.3f"%(f_rank[i]['hp_val'], f_rank[i]['weight'])

In [None]:
batch

In [None]:
# collect acc(k) results
# For every 'Model' line, advance to the next 'acc(k)' line and record the
# first two characters of the model token plus tokens 2, 4, 6 and 8 of that line.
# NOTE(review): this calls .split() on each res_file entry, i.e. it expects raw
# text lines, whereas other cells in this notebook build res_file as lists of
# tokens -- confirm which form res_file has when this cell is run.
# NOTE(review): the inner while loop has no bound check and would raise
# IndexError if a 'Model' line is not followed by an 'acc(k)' line.
acc_k = []
max_rows = len(res_file)
row=0
while row<max_rows: 
    item_list = res_file[row].split()
    if len(item_list) >0:
        if item_list[0] == 'Model':
            entry =[item_list[2][0:2]]
            while item_list[0] != 'acc(k)':
                row +=1
                item_list = res_file[row].split()

            entry += [item_list[2],item_list[4], item_list[6], item_list[8]]
            #print item_list
            acc_k.append(entry)    
    #
    row +=1
acc_k
#
# collect acc(k) results
# Debug dump: acc_k is reset and the tokens of rows 14..41 are printed
# (max_rows is overridden with a hard-coded 42).
acc_k = []
max_rows = len(res_file)
max_rows = 42
row=14
while row<max_rows: 
    item_list = res_file[row].split()
    print item_list
    row += 1

In [None]:
p_predict.sum()

In [None]:
res_recall_tested[0,-1,1]

In [None]:
np.sum(res_recall_tested[:,-1,:], axis=1, keepdims=True)

In [None]:
# Recompute per-preposition recall@k from the accumulated counts.
# The last slice along axis 1 of both count arrays receives the sum of all
# preceding slices; the ratio of the two is the pooled recall.
res_recall_tested[:,-1,:] = np.sum(res_recall_tested[:,0:-1,:], axis=1)
res_recall_correct[:,-1,:] = np.sum(res_recall_correct[:,0:-1,:], axis=1)
recall_per_p_at_k = np.copy(res_recall_correct[:,-1,:]/res_recall_tested[:,-1,:])
# Weighted mean per row, weighted by each entry's share of the tested count.
# Uses recall_per_p_at_k computed on the line above (the previous text
# referenced `recall_per_p`, which this cell never defines).
recall_per_p_wt_mean = np.copy(np.sum(recall_per_p_at_k*res_recall_tested[:,-1,:]/
                               np.sum(res_recall_tested[:,-1,:], axis=1, keepdims=True), axis=1))

In [None]:
recall_per_p_wt_mean

In [None]:
recall_per_p_at_k

In [None]:
recall_per_prep

In [None]:
# sum over all folds
p_tested = np.sum(res_prep_predict, axis=0)
p_correct = np.sum(res_prep_correct, axis=0)
N=p_predict.shape[0]
acc_p_0 = np.zeros((N),dtype=float)
for n in range(N):
    if p_predict[n] > 0:
        acc_p_0[n] = float(p_correct[n])/float(p_predict[n])
    else:
        acc_p_0[n] = -1.
results_acc_p[-1,:] = np.copy(acc_p_0)
acc_p_0_wt_mean = np.sum(acc_p_0*p_predict/p_predict.sum())
print "mean = ",acc_p_0_wt_mean
for j in range(N):
    print '%2d %-25s '%(j,prep_label_enc.inverse_transform(j)+'('+str(p_predict[j])+')'),
    #print '%2d %-14s '%(j,''),

    for i in range(results_acc_p.shape[0]):
        if results_acc_p[i,j]==-1:
            print "%5s "%" - ",
        else:
            print "%5.3f "%results_acc_p[i,j],
    print



In [None]:
# sum over all folds
# Pooled per-preposition precision: correct / predicted, summed over axis 0.
p_predict = np.sum(res_prep_predict, axis=0)
p_correct = np.sum(res_prep_correct, axis=0)
N=p_predict.shape[0]
acc_p_0 = np.zeros((N),dtype=float)
for n in range(N):
    if p_predict[n] > 0:
        acc_p_0[n] = float(p_correct[n])/float(p_predict[n])
    else:
        # -1 marks "never predicted"; printed as " - " below
        acc_p_0[n] = -1.
# the last row of results_acc_p holds the pooled precision
results_acc_p[-1,:] = np.copy(acc_p_0)
# weighted by share of predictions; -1 entries have p_predict == 0, so zero weight
acc_p_0_wt_mean = np.sum(acc_p_0*p_predict/p_predict.sum())
print "mean = ",acc_p_0_wt_mean
for j in range(N):
    # row label: decoded preposition name followed by its pooled prediction count
    print '%2d %-25s '%(j,prep_label_enc.inverse_transform(j)+'('+str(p_predict[j])+')'),
    #print '%2d %-14s '%(j,''),

    for i in range(results_acc_p.shape[0]):
        if results_acc_p[i,j]==-1:
            print "%5s "%" - ",
        else:
            print "%5.3f "%results_acc_p[i,j],
    print




In [None]:
(-1+-1+.3333+1)/5

In [None]:
res_prep_correct[:,1]

In [None]:
res_prep_predict[:,1]

In [None]:
res_prep_correct[:,1].sum()/res_prep_predict[:,1].sum()

In [None]:
5./19

# Test Loop on BEST set

In [None]:
# K-fold test loop: train LR on the "best" training split of each fold and
# score it on (a) that training split, (b) the "best" test split and (c) the
# test split of the x_a_* / Y_a_* ("expanded") arrays.
# Columns of `results`: 0/1 = train single/multi, 2/3 = best-test single/multi,
# 4/5 = expanded-test single/multi. The last row is reserved for the mean.
print "FEATURE SET : ", feat_head
print "                    train, dev, test,  tr_Best  tr_EXP  b_sngl  b_mlti    e_singl  e_mlti"
results = np.zeros((K_cv+1,6),dtype=float)
res_recall = np.zeros((K_cv+1,4),dtype=float)
res_recall_all = np.zeros((K_cv+1,4),dtype=float)
# allocated here but not filled inside this loop (the confusion_matrix line is commented out)
res_conf_mat = np.zeros((K_cv+1,len(prep_label_enc.classes_),len(prep_label_enc.classes_)),dtype=float)
for j_test in range(K_cv):
    # select folds
    # (relies on Python 2 range() returning a list so that .remove() works)
    all_folds = range(K_cv)
    #j_test = (j_dev + 1) % K_cv
    all_folds.remove(j_test)
    #all_folds.remove(j_dev)
    train_folds = list(all_folds)
    dev_folds = [j_test] # dummy
    test_folds = [j_test]
    print "TEST fold:%4s"%str(test_folds),":",
    #res_fold[str(test_folds)]=np.zeros((len(hp_set),6),dtype=float)
    print "%20s"%(str(train_folds)+ str(dev_folds)+ str(test_folds)),

    #setup_TDT_sets(train_folds+dev_folds, dev_folds, test_folds )
    # presumably (re)populates the x_*/y_*/Y_* train/dev/test arrays used below -- confirm
    setup_TDT_sets(train_folds, dev_folds, test_folds )

    # ================================================================================================
    # Standardise the 'geometric' feature columns in place; the scaler is fitted
    # on the "best" training split only and then applied to every split.
    scale_feat = True
    if 'geometric' in feat_head and scale_feat:
        scaler = StandardScaler()
        col_0, col_1 = feat_head_col['geometric']
        scaler.fit(x_b_train[:,col_0:col_1])  # training with best
        x_b_train[:,col_0:col_1] = scaler.transform(x_b_train[:,col_0:col_1])  
        x_b_dev[:,col_0:col_1] = scaler.transform(x_b_dev[:,col_0:col_1])
        x_b_test[:,col_0:col_1] = scaler.transform(x_b_test[:,col_0:col_1])
        x_a_train[:,col_0:col_1] = scaler.transform(x_a_train[:,col_0:col_1])  
        x_a_dev[:,col_0:col_1] = scaler.transform(x_a_dev[:,col_0:col_1])
        x_a_test[:,col_0:col_1] = scaler.transform(x_a_test[:,col_0:col_1])
    # ================================================================================================
    # the model type is fixed to LR here, overwriting any earlier value of `model`
    model = 'LR'
    
    if model == 'LR':
        model_best = train_model('LR', x_b_train, y_b_train, {'C':10, 'tol':.001}) #train on best
    #
    # test on BEST_TRAIN_SINGLE
    y_p = model_best.predict_proba(x_b_train)
    y_test = y_b_train
    y_all_test = Y_b_train['all_code']
    #Y_pred=get_pred_vector(y_p)    
    #results[j_test,0] = get_recall_for_all_at_k(y_p, y_test, 1)
    results[j_test,0] = get_recall_at_k(y_p, y_test, 1)
    #results[j_test,1] = get_acc_all(Y_pred,y_all_test)
    results[j_test,1] = get_recall_for_all_at_k(y_p, y_all_test, 1)
    #print " %5.3f    %5.3f   "%(get_recall_at_k(y_p, y_test, 1), get_recall_for_all_at_k(y_p, y_all_test, 1)),
    print " %5.3f    %5.3f   "%(results[j_test,0], results[j_test,1]),

    #get_acc_all(Y_pred,y_all_test)),
    #
    # test on BEST_TEST_SINGLE and BEST_TEST_MULTI
    y_p = model_best.predict_proba(x_b_test)
    y_test = y_b_test
    y_all_test = Y_b_test['all_code']   
    results[j_test,2] = get_recall_at_k(y_p, y_test, 1)
    # recall@k for k = 1..4, single-label and multi-label ground truth
    for r in range(4):
        res_recall[j_test,r] = get_recall_at_k(y_p, y_test, r+1)
        res_recall_all[j_test,r] = get_recall_for_all_at_k(y_p, y_all_test, r+1)
    results[j_test,3] = get_recall_for_all_at_k(y_p, y_all_test, 1)
    #Y_pred=get_pred_vector(y_p)
    #res_conf_mat[j_test] = np.copy(confusion_matrix(y_test, Y_pred))
    print "%5.3f   %5.3f   "%(results[j_test,2], results[j_test,3]),
    # NOTE(review): the five returned values are not used later in this cell, and
    # `acc_p` is also the name of the summary table built in another cell --
    # confirm nothing relies on either value surviving.
    acc_p, global_mean, global_wt_mean, acc_p_mean, acc_p_wt_mean = get_precision_per_prep(y_p, y_all_test)

    #
    # test on EXPANDED_SINGLE and EXPANDED_MULTI
    y_p = model_best.predict_proba(x_a_test)
    y_test = y_a_test
    y_all_test = Y_a_test['all_code']
    #Y_pred=get_pred_vector(y_p)
    results[j_test,4] = get_recall_at_k(y_p, y_test, 1)
    results[j_test,5] = get_recall_for_all_at_k(y_p, y_all_test, 1)
    print "%5.3f   %5.3f   "%(results[j_test,4], results[j_test,5] )

# Summary of the test loop: fill the last row of each result array with the
# column-wise mean of the preceding rows and print the tables.
print "=========================================================================================="
results[-1,:] = np.mean(results[0:-1,:],axis=0)
print "                                       %5.3f    %5.3f    %5.3f   %5.3f    %5.3f   %5.3f   "%(results[-1,0],\
                                                    results[-1,1], results[-1,2], results[-1,3],\
                                                    results[-1,4], results[-1,5])

#
# recall@k table: K_cv+1 rows, four columns, mean in the last row
print"b_BEST_TEST: recall@k results: "
print"================================"
res_recall[-1,:] = np.mean(res_recall[0:-1,:],axis=0)
for j in range(K_cv+1):
    for r in range(4):
        print "%5.3f  "%(res_recall[j,r]),
    print
    if j==K_cv-1:
        # separator between the per-fold rows and the mean row
        print"================================"
print"================================"
print
# same layout, reading from res_recall_all
print"b_ALL_TEST: recall@k results: "
print"================================"
res_recall_all[-1,:] = np.mean(res_recall_all[0:-1,:],axis=0)
for j in range(K_cv+1):
    for r in range(4):
        print "%5.3f  "%(res_recall_all[j,r]),
    print
    if j==K_cv-1:
        # separator between the per-fold rows and the mean row
        print"================================"
print"================================"
#
# Confusion matrix
#
# Save to file
## Save to file
# Append a minimal run record (model name and feature set) to results.txt.
# The file is opened with a context manager so the handle is always closed,
# even if one of the writes raises.
save_flag = True
if save_flag:
    with open('results.txt','a') as out_file:
        out_file.write('/START RUN =========================================================================\n')
        out_file.write('Model : ' + str(model)+'\n')
        out_file.write('Features : ' + str(feat_head)+'\n')
        #out_file.write(get_csv_string(line_entry)+'\n')
        out_file.write('/END RUN =========================================================================\n')


In [None]:
feat_list

In [None]:
cc = res_conf_mat.sum(axis=0)
print cc

In [None]:
np.array(y_all_test[0],dtype=int)

In [None]:
# RECALL per preposition
# Scratch cell: scores the most recently trained model_exp on the "best"
# training split and then calls get_recall_per_prep_at_k on the same data.
# NOTE(review): j_test is not set in this cell; it is whatever value an
# earlier loop left behind -- confirm that is intended.
# NOTE(review): other cells pass the probability matrix y_p to
# get_recall_at_k, whereas here the output of get_pred_vector is passed --
# confirm which input the helper expects.
print model_exp
# test on BEST_TRAIN
y_p = model_exp.predict_proba(x_b_train)
y_test = y_b_train
y_all_test = Y_b_train['all_code']
Y_pred=get_pred_vector(y_p)
#print "shape Y_pred :", Y_pred.shape
#res_fold[str(test_folds)][i_hp,2] = get_recall_at_k(Y_pred, y_test, 1)
results[j_test,0] = get_recall_at_k(Y_pred, y_test, 1)
#res_fold[str(test_folds)][i_hp,3] = get_acc_all(Y_pred,y_all_test)
results[j_test,1] = get_acc_all(Y_pred,y_all_test)
print " %5.3f    %5.3f   "%(get_recall_at_k(Y_pred, y_test, 1), get_acc_all(Y_pred,y_all_test)),
#print
    
# y_test is reshaped into a single-column 2-D array for this helper
get_recall_per_prep_at_k(y_p, y_test.reshape((y_test.shape[0],1)), 1)


In [None]:
4260/11200.

In [None]:
# rows are expected = sum of a row
# Builds the confusion matrix of y_test vs Y_pred and prints it with every row
# normalised to percentages of that row's total.
cc=confusion_matrix(y_test, Y_pred)
c=cc
ci=np.array(c,dtype=float)
# NOTE(review): a row whose total is zero would make this a division by zero
# (nan entries) -- confirm every class occurs in y_test.
d=100.0*ci/ci.sum(axis=1, keepdims=True)
# t: per-row totals, shown in parentheses next to each label
t = c.sum(axis=1)
#c=c/np.sum(c,axis=1,dtype=float)*100
#for j in range(c.shape[0]):
#    c[j,:] = c[j,:]/float(np.sum(c[j]))*100
   
# header line: column indices
print '                         ',
for i in range(d.shape[1]):
    print "%3d "%i,
print
print
for j in range(d.shape[0]):
    print '%2d %-22s '%(j,prep_label_enc.inverse_transform(j)+'('+str(t[j])+')'),
    #print '%2d %-14s '%(j,''),

    for i in range(d.shape[1]):
        if d[j,i]==0.0:
            # exact zeros are shown as a dash
            print " -  ",
        else:
            print "%3.0f "%d[j,i],
    print
    print
    print



In [None]:
def print_confusion_mat(c):
    ci=np.array(c,dtype=float)
    d=100.0*ci/ci.sum(axis=1, keepdims=True)
    t = c.sum(axis=1)
    #  
    print '                         ',
    for i in range(d.shape[1]):
        print "%3d "%i,
    print
    print
    for j in range(d.shape[0]):
        print '%2d %-22s '%(j,prep_label_enc.inverse_transform(j)+'('+str(t[j])+')'),
        #print '%2d %-14s '%(j,''),

        for i in range(d.shape[1]):
            if d[j,i]==0.0:
                print " -  ",
            else:
                print "%3.0f "%d[j,i],
        print
        print
    return 0


In [None]:
print_confusion_mat(cc_lang_geo_depth)

In [None]:
print_confusion_mat(cc_lang_geo)

In [None]:
print_confusion_mat(cc_lang_depth)

In [None]:
print_confusion_mat(cc_lang)

In [None]:
print_confusion_mat(cc_geo)

In [None]:
print_confusion_mat(cc_geo_depth)

In [None]:
#cc_lang_depth = np.copy(cc)
#cc_lang_geo_depth = np.copy(cc)
#cc_geo_depth = np.copy(cc)
#cc_geo = np.copy(cc)
#cc_lang_geo = np.copy(cc)
#cc_lang = np.copy(cc)

In [None]:
# Sanity check on the globals `d` (row percentages) and `t` (row totals)
# built in the confusion-matrix cell: print the per-row sum of d and the
# overall sample count.
print(d.sum(axis=1))
print(t.sum())

In [None]:
# Row-normalise a float copy of `ci` into fractions (not percentages).
cii = np.array(ci, dtype=float)
row_sums = cii.sum(axis=1, keepdims=True)
dii = cii / row_sums
dii.sum(axis=1)
# Last expression of the cell: display the un-normalised matrix.
ci

# Test Loop on ALL set

In [None]:
# K-fold evaluation: every fold is held out once as the test fold while a
# logistic-regression model is trained on the remaining folds.
print("FEATURE SET : ", feat_head)
print("                    train, dev, test,  tr_Best  tr_All  b_Best  b_ALL    e_singl  e_ALL")
# In every table the last row (index K_cv) is reserved for the mean over folds.
results = np.zeros((K_cv+1, 6), dtype=float)
res_recall = np.zeros((K_cv+1, 4), dtype=float)
res_recall_all = np.zeros((K_cv+1, 4), dtype=float)
res_conf_mat = np.zeros((K_cv+1, len(prep_label_enc.classes_), len(prep_label_enc.classes_)), dtype=float)
for j_test in range(K_cv):
    # select folds
    # range() is not a list on Python 3, so materialise it before remove().
    all_folds = list(range(K_cv))
    all_folds.remove(j_test)
    train_folds = list(all_folds)
    dev_folds = [j_test] # dummy
    test_folds = [j_test]
    print("TEST fold:%4s" % str(test_folds), ":", end=' ')
    #res_fold[str(test_folds)]=np.zeros((len(hp_set),6),dtype=float)
    print("%20s" % (str(train_folds) + str(dev_folds) + str(test_folds)), end=' ')

    #setup_TDT_sets(train_folds+dev_folds, dev_folds, test_folds )
    setup_TDT_sets(train_folds, dev_folds, test_folds)

    # ================================================================================================
    if 'geometric' in feat_head:
        # Standardise only the geometric feature columns. The scaler is
        # fitted on x_b_train and then applied to all four matrices.
        scaler = StandardScaler()
        col_0 = X_best_fold['fold_0']['X_headings']['geometric'][0]
        col_1 = X_best_fold['fold_0']['X_headings']['geometric'][1]
        #col_0 = 0
        #col_1 = 18
        scaler.fit(x_b_train[:, col_0:col_1])  # training with best
        x_b_train[:, col_0:col_1] = scaler.transform(x_b_train[:, col_0:col_1])
        x_b_test[:, col_0:col_1] = scaler.transform(x_b_test[:, col_0:col_1])
        x_a_train[:, col_0:col_1] = scaler.transform(x_a_train[:, col_0:col_1])
        x_a_test[:, col_0:col_1] = scaler.transform(x_a_test[:, col_0:col_1])
    # ================================================================================================

    model_lr_1 = train_model('LR', x_a_train, y_a_train, {'C':10, 'tol':.001})


    # test on the training data the model was fitted on (x_a_train)
    y_p = model_lr_1.predict_proba(x_a_train)
    y_test = y_a_train
    y_all_test = Y_a_train['all_code']
    Y_pred = get_pred_vector(y_p)
    #print "shape Y_pred :", Y_pred.shape
    #res_fold[str(test_folds)][i_hp,2] = get_recall_at_k(Y_pred, y_test, 1)
    results[j_test, 0] = get_recall_at_k(Y_pred, y_test, 1)
    #res_fold[str(test_folds)][i_hp,3] = get_acc_all(Y_pred,y_all_test)
    results[j_test, 1] = get_acc_all(Y_pred, y_all_test)
    # Print the values just stored instead of computing each metric twice.
    print(" %5.3f    %5.3f   " % (results[j_test, 0], results[j_test, 1]), end=' ')
    #print
    #
    # test on BEST_TEST
    y_p = model_lr_1.predict_proba(x_b_test)
    y_test = y_b_test
    y_all_test = Y_b_test['all_code']
    Y_pred = get_pred_vector(y_p)
    results[j_test, 2] = get_recall_at_k(Y_pred, y_test, 1)
    #for r in range(4):
    #    res_recall[j_test,r] = get_recall_at_k(Y_pred, y_test, r+1)
    #    res_recall_all[j_test,r] = get_recall_for_all_at_k(Y_pred, y_all_test, r+1)
    results[j_test, 3] = get_acc_all(Y_pred, y_all_test)
    #res_conf_mat[j_test] = np.copy(confusion_matrix(y_test, Y_pred))

    print("%5.3f   %5.3f   " % (results[j_test, 2], results[j_test, 3]), end=' ')
    #
    # test on ALL_TEST
    y_p = model_lr_1.predict_proba(x_a_test)
    y_test = y_a_test
    y_all_test = Y_a_test['all_code']
    Y_pred = get_pred_vector(y_p)
    results[j_test, 4] = get_recall_at_k(Y_pred, y_test, 1)
    # recall@k for k = 1..4; both tables are filled from the ALL_TEST predictions.
    for r in range(4):
        res_recall[j_test, r] = get_recall_at_k(Y_pred, y_test, r+1)
        res_recall_all[j_test, r] = get_recall_for_all_at_k(Y_pred, y_all_test, r+1)
    results[j_test, 5] = get_acc_all(Y_pred, y_all_test)
    res_conf_mat[j_test] = np.copy(confusion_matrix(y_test, Y_pred))
    print("%5.3f   %5.3f   " % (results[j_test, 4], results[j_test, 5]))


print("==========================================================================================")
# Mean over the K_cv folds goes into the reserved last row.
results[-1, :] = np.mean(results[0:-1, :], axis=0)
print("                                       %5.3f    %5.3f    %5.3f   %5.3f    %5.3f   %5.3f   " % (
    results[-1, 0], results[-1, 1], results[-1, 2], results[-1, 3],
    results[-1, 4], results[-1, 5]))

#
print("b_BEST_TEST: recall@k results: ")
print("================================")
res_recall[-1, :] = np.mean(res_recall[0:-1, :], axis=0)
for j in range(K_cv+1):
    for r in range(4):
        print("%5.3f  " % (res_recall[j, r]), end=' ')
    print()
    # Separator between the per-fold rows and the mean row.
    if j == K_cv-1:
        print("================================")
print("================================")
print()
print("b_ALL_TEST: recall@k results: ")
print("================================")
res_recall_all[-1, :] = np.mean(res_recall_all[0:-1, :], axis=0)
for j in range(K_cv+1):
    for r in range(4):
        print("%5.3f  " % (res_recall_all[j, r]), end=' ')
    print()
    if j == K_cv-1:
        print("================================")
print("================================")
#
# Confusion matrix


In [None]:
# Add up the per-fold confusion matrices into one matrix.
cc = res_conf_mat.sum(axis=0)
# NOTE(review): 5. looks like a hard-coded number of folds - confirm it
# matches K_cv.
print(cc.sum() / 5.)
#cc_a_lang_geo = np.copy(cc)
cc_aa_lang_geo = np.copy(cc)

In [None]:
print_confusion_mat(cc_aa_lang_geo)

In [None]:
print_confusion_mat(cc_a_lang_geo)

In [None]:
# Fixed split: folds 0-3 for training, fold 3 as dev, fold 4 as test.
setup_TDT_sets([0, 1, 2, 3], [3], [4])

scaler = StandardScaler()
# Column range of the geometric features, read from the fold-0 headings.
geo_range = X_best_fold['fold_0']['X_headings']['geometric']
col_0 = geo_range[0]
col_1 = geo_range[1]
#col_0 = 0
#col_1 = x_b_train.shape[1]
#print x_b_train[:,col_0: col_1].shape
# Fit on x_b_train only, then standardise the same columns of every
# matrix in place.
scaler.fit(x_b_train[:, col_0:col_1])  # training with best
for feat_mat in (x_b_train, x_b_test, x_a_train, x_a_test):
    feat_mat[:, col_0:col_1] = scaler.transform(feat_mat[:, col_0:col_1])
#print scaler.mean_
#print scaler.var_
# print 
# print "Size of x_b_train :", x_b_train.shape
# print "Size of y_b_train :", y_b_train.shape
# print "Size of x_b_test :", x_b_test.shape
# print "Size of y_b_test :", y_b_test.shape


In [None]:
# Fit a one-vs-rest, L2-regularised logistic regression on x_b_train.
my_C = 100.0
my_tol = 0.0001
model_lr_1 = LogisticRegression(C=my_C, penalty='l2', tol=my_tol, multi_class='ovr', random_state=1971)
model_lr_1.fit(x_b_train, y_b_train)
print(model_lr_1)
# predict probabilities for all classes



In [None]:
# Evaluate model_lr_1 on the train split, then on the test split.
y_p = model_lr_1.predict_proba(x_b_train)
y_test = y_b_train
y_all_test = Y_b_train['all_code']
Y_pred = get_pred_vector(y_p)
print("shape Y_pred :", Y_pred.shape)
# Fraction of samples whose prediction equals the label (difference == 0).
print('Train Overall RECALL =', sum(((Y_pred.ravel()-y_test)**2) == 0)/float((y_test.shape[0])))
print('Train Overall RECALL of all good preps =', get_acc_all(Y_pred, y_all_test))
print()
y_p = model_lr_1.predict_proba(x_b_test)
y_test = y_b_test
y_all_test = Y_b_test['all_code']
Y_pred = get_pred_vector(y_p)
print("shape Y_pred :", Y_pred.shape)
print('Test Overall RECALL =', sum(((Y_pred.ravel()-y_test)**2) == 0)/float((y_test.shape[0])))
print('Test Overall RECALL of all good preps =', get_acc_all(Y_pred, y_all_test))

In [None]:
# Train the classifier named by `model` and leave its class probabilities
# for the test data in y_p.
model = 'NN'
#
if model == 'kNN':
    model_kNN_1 = KNeighborsClassifier(n_neighbors=29)
    model_kNN_1.fit(X_train, Y_train.ravel())
    #print model_kNN_1
    # predict probabilities for all classes
    print("Testing Model :")
    y_p = model_kNN_1.predict_proba(X_test)

if model == 'NN':
    #my_C=100.0
    #my_tol=0.0001
    model_nn_1 = MLPClassifier(solver='lbfgs',
                               alpha=1e-5,
                               max_iter=500,
                               hidden_layer_sizes=(20, 10),
                               random_state=1971,
                               )
    model_nn_1.fit(x_b_train, y_b_train)
    print(model_nn_1)
    # predict probabilities for all classes
    print("Testing Model :", end=' ')
    #
    y_p = model_nn_1.predict_proba(x_b_test)
    #y_p = model_lr_1.predict_proba(X_valid)
    print("DONE")

if model == 'LR':
    my_C = 100.0
    my_tol = 0.0001
    model_lr_1 = LogisticRegression(C=my_C, penalty='l2', tol=my_tol, multi_class='ovr', random_state=1971)
    model_lr_1.fit(x_b_train, y_b_train)
    print(model_lr_1)
    # predict probabilities for all classes
    print("Testing Model :", end=' ')
    #
    y_p = model_lr_1.predict_proba(x_b_test)
    #y_p = model_lr_1.predict_proba(X_valid)
    print("DONE")
#
#
if model == 'DT':
    Tree_depth = 6
    model_dt_1 = tree.DecisionTreeClassifier(max_depth=Tree_depth, random_state=1971)
    model_dt_1.fit(X_train['X_model'], Y_train['Y_model'][:, 0].ravel())
    # predict probabilities for all classes
    print(model_dt_1)
    print("Testing Model")
    y_p = model_dt_1.predict_proba(X_test['X_model'])
#
#
if model == 'RF':
    model_rf_1 = RandomForestClassifier(n_estimators=80,  #40
                                        max_features=30,  #40
                                        max_depth=7,    #9
                                        #min_samples_split=40,
                                        min_samples_leaf=10,
                                        random_state=1971,
                                        )
    #model_rf_1.fit(x_b_train, y_b_train)
    model_rf_1.fit(x_a_train, y_a_train)
    #
    print(model_rf_1)
    print("Testing Model : ", end=' ')    # predict probabilities for all classes
    #y_p = model_rf_1.predict_proba(x_b_test)
    y_p = model_rf_1.predict_proba(x_a_test)
    print("DONE")

if model == 'SVM':
    model_svm_1 = svm.SVC(  C=1.0,
                            gamma=.01,
                            probability=True,
                            kernel='rbf',
                            tol=0.01,
                            # 'none' is not an accepted value in current
                            # scikit-learn ('ovo' or 'ovr' only). Only
                            # predict_proba is used below, which this
                            # option does not affect.
                            decision_function_shape='ovo',
                            random_state=1971,
                            #cache_size=1000
                            #class_weight='balanced' 
                            #penalty='l1', 
                            #fit_intercept=True, 
                            #warm_start=False
                            )
    model_svm_1.fit(X_train, Y_train.ravel())

    # predict probabilities for all classes
    y_p = model_svm_1.predict_proba(X_test)



In [None]:
# Overlap between model_nn_1 predictions and columns 18:35 of Y_model,
# first on the train split and then on the test split.
# NOTE(review): presumably columns 18:35 hold the multi-label targets - confirm.
y_p = model_nn_1.predict(x_b_train)
y_test = Y_b_train['Y_model'][:, 18:35]
print("train dataset size :", y_test.shape)
print("train: over all annotations :", np.sum(y_test*y_p)/float(np.sum(y_test)))
print("train: one which is good :", np.sum(y_test*y_p)/float(y_test.shape[0]))
#
y_p = model_nn_1.predict(x_b_test)
y_test = Y_b_test['Y_model'][:, 18:35]
print()
print("test dataset size :", y_test.shape)
print("test: over all annotations :", np.sum(y_test*y_p)/float(np.sum(y_test)))
print("test: one which is good :", np.sum(y_test*y_p)/float(y_test.shape[0]))
#
# y_test = Y_b_test['Y_model'][:,1:18]
# print
# print "test dataset size :",y_test.shape
# print "test: over all annotations :",np.sum(y_test*y_p)/float(np.sum(y_test))
# print "test: one which is good :",np.sum(y_test*y_p)/float(y_test.shape[0])

In [None]:
# Per-row ratio: sum of y_test*y_p over sum of y_test, along axis 1.
a = np.sum(y_test*y_p, axis=1)/np.sum(y_test, axis=1)
# Total of y_test divided by the number of rows, then the mean of the ratios.
print(np.sum(y_test)/a.shape[0])
print(np.sum(a)/a.shape[0])

In [None]:
y_b_train[0:3]

In [None]:
y_b_test[0:3]

In [None]:
y_p[0:3]

In [None]:
y_test = Y_b_test['Y_model'][:, 18:35]
# NOTE(review): this squares the summed difference; if a sum of squared
# errors was intended it should be np.sum((y_test - y_p)**2) - confirm.
print(np.sum(y_test - y_p)**2)
print(np.sum(y_test))
print("over all annotations :", np.sum(y_test*y_p)/float(np.sum(y_test)))
print("one which is good :", np.sum(y_test*y_p)/float(y_test.shape[0]))

In [None]:
# Evaluate model_nn_1 on the train split, then on the test split.
y_p = model_nn_1.predict_proba(x_b_train)
# NOTE(review): the train part compares against the 18:35 column slice of
# Y_model while the test part below uses column 0 - confirm this is intended.
y_test = Y_b_train['Y_model'][:, 18:35]
#y_all_test = Y_b_train['all_code']
Y_pred = get_pred_vector(y_p)
print("shape Y_pred :", Y_pred.shape)
print('Train Overall RECALL =', sum(((Y_pred.ravel()-y_test)**2) == 0)/float((y_test.shape[0])))
# NOTE(review): y_all_test is not reassigned for the train split here (the
# assignment above is commented out), so this uses whatever an earlier
# cell left in it - confirm.
print('Train Overall RECALL of all good preps =', get_acc_all(Y_pred, y_all_test))
print()
y_p = model_nn_1.predict_proba(x_b_test)
y_test = Y_b_test['Y_model'][:, 0].ravel()
y_all_test = Y_b_test['all_code']
Y_pred = get_pred_vector(y_p)
print("shape Y_pred :", Y_pred.shape)
print('Test Overall RECALL =', sum(((Y_pred.ravel()-y_test)**2) == 0)/float((y_test.shape[0])))
print('Test Overall RECALL of all good preps =', get_acc_all(Y_pred, y_all_test))

In [None]:
# For the first 10 rows, show the Y_model entry in the column selected by
# the first prediction (offset by 18).
for i in range(10):
    print(i, Y_b_train['Y_model'][i, int(Y_pred[i][0])+18])


In [None]:
# Uses the index `i` left over from the previous cell's loop.
Y_b_train['Y_model'][i, 18:35]
print(Y_pred[i][0])

In [None]:
# NOTE(review): with the next line commented out, the train metrics use
# whatever y_p an earlier cell produced - confirm it matches x_b_train.
#y_p = model_nn_1.predict_proba(x_b_train)
y_test = y_b_train
y_all_test = Y_b_train['all_code']
Y_pred = get_pred_vector(y_p)
print("shape Y_pred :", Y_pred.shape)
print('Train Overall RECALL =', sum(((Y_pred.ravel()-y_test)**2) == 0)/float((y_test.shape[0])))
print('Train Overall RECALL of all good preps =', get_acc_all(Y_pred, y_all_test))
print()
y_p = model_nn_1.predict_proba(x_b_test)
y_test = y_b_test
y_all_test = Y_b_test['all_code']
Y_pred = get_pred_vector(y_p)
print("shape Y_pred :", Y_pred.shape)
print('Test Overall RECALL =', sum(((Y_pred.ravel()-y_test)**2) == 0)/float((y_test.shape[0])))
print('Test Overall RECALL of all good preps =', get_acc_all(Y_pred, y_all_test))

In [None]:
# Pick the predictions and targets that the following cells report on.
# Targets first, then the probabilities from the neural-network model.
y_test, y_all_test = y_b_test, Y_b_test['all_code']
y_p = model_nn_1.predict_proba(x_b_test)
#y_p = model_nn_1.predict_proba(x_b_train)
#y_p = model_rf_1.predict_proba(x_b_test)
#
#y_p = model_lr_1.predict_proba(x_a_test)
# y_p = model_rf_1.predict_proba(x_a_test)
# y_test = y_a_test
# y_all_test = Y_a_test['all_code']

In [None]:
# Shapes/sizes of the current evaluation inputs.
print(y_p.shape)
print(y_test.shape)
print(len(y_all_test))

In [None]:
# Report recall/precision figures for the current y_p / y_test / y_all_test.
Y_pred = get_pred_vector(y_p)
print('Overall RECALL =', sum(((Y_pred.ravel()-y_test)**2) == 0)/float((y_test.shape[0])))
#print 'Overall RECALL =',sum(((Y_pred-Y_valid)**2)==0)/float(len(Y_valid))

print('Overall RECALL of all good preps =', get_acc_all(Y_pred, y_all_test))
print('RECALL@k : %2d:%5.4f,%2d:%5.4f,%2d:%5.4f,%2d:%5.4f' % (
    1, get_recall_at_k(Y_pred, y_test, 1),
    2, get_recall_at_k(Y_pred, y_test, 2),
    3, get_recall_at_k(Y_pred, y_test, 3),
    4, get_recall_at_k(Y_pred, y_test, 4)))

# ff=precision_recall_fscore_support(Y_test, Y_pred,average=None)
# #
# # recall_per_prep.append(ff[1])
# # recall_micro.append(precision_recall_fscore_support(y_test, Y_pred,average='micro')[1])
# # recall_macro.append(precision_recall_fscore_support(y_test, Y_pred,average='macro')[1])

# print 'recall'
# output_vector(ff[1])
# output_vector(ff[0])
# output_vector(ff[2])

# One call per averaging mode; each returns
# (precision, recall, fscore, support).
prfs_micro = precision_recall_fscore_support(y_test, Y_pred, average='micro')
prfs_macro = precision_recall_fscore_support(y_test, Y_pred, average='macro')
prfs_weighted = precision_recall_fscore_support(y_test, Y_pred, average='weighted')

print('average recall micro    :', prfs_micro[1])
print('average recall macro    :', prfs_macro[1])
print('average recall weighted :', prfs_weighted[1])
# #
print('average precision micro    :', prfs_micro[0])
print('average precision macro    :', prfs_macro[0])
print('average precision weighted :', prfs_weighted[0])
# #
# print 'my precision @ k           :',get_precision_at_k(y_p,Y_test,2)
# This line prints the whole weighted tuple, not only the F-score entry.
print('F-score                    :', prfs_weighted)
# #     print 'Overall RECALL of all good preps =',get_acc_all(Y_pred,y_test_all)
# #     print 'Overall RECALL of all good per prep() =',idx2prep[9],get_acc_all_per_prep(9,Y_pred,y_test_all)
# #     print 'Overall RECALL of all good per prep() =',idx2prep[8],get_acc_all_per_prep(8,Y_pred,y_test_all)
# print "#"

In [None]:
(Y_pred.ravel()-y_test).shape

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [None]:
scaler.fit(x_b_train)

In [None]:
# Per-feature mean and variance learned by the fitted scaler.
print(scaler.mean_)
print(scaler.var_)

In [None]:
Y_test['Y_model']

In [None]:
len(Y_test['all_code'])

In [None]:
Y_pred.shape

In [None]:
# Class probabilities for every row of x_test except the last one.
pp = model_lr_1.predict_proba(x_test[0:-1])
print(pp.shape)

In [None]:
x_test[0]

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler_all = StandardScaler()
import numpy as np

In [None]:
# Fit only to the training data
X_train=np.array([[1,0,0,0,.8,1.5],[0,0,1,0,-1.2,3.4],
                  [0,1,0,0,2.2,-2.1],[0,0,0,1,-1.2,0.8],[1,0,0,1,-.2,-2.2]], dtype=float)
scaler.fit(X_train[:,4:6])
scaler_all.fit(X_train)

In [None]:
# Mean and variance learned by `scaler`.
print(scaler.mean_)
print(scaler.var_)

In [None]:
# Now apply the transformations to the data:
X_train_s = scaler.transform(X_train[:,4:6])
X_train_a = scaler_all.transform(X_train)

In [None]:
X_train

In [None]:
X_train_s

In [None]:
X_train_a[:,4:]-X_train_s

In [None]:
a=np.array([[9.,2,3],[4,1,6],[7,8,4]])

In [None]:
a

In [None]:
# In-place sort along axis 0: each column is sorted independently.
a.sort(axis=0)
print(a)

In [None]:
# Sort by field f0 of a three-field int64 view of `a`.
# np.int was removed in NumPy 1.24; np.int64 matches the 'i8' fields.
np.sort(a.view('i8,i8,i8'), order=['f0'], axis=0).view(np.int64)

In [None]:
# Same as the previous cell, ordered by field f1.
# np.int was removed in NumPy 1.24; np.int64 matches the 'i8' fields.
b = np.sort(a.view('i8,i8,i8'), order=['f1'], axis=0).view(np.int64)

In [None]:
type(a)

In [None]:
np.sort(a.view('f8,f8,f8'), order=['f1'], axis=0).view(np.float64)

In [None]:
# print sizes for train/dev/test sets
print()
print("Size of x_b_train :", x_b_train.shape)
print("Size of y_b_train :", y_b_train.shape)
print("Size of x_b_dev :", x_b_dev.shape)
print("Size of y_b_dev :", y_b_dev.shape)
print("Size of x_b_test :", x_b_test.shape)
print("Size of y_b_test :", y_b_test.shape)

print()
print("Size of x_a_train :", x_a_train.shape)
print("Size of y_a_train :", y_a_train.shape)
print("Size of x_a_dev :", x_a_dev.shape)
print("Size of y_a_dev :", y_a_dev.shape)
print("Size of x_a_test :", x_a_test.shape)
print("Size of y_a_test :", y_a_test.shape)
