In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve

In [2]:
# Prediction summary (one row per image) and the probability table that goes with it.
label = pd.read_csv('./HasEmpty/Test_Prediction.txt', header=None)
prob = pd.read_csv('./HasEmpty/PredictionsOutput.txt', sep=" ", header=None)
# File-name / class-id listings for the two splits.
# NOTE(review): this cell previously read the *_test_random file into `train_data`
# and the *_train_random file into `test_data`; each name now loads its matching file.
train_data = pd.read_csv('./HasEmpty/TNC2_FileName_ID_wo_unknow_FullClass_train_random.txt', sep="\t", header=None)
test_data = pd.read_csv('./HasEmpty/TNC2_FileName_ID_wo_unknow_FullClass_test_random.txt', sep="\t", header=None)

In [3]:
# The input files have no header row, so name the columns explicitly.
label.columns = ['filename', 'true_label', 'pred_label', 'max_prob']
for split_frame in (train_data, test_data):
    # Both split listings share the same two-column layout.
    split_frame.columns = ['filename', 'true_label']
# `prob` keeps its default integer column labels.

In [4]:
# Distinct class ids present in the training listing, in ascending order.
train_categories = sorted(train_data.true_label.unique())
print("train data has following categorie:\n {}.".format(train_categories))

train data has following categorie:
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41].


In [5]:
# label.groupby(['true_label'])['col3'].mean()

In [6]:
# Preview the first five rows of the prediction summary.
label.head(n=5)

Unnamed: 0,filename,true_label,pred_label,max_prob
0,L-LJS17-E9B-0177.JPG,7,7,0.285
1,L-TC17-H12A-0195.JPG,6,4,0.983
2,L-YL17-I41A-0149.JPG,1,3,0.512
3,L-TC17-H12A-0084.JPG,6,1,0.997
4,L-YL17-I42A-0164.JPG,3,3,0.967


In [7]:
# Preview the first five rows of the probability table.
prob.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32,33,34,35,36,37,38,39,40,41
0,0.001,0.001,0.005,0.214,0.015,0.003,0.091,0.285,0.004,0.004,...,0.001,0.003,0.001,0.001,0.002,0.001,0.001,0.002,0.001,0.003
1,0.0,0.001,0.0,0.0,0.983,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.002,0.441,0.512,0.004,0.0,0.0,0.002,0.0,0.002,...,0.0,0.001,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.0
3,0.0,0.997,0.0,0.0,0.003,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.031,0.967,0.0,0.0,0.0,0.0,0.0,0.002,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Binary target: 1 = top-n classes (including empty); 0 = other animals

In [12]:
def get_top_label(row, top_labels=(1, 4, 5, 10, 14, 20)):
    """Binary target for one row: 1 if its true label is a "top" class, else 0.

    Parameters
    ----------
    row : object with a ``true_label`` attribute
        For example a row handed over by ``DataFrame.apply(..., axis=1)``.
    top_labels : collection of class ids, optional
        Class ids that map to 1. Defaults to the ids that were previously
        hard-coded in this function, so existing callers are unaffected.

    Returns
    -------
    int
        1 when ``row.true_label`` is one of ``top_labels``, otherwise 0.
    """
    # Known ids from the original note -- 10: mouse; 20: squirrel.
    return 1 if row.true_label in top_labels else 0

In [14]:
# Derive the binary target row by row, then preview the augmented frame.
label['is_top_label'] = label.apply(get_top_label, axis=1)
label.head()

Unnamed: 0,filename,true_label,pred_label,max_prob,is_top_label
0,L-LJS17-E9B-0177.JPG,7,7,0.285,0
1,L-TC17-H12A-0195.JPG,6,4,0.983,0
2,L-YL17-I41A-0149.JPG,1,3,0.512,1
3,L-TC17-H12A-0084.JPG,6,1,0.997,0
4,L-YL17-I42A-0164.JPG,3,3,0.967,0


In [24]:
# Number of rows in each class of the binary target.
other_count = int((label.is_top_label == 0).sum())
top_count = int((label.is_top_label == 1).sum())
print("Other: {}".format(other_count))
print("top n: {}".format(top_count))

Other: 770
top n: 658


## Simple model

In [16]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    prob,
    label.is_top_label,
    test_size=0.3,
    random_state=0,
)

In [18]:
def compute_auc_score(X, y, model):
    """Area under the ROC curve of ``model`` on ``(X, y)`` (larger is better).

    Intended for evaluating a model when the classes are roughly balanced.

    Parameters
    ----------
    X : array
        Feature array passed to ``model.predict_proba``.
    y : array
        True label array.
    model : trained model exposing ``predict_proba``
        For example a fitted logistic regression.

    Returns
    -------
    float
        Area under the ROC curve.
    """
    # Column 1 of predict_proba is used as the score handed to roc_curve.
    positive_scores = model.predict_proba(X)[:, 1]
    false_pos_rate, true_pos_rate, _ = roc_curve(y, positive_scores)
    return auc(false_pos_rate, true_pos_rate)

def norm_conf_matrix(Xtest, ytest, model, normalize=False):
    """Confusion matrix of ``model.predict(Xtest)`` against ``ytest``.

    Despite the function's name, raw counts are returned by default, which
    is what this function has always done. Pass ``normalize=True`` to get
    each row divided by its total instead.

    Parameters
    ----------
    Xtest : array
        Feature array passed to ``model.predict``.
    ytest : array
        True label array.
    model : trained model exposing ``predict``
    normalize : bool, optional
        When True, return row-normalized rates rather than counts.

    Returns
    -------
    ndarray
        Confusion matrix as counts (default) or as per-row fractions.
    """
    counts = confusion_matrix(ytest, model.predict(Xtest))
    if not normalize:
        return counts
    row_totals = counts.sum(axis=1, keepdims=True)
    # Rows that sum to zero stay all-zero instead of turning into NaN.
    return np.divide(counts, row_totals,
                     out=np.zeros(counts.shape, dtype=float),
                     where=row_totals != 0)

def custom_threshold(Xtest, model, threshold):
    """Predict labels with a caller-chosen probability cut-off instead of .5.

    A sample is flagged True when column 1 of ``model.predict_proba(Xtest)``
    is strictly greater than ``threshold``.

    Example:
        y_label = custom_threshold(Xtrain, model, 0.8)
    """
    positive_scores = model.predict_proba(Xtest)[:, 1]
    return positive_scores > threshold

In [33]:
def cross_validate_helper(c, train_x, train_y, test_x, test_y):
    """Fit a class-balanced L2 logistic regression with ``C=c`` and print its scores.

    Prints, in order: a blank line, the value of ``c``, the precision /
    recall / f-score of the negative class on ``(test_x, test_y)``, and the
    confusion matrix obtained with a 0.5 probability threshold.
    Nothing is returned.
    """
    print()
    print("c is {}".format(c))

    clf = LogisticRegression(penalty='l2', C=c, class_weight='balanced')
    clf.fit(train_x, train_y)

    # Per-class metric arrays; entry 0 is reported below as the negative class.
    precision, recall, fscore, _support = precision_recall_fscore_support(
        test_y, clf.predict(test_x))
    print('Negative class precision:%.3f, recall:%.3f and f-score: %.3f'%
          (precision[0], recall[0], fscore[0]))

    matrix = confusion_matrix(test_y, custom_threshold(test_x, clf, 0.5))
    print(matrix)

In [34]:
# Try a range of C values and print the hold-out metrics for each one.
c_grid = [0.01, 0.1, 0.5, 1, 10, 50, 100, 500, 1000]
for c in c_grid:
    cross_validate_helper(c, X_train, y_train, X_test, y_test)


c is 0.01
Negative class precision:0.661, recall:0.655 and f-score: 0.658
[[146  77]
 [ 75 131]]

c is 0.1
Negative class precision:0.691, recall:0.722 and f-score: 0.706
[[161  62]
 [ 72 134]]

c is 0.5
Negative class precision:0.694, recall:0.753 and f-score: 0.723
[[168  55]
 [ 74 132]]

c is 1
Negative class precision:0.697, recall:0.762 and f-score: 0.728
[[170  53]
 [ 74 132]]

c is 10
Negative class precision:0.695, recall:0.767 and f-score: 0.729
[[171  52]
 [ 75 131]]

c is 50
Negative class precision:0.699, recall:0.771 and f-score: 0.733
[[172  51]
 [ 74 132]]

c is 100
Negative class precision:0.699, recall:0.771 and f-score: 0.733
[[172  51]
 [ 74 132]]

c is 500
Negative class precision:0.699, recall:0.780 and f-score: 0.737
[[174  49]
 [ 75 131]]

c is 1000
Negative class precision:0.702, recall:0.780 and f-score: 0.739
[[174  49]
 [ 74 132]]


## Actual model

In [35]:
# Final classifier: class-balanced L2 logistic regression with C=1000.
# fit() returns the estimator itself, so construction and fitting are chained.
logistic_model = LogisticRegression(
    penalty='l2', C=1000, class_weight='balanced'
).fit(X_train, y_train)
logistic_model

LogisticRegression(C=1000, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [50]:
for thred in [0.5, 0.65, 0.8]:
    cm = confusion_matrix(y_test, custom_threshold(X_test, logistic_model, thred))
    # Row 0 holds the samples whose true label is 0; column 0 those predicted as 0.
    class0_recall = cm[0, 0] / cm[0, :].sum()
    class0_precision = cm[0, 0] / cm[:, 0].sum()
    print()
    print("threshold: {}".format(thred))
    print("Of all has animal, how much we returned: %.3f"%class0_recall)
    print("Of all returned photo we predicted as has animal, how many do have animal: %.3f"%class0_precision)



threshold: 0.5
Of all has animal, how much we returned: 0.780
Of all returned photo we predicted as has animal, how many do have animal: 0.702

threshold: 0.65
Of all has animal, how much we returned: 0.852
Of all returned photo we predicted as has animal, how many do have animal: 0.674

threshold: 0.8
Of all has animal, how much we returned: 0.946
Of all returned photo we predicted as has animal, how many do have animal: 0.596
