In [67]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve

In [37]:
# Read the four input text files.
# header=None: the first line of each file is read as data, so every frame
# starts out with integer column labels.
label = pd.read_csv("Prediction.txt", header=None)
prob = pd.read_csv("Prob.txt", header=None, sep=" ")
train_data = pd.read_csv("Train.txt", header=None, sep="\t")
test_data = pd.read_csv("Test.txt", header=None, sep="\t")

In [39]:
# Give the header-less frames readable column names.
train_data.columns = ["filename", "true_label"]
test_data.columns = ["filename", "true_label"]
label.columns = ["filename", "true_label", "pred_label", "max_prob"]
# `prob` keeps its default integer column labels. The rename below is left
# disabled; as written it cannot work because DataFrame.size is an int
# (shape[1] would give the column count):
# prob.columns = [str(num+1) for num in range(prob.size[1])]

In [50]:
# Show every distinct class id found in the training listing, ascending.
print(
    "train data has following categorie:\n {}.".format(
        sorted(train_data["true_label"].unique())
    )
)

train data has following categorie:
 [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40].


In [26]:
# Preview the first rows of the prediction table.
label.head()

Unnamed: 0,filename,true_label,pred_label,max_prob
0,L-YL17-H40A-0853.JPG,2,3,0.512
1,L-TC17-H10A1-0738.JPG,4,2,0.858
2,L-LJS17-E9B-0208.JPG,7,5,0.295
3,L-YL17-H40A-1124.JPG,3,3,1.0
4,L-TC17-H10A-0673.JPG,10,4,0.81


In [27]:
# Preview the first rows of the per-class probability table.
prob.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,31,32,33,34,35,36,37,38,39,40
0,0.0,0.001,0.481,0.512,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.858,0.004,0.134,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.029,0.019,0.01,0.295,0.0,0.247,0.002,0.001,...,0.001,0.0,0.0,0.002,0.0,0.0,0.0,0.0,0.001,0.026
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.015,0.067,0.81,0.001,0.031,0.0,0.0,0.008,...,0.0,0.0,0.0,0.012,0.001,0.0,0.0,0.0,0.001,0.0


In [87]:
def get_top_label(row, top_labels=(4, 5, 14, 20, 10)):
    """Flag whether a row's true label belongs to the "top" label set.

    Parameters
    ----------
    row: object exposing a ``true_label`` attribute (e.g. a DataFrame row)
        Record to classify.
    top_labels: collection of label values, optional
        Labels that count as "top". Defaults to the five class ids that
        were previously hard-coded in the comparison chain.

    Returns
    -------
    int
        1 if ``row.true_label`` is one of ``top_labels``, otherwise 0.
    """
    return 1 if row.true_label in top_labels else 0

In [88]:
# Add a 0/1 column marking rows whose true label is in the top-label group.
# get_top_label already takes a single row, so it is passed to apply directly.
label["is_top_label"] = label.apply(get_top_label, axis=1)

In [89]:
# Preview the table again to check the new is_top_label column.
label.head()

Unnamed: 0,filename,true_label,pred_label,max_prob,is_top_label
0,L-YL17-H40A-0853.JPG,2,3,0.512,0
1,L-TC17-H10A1-0738.JPG,4,2,0.858,1
2,L-LJS17-E9B-0208.JPG,7,5,0.295,0
3,L-YL17-H40A-1124.JPG,3,3,1.0,0
4,L-TC17-H10A-0673.JPG,10,4,0.81,1


In [90]:
# Report how many rows fall outside / inside the top-label group.
print("Non major: {}".format(len(label[label["is_top_label"] == 0])))
print("Major: {}".format(len(label[label["is_top_label"] == 1])))

Non major: 770
Major: 195


In [91]:
# Hold out 30% of the rows for evaluation; random_state pins the shuffle.
# NOTE(review): no `stratify` argument is passed, so the class ratio in the
# two splits is left to the shuffle - consider stratifying on the target.
X_train, X_test, y_train, y_test = train_test_split(
    prob,
    label["is_top_label"],
    test_size=0.3,
    random_state=0,
)

In [136]:
# logistic_model = LogisticRegression(penalty='l2', C=10) #class_weight = 'balanced'
# logistic_model.fit(X_train,y_train)

In [125]:
def compute_auc_score(X, y, model):
    """Return the area under the ROC curve of ``model`` on ``(X, y)``.

    Larger values are better. Intended for evaluating a model whose
    classes are almost balanced.

    Parameters
    ----------
    X: array
        Feature array
    y: array
        True label array
    model: logistic regression model (or other model)
        Trained model exposing ``predict_proba``

    Returns
    -------
    double
        area under curve
    """
    # Column 1 of predict_proba is used as the score of the positive class.
    positive_scores = model.predict_proba(X)[:, 1]
    # roc_curve also returns the thresholds, which are not needed here.
    false_pos_rate, true_pos_rate, _ = roc_curve(y, positive_scores)
    return auc(false_pos_rate, true_pos_rate)

def norm_conf_matrix(Xtest, ytest, model, normalize=False):
    """Compute the confusion matrix of ``model`` on ``(Xtest, ytest)``.

    Parameters
    ----------
    Xtest: array
        Feature array passed to ``model.predict``
    ytest: array
        True label array
    model: trained model exposing ``predict``
    normalize: bool, optional
        When False (default) the raw count matrix is returned, which is
        what this function has always returned. When True each row is
        divided by its row total, so the diagonal holds the per-class
        (positive/negative) accuracy.

    Returns
    -------
    array
        Confusion matrix: integer counts, or float row fractions when
        ``normalize`` is True.
    """
    cm_1 = confusion_matrix(ytest, model.predict(Xtest))
    if not normalize:
        return cm_1

    cm_float = cm_1.astype('float')
    row_totals = cm_float.sum(axis=1, keepdims=True)
    # A class with no true samples has a zero row total; leave that row at 0
    # instead of dividing by zero.
    return np.divide(cm_float, row_totals,
                     out=np.zeros_like(cm_float), where=row_totals != 0)

def custom_threshold(Xtest, model, threshold):
    """Predict labels with a caller-chosen probability cut-off instead of .5

    A sample is flagged when column 1 of ``model.predict_proba(Xtest)`` is
    strictly greater than ``threshold``; the result is a boolean array.

    Example:
        y_label = custom_threshold(Xtrain,model,0.8)
    """
    positive_prob = model.predict_proba(Xtest)[:, 1]
    return positive_prob > threshold

In [137]:
# print("Test accuracy:{0}".format(logistic_model.score(X_test, y_test)))
# print(norm_conf_matrix(X_test, y_test, logistic_model))
# print('Postive class precision:%.2f, recall:%.2f and f-score: %.2f'%
#       (precision_recall_fscore_support(y_test,logistic_model.predict(X_test))[0][1],
#        precision_recall_fscore_support(y_test,logistic_model.predict(X_test))[1][1],
#        precision_recall_fscore_support(y_test,logistic_model.predict(X_test))[2][1]))

In [138]:
# precision_recall_fscore_support(y_test,logistic_model.predict(X_test))

In [133]:
def cross_validate_helper(c, train_x, train_y, test_x, test_y):
    """Fit an L2-penalised logistic regression with ``C=c`` and print its scores.

    The model is fitted on ``(train_x, train_y)``. For ``(test_x, test_y)``
    the function prints precision, recall and f-score of the class at
    index 1, followed by the confusion matrix at a 0.5 probability cut-off.
    Nothing is returned.
    """
    print()

    print("c is {}".format(c))
    clf = LogisticRegression(penalty='l2', C=c)  # class_weight='balanced' is left disabled
    clf.fit(train_x, train_y)

    # Model evaluation. The AUC report is currently disabled:
    #     auc = compute_auc_score(test_x,test_y, clf)
    #     print("AUC validation score: %.2f"%(auc))
    precision, recall, fscore, _ = precision_recall_fscore_support(
        test_y, clf.predict(test_x))
    print('Postive class precision:%.3f, recall:%.3f and f-score: %.3f'%
          (precision[1], recall[1], fscore[1]))

    print(confusion_matrix(test_y, custom_threshold(test_x, clf, 0.5)))

In [135]:
# Sweep the inverse regularisation strength C and print the metrics for each.
# NOTE(review): every candidate is scored on X_test / y_test, so choosing C
# from this output uses the held-out split for model selection.
for c in (0.01, 0.1, 0.5, 1, 10, 50, 100):
    cross_validate_helper(c, X_train, y_train, X_test, y_test)


c is 0.01
Postive class precision:0.000, recall:0.000 and f-score: 0.000
[[224   0]
 [ 66   0]]

c is 0.1
Postive class precision:0.952, recall:0.303 and f-score: 0.460
[[223   1]
 [ 46  20]]

c is 0.5
Postive class precision:0.931, recall:0.409 and f-score: 0.568
[[222   2]
 [ 39  27]]

c is 1
Postive class precision:0.879, recall:0.439 and f-score: 0.586
[[220   4]
 [ 37  29]]

c is 10
Postive class precision:0.889, recall:0.606 and f-score: 0.721
[[219   5]
 [ 26  40]]

c is 50
Postive class precision:0.870, recall:0.606 and f-score: 0.714
[[218   6]
 [ 26  40]]

c is 100
Postive class precision:0.851, recall:0.606 and f-score: 0.708
[[217   7]
 [ 26  40]]


  'precision', 'predicted', average, warn_for)
