In [1]:
import pandas as pd 
import numpy as np
from sklearn import preprocessing

# load and split
train_data= pd.read_csv('MSLR-WEB10K/Fold1/train.txt', header=None)
vali_data = pd.read_csv('MSLR-WEB10K/Fold1/vali.txt', header=None)
test_data = pd.read_csv('MSLR-WEB10K/Fold1/test.txt', header=None)
train_df = pd.DataFrame(train_data[0].str.split(' ',2).tolist(),columns = ['relevancy','qid','features'])
vali_df = pd.DataFrame(vali_data[0].str.split(' ',2).tolist(),columns = ['relevancy','qid','features'])
test_df = pd.DataFrame(test_data[0].str.split(' ',2).tolist(),columns = ['relevancy','qid','features'])

In [2]:
# splits 136 features
features = list(range(1, 137))
train_feature_df = pd.DataFrame(train_df['features'].str.split(' ',135).tolist(), columns = features)
vali_feature_df = pd.DataFrame(vali_df['features'].str.split(' ',135).tolist(), columns = features)
test_feature_df = pd.DataFrame(test_df['features'].str.split(' ',135).tolist(), columns = features)

In [3]:
# drop old feature column
train_df = train_df.drop(['features'], axis=1)
vali_df = vali_df.drop(['features'], axis=1)
test_df = test_df.drop(['features'], axis=1)

# add new divided feature column to main df
train_df = pd.concat([train_df,train_feature_df], axis=1)
vali_df = pd.concat([vali_df,vali_feature_df], axis=1)
test_df = pd.concat([test_df,test_feature_df], axis=1)

In [4]:
# rel type int
train_df.relevancy = train_df.relevancy.astype(np.int64)
vali_df.relevancy = vali_df.relevancy.astype(np.int64)
test_df.relevancy = test_df.relevancy.astype(np.int64)

In [5]:
train_feat = train_df.columns[2:138]
vali_feat = vali_df.columns[2:138]
test_feat = test_df.columns[2:138]

In [6]:
train_df_all_feat = train_df[train_feat]
vali_df_all_feat = vali_df[vali_feat]
test_df_all_feat = test_df[test_feat]

# convert values to integers (removes : )
# + prepares input for lr
le = preprocessing.LabelEncoder()

train_df_all_feat = train_df_all_feat.apply(le.fit_transform)
vali_df_all_feat = vali_df_all_feat.apply(le.fit_transform)
test_df_all_feat = test_df_all_feat.apply(le.fit_transform)

#train_df_small_feat = train_df_all_feat[1,3].values

train_df_small_feat = train_df_all_feat.drop([2,4,5,6,7,8,9,10,11,12,13,14,15,17,19,20,22,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,47,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99], axis=1)
vali_df_small_feat = vali_df_all_feat.drop([2,4,5,6,7,8,9,10,11,12,13,14,15,17,19,20,22,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,47,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99], axis=1)
test_df_small_feat = test_df_all_feat.drop([2,4,5,6,7,8,9,10,11,12,13,14,15,17,19,20,22,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,47,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70,76,77,78,79,80,81,82,83,84,85,86,87,88,89,90,91,92,93,94,95,96,97,98,99], axis=1)

In [7]:
train_df_targets = np.array(train_df.relevancy).T
vali_df_targets = np.array(vali_df.relevancy).T

In [8]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(train_df_small_feat, train_df_targets)

pd.set_option('display.float_format', lambda x: '%.3f' % x)

pred_rel = []
highest_prob = []
rel_prob = []

for i in range(0, len(vali_df)):
    five_prob=[]
    query = vali_df_small_feat._slice(slice(i,i+1))    
    rel = query.iloc[0]

    query_df_all_feat = query[query.columns.values]

    predicted_rel = lr.predict(query_df_all_feat)
    pred_rel.append(predicted_rel)
    predicted_rel_prob = lr.predict_proba(query_df_all_feat)
    
    rel_prob.append(predicted_rel_prob[0])
    five_prob.append(predicted_rel_prob[0][0])
    five_prob.append(predicted_rel_prob[0][1])
    five_prob.append(predicted_rel_prob[0][2])
    five_prob.append(predicted_rel_prob[0][3])
    five_prob.append(predicted_rel_prob[0][4])
    highest_prob.append(max(five_prob))
    five_prob = []
df_pred_rel = pd.DataFrame(pred_rel)
df_rel_prob = pd.DataFrame(rel_prob)

pd_highest_prob = pd.DataFrame(highest_prob, columns = ['highest'])

In [9]:
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support, log_loss
from sklearn import metrics

print("Accuracy Score LR:", accuracy_score(vali_df_targets, df_pred_rel))
print("Confusion Matrix LR: \n", confusion_matrix(vali_df_targets, df_pred_rel))
print(metrics.classification_report(vali_df_targets, df_pred_rel))

Accuracy Score LR: 0.531346303436
Confusion Matrix LR: 
 [[107856  13517    143      0      6]
 [ 58716  16777    313      0      9]
 [ 22279   9258    368      0      5]
 [  2829   1298     79      0      3]
 [  1194    552     54      0      3]]
             precision    recall  f1-score   support

          0       0.56      0.89      0.69    121522
          1       0.41      0.22      0.29     75815
          2       0.38      0.01      0.02     31910
          3       0.00      0.00      0.00      4209
          4       0.12      0.00      0.00      1803

avg / total       0.47      0.53      0.45    235259



  'precision', 'predicted', average, warn_for)


In [10]:
df_pred_rel.head(10)
#df_rel_prob.head(100) # <-- prob vers of above x 5

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
5,1
6,1
7,0
8,0
9,0


In [11]:
# str qid -> int qid

train_df['qid'] = train_df.qid.str.replace('qid:' , '').astype('int')
vali_df['qid'] = vali_df.qid.str.replace('qid:' , '').astype('int')
test_df['qid'] = test_df.qid.str.replace('qid:' , '').astype('int')



In [12]:
vali_df.head(100)

Unnamed: 0,relevancy,qid,1,2,3,4,5,6,7,8,...,127,128,129,130,131,132,133,134,135,136
0,0,10,1:2,2:0,3:0,4:0,5:2,6:0.666667,7:0,8:0,...,127:45,128:1,129:0,130:117,131:55115,132:7,133:2,134:0,135:0,136:0
1,0,10,1:1,2:0,3:1,4:3,5:3,6:0.333333,7:0,8:0.333333,...,127:76,128:0,129:0,130:153,131:3866,132:17,133:104,134:0,135:0,136:0
2,1,10,1:3,2:0,3:3,4:0,5:3,6:1,7:0,8:1,...,127:73,128:0,129:9,130:266,131:56137,132:5,133:2,134:0,135:0,136:0
3,0,10,1:3,2:0,3:2,4:0,5:3,6:1,7:0,8:0.666667,...,127:54,128:8,129:0,130:541,131:12621,132:11,133:11,134:0,135:0,136:0
4,1,10,1:3,2:0,3:3,4:0,5:3,6:1,7:0,8:1,...,127:36,128:6,129:0,130:14687,131:40205,132:5,133:3,134:0,135:0,136:0
5,2,10,1:3,2:0,3:3,4:1,5:3,6:1,7:0,8:1,...,127:21,128:1896,129:1,130:10577,131:34605,132:1,133:1,134:2175,135:273,136:79.6706653092874
6,1,10,1:3,2:0,3:3,4:1,5:3,6:1,7:0,8:1,...,127:21,128:0,129:4,130:3736,131:34605,132:1,133:1,134:0,135:0,136:0
7,1,10,1:3,2:0,3:3,4:0,5:3,6:1,7:0,8:1,...,127:16,128:0,129:1,130:11715,131:34605,132:31,133:1,134:0,135:3,136:16.2
8,0,10,1:0,2:0,3:0,4:0,5:0,6:0,7:0,8:0,...,127:35,128:7,129:0,130:11919,131:34605,132:84,133:1,134:0,135:0,136:0
9,0,10,1:2,2:0,3:1,4:0,5:2,6:0.666667,7:0,8:0.333333,...,127:16,128:0,129:11,130:63657,131:39496,132:10,133:1,134:0,135:218,136:55.0699459783914


In [13]:
train_df.columns.values

array(['relevancy', 'qid', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
       15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
       32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
       49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
       66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82,
       83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
       100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
       113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
       126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136], dtype=object)

In [14]:
# matrix
np.set_printoptions(suppress=True)

matrix_rel = df_rel_prob.as_matrix()
matrix_rel[1]

array([ 0.37778474,  0.31910488,  0.19971926,  0.06782161,  0.03556951])

In [15]:
vali_predict = vali_df.copy()
vali_predict['relevancy'] = df_pred_rel
vali_predict = pd.concat([vali_predict, pd_highest_prob], axis=1)
vali_predict.head(10)

Unnamed: 0,relevancy,qid,1,2,3,4,5,6,7,8,...,128,129,130,131,132,133,134,135,136,highest
0,0,10,1:2,2:0,3:0,4:0,5:2,6:0.666667,7:0,8:0,...,128:1,129:0,130:117,131:55115,132:7,133:2,134:0,135:0,136:0,0.52
1,0,10,1:1,2:0,3:1,4:3,5:3,6:0.333333,7:0,8:0.333333,...,128:0,129:0,130:153,131:3866,132:17,133:104,134:0,135:0,136:0,0.378
2,0,10,1:3,2:0,3:3,4:0,5:3,6:1,7:0,8:1,...,128:0,129:9,130:266,131:56137,132:5,133:2,134:0,135:0,136:0,0.36
3,0,10,1:3,2:0,3:2,4:0,5:3,6:1,7:0,8:0.666667,...,128:8,129:0,130:541,131:12621,132:11,133:11,134:0,135:0,136:0,0.376
4,0,10,1:3,2:0,3:3,4:0,5:3,6:1,7:0,8:1,...,128:6,129:0,130:14687,131:40205,132:5,133:3,134:0,135:0,136:0,0.392
5,1,10,1:3,2:0,3:3,4:1,5:3,6:1,7:0,8:1,...,128:1896,129:1,130:10577,131:34605,132:1,133:1,134:2175,135:273,136:79.6706653092874,0.298
6,1,10,1:3,2:0,3:3,4:1,5:3,6:1,7:0,8:1,...,128:0,129:4,130:3736,131:34605,132:1,133:1,134:0,135:0,136:0,0.302
7,0,10,1:3,2:0,3:3,4:0,5:3,6:1,7:0,8:1,...,128:0,129:1,130:11715,131:34605,132:31,133:1,134:0,135:3,136:16.2,0.321
8,0,10,1:0,2:0,3:0,4:0,5:0,6:0,7:0,8:0,...,128:7,129:0,130:11919,131:34605,132:84,133:1,134:0,135:0,136:0,0.441
9,0,10,1:2,2:0,3:1,4:0,5:2,6:0.666667,7:0,8:0.333333,...,128:0,129:11,130:63657,131:39496,132:10,133:1,134:0,135:218,136:55.0699459783914,0.419


In [16]:
# add final decider to df
sorted_df = vali_predict.sort_values(by=['qid','relevancy','highest'], ascending=[True, False, False])
sorted_df.head(10)

Unnamed: 0,relevancy,qid,1,2,3,4,5,6,7,8,...,128,129,130,131,132,133,134,135,136,highest
59,2,10,1:3,2:0,3:3,4:0,5:3,6:1,7:0,8:1,...,128:1896,129:1,130:21725,131:35216,132:1,133:1,134:5,135:0,136:0,0.293
46,1,10,1:3,2:0,3:3,4:0,5:3,6:1,7:0,8:1,...,128:6,129:0,130:41165,131:17836,132:13,133:4,134:0,135:0,136:0,0.341
19,1,10,1:3,2:3,3:3,4:0,5:3,6:1,7:1,8:1,...,128:10,129:0,130:719,131:11547,132:15,133:5,134:0,135:0,136:0,0.338
16,1,10,1:3,2:3,3:3,4:0,5:3,6:1,7:1,8:1,...,128:95,129:2,130:10111,131:48738,132:18,133:39,134:0,135:0,136:0,0.325
53,1,10,1:3,2:3,3:3,4:3,5:3,6:1,7:1,8:1,...,128:232,129:33,130:11882,131:65535,132:1,133:1,134:0,135:0,136:0,0.325
61,1,10,1:3,2:1,3:3,4:1,5:3,6:1,7:0.333333,8:1,...,128:1938,129:1,130:48201,131:35216,132:1,133:1,134:16,135:22,136:67.0933333333333,0.314
6,1,10,1:3,2:0,3:3,4:1,5:3,6:1,7:0,8:1,...,128:0,129:4,130:3736,131:34605,132:1,133:1,134:0,135:0,136:0,0.302
73,1,10,1:3,2:0,3:3,4:0,5:3,6:1,7:0,8:1,...,128:1883,129:1,130:48645,131:35168,132:1,133:1,134:2241,135:426,136:30.1554670902879,0.299
5,1,10,1:3,2:0,3:3,4:1,5:3,6:1,7:0,8:1,...,128:1896,129:1,130:10577,131:34605,132:1,133:1,134:2175,135:273,136:79.6706653092874,0.298
21,1,10,1:3,2:0,3:3,4:0,5:3,6:1,7:0,8:1,...,128:0,129:0,130:16198,131:34605,132:10,133:1,134:0,135:1,136:3.8,0.292


In [17]:
def dcg(predicted_rel, rank):
    predicted_rel = np.asarray(predicted_rel)[:rank]
    n_relevances = len(predicted_rel)
    if n_relevances == 0:
        return float(0)

    discounts = np.log2(np.arange(n_relevances) + 2)
    return np.sum(predicted_rel / discounts)


def ndcg(predicted_rel, rank):
    high_sc_dcg = dcg(sorted(predicted_rel, reverse=True), rank)
    if high_sc_dcg == 0:
        return float(0)

    return dcg(predicted_rel, rank) / high_sc_dcg


In [18]:
# lets test our final ranking results

# True Relevance DCG
dcg([0, 0, 1, 0, 1, 2, 1, 1, 0, 0], rank=10)

# Predicted Relevance DCG
dcg([0, 0, 1, 0, 3, 1, 1, 3, 0, 1], rank=10)

3.5855583988200541

In [19]:
# Accurate DCG
ndcg([0, 0, 1, 0, 1, 2, 1, 1, 0, 0], rank=10)


# Predicted DCG
ndcg([0, 0, 1, 0, 3, 1, 1, 3, 0, 1], rank=10)


0.54603583399465572

In [None]:
# Mean Avg Precision (MAP)
# Results from df output for first 10

tr = [0, 0, 1, 0, 1, 2, 1, 1, 0, 0]
pred = [0, 0, 1, 0, 3, 1, 1, 3, 0, 1]

def avg_pred(true, pred, rank=10):
    if (len(pred) > rank):
        pred = pred[:rank]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(pred):
        if p in true and p not in pred[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not true:
        return 0.0

    return score / min(len(true), rank)

def mean_avg_pred(true, pred, rank=10):

    return np.mean([avg_pred(true,pred,rank) for a,p in zip(true, pred)])



In [None]:
mean_avg_pred(tr, pred)

In [None]:
def ROC(label,result):
    %matplotlib inline
    from sklearn.utils import shuffle
    from sklearn.metrics import roc_curve, auc, precision_score, roc_auc_score
    from sklearn.preprocessing import label_binarize
    import pylab as pl
    import numpy as py
    Y = np.array(label)
    truth = label_binarize(label, classes=[0,1,2,3,4])
    n_classes = truth.shape[1]
    pred = label_binarize(result, classes=[0,1,2,3,4])
    # Compute ROC curve and ROC area for each class
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(truth[:, i], pred[:, i])
        roc_auc[i] = auc(fpr[i], tpr[i])
        print(roc_auc[i])
        
    # Plot of a ROC curve for a specific class
    pl.figure()
    pl.plot(fpr[2], tpr[2], label='ROC curve (area = %0.2f)' % roc_auc[2])
    pl.plot([0, 1], [0, 1], 'k--')
    pl.xlim([0.0, 1.0])
    pl.ylim([0.0, 1.05])
    pl.xlabel('FP')
    pl.ylabel('TP')
    pl.legend(loc="lower right")
    pl.show()

    for i in range(n_classes):
        pl.plot(fpr[i], tpr[i], label='ROC curve of class {0} (area = {1:0.2f})'
                                       ''.format(i, roc_auc[i]))

    pl.plot([0, 1], [0, 1], 'k--')
    pl.xlim([0.0, 1.0])
    pl.ylim([0.0, 1.05])
    pl.xlabel('FP')
    pl.ylabel('TP')
    pl.legend(loc="lower right")
    pl.show()


ROC(vali_df['relevancy'], vali_predict['relevancy'])