In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the per-project count table used for multilabel classification and
# preview its first rows. The CSV carries a saved index column
# ("Unnamed: 0"), a project_name column, and '#'-prefixed count columns.
df = pd.read_csv(
    filepath_or_buffer="../data/classification/count_dataset_for_multilabel.csv",
)
df.head(n=5)

Unnamed: 0,project_name,package#A,package#ACGAN,package#APDrawing,package#APDrawingGAN,package#APDrawingGAN.data,package#APDrawingGAN.data.base_data_loader,package#APDrawingGAN.data.base_dataset,package#APDrawingGAN.data.face_landmark,package#APDrawingGAN.data.image_folder,...,topic#vim,topic#virtual-reality,topic#vue,topic#wagtail,topic#web-components,topic#webapp,topic#webpack,topic#windows,topic#wordpress,topic#xml
0,01joy#news-search-engine,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,05bit#peewee-async,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0k#shyaml,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0x00-0x00#ShellPop,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0xAX#linux-insides,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

In [26]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier

def get_forest_clf(X_train, y_train):
    """Fit a one-vs-rest random-forest classifier and return it.

    Parameters
    ----------
    X_train : feature matrix passed straight to ``fit``.
    y_train : target(s) passed straight to ``fit``.

    Returns
    -------
    The fitted ``OneVsRestClassifier`` wrapping a ``RandomForestClassifier``
    configured with 10 trees, depth limit 10, at least 5 samples per leaf,
    Gini impurity and a fixed seed of 42.
    """
    base_forest = RandomForestClassifier(
        n_estimators=10,
        max_depth=10,
        min_samples_leaf=5,
        criterion='gini',
        random_state=42,
    )
    one_vs_rest = OneVsRestClassifier(base_forest)
    one_vs_rest.fit(X_train, y_train)
    return one_vs_rest

def get_xgboost_clf(X_train, y_train, X_test, y_test):
    """Fit a binary XGBoost classifier with fixed hyper-parameters.

    Parameters
    ----------
    X_train, y_train : training features and binary labels.
    X_test, y_test : held-out data handed to ``fit`` as its ``eval_set``.
        No early-stopping argument is passed in this function.

    Returns
    -------
    The fitted ``XGBClassifier``.
    """
    hyperparams = dict(
        objective="binary:logistic",
        n_estimators=100,
        learning_rate=0.05,
        max_depth=5,
        min_child_weight=5.0,
        gamma=1,
        subsample=0.9,
        colsample_bytree=0.9,
        random_state=42,
    )
    model = XGBClassifier(**hyperparams)

    held_out = [(X_test, y_test)]
    model.fit(X_train, y_train, eval_set=held_out, verbose=False)

    return model

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve


def get_youden_threshold(model, X, y_true):
    """Select the threshold maximising Youden's J and plot the ROC curve.

    Youden's J statistic is ``TPR - FPR``; the operating point with the
    largest J is the ROC point furthest *above* the chance diagonal.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``; column 1 of its
        output is used as the positive-class score.
    X : features to score.
    y_true : binary ground-truth labels for ``X`` (positive label is 1).

    Returns
    -------
    The entry of ``roc_curve``'s ``thresholds`` array at the best ROC point.
    A sample is positive at this operating point when ``score >= threshold``.
    NOTE(review): if no point lies above the diagonal, index 0 is selected,
    which scikit-learn documents as a sentinel above every score - confirm
    that "predict nothing positive" is the desired fallback.

    Side effects
    ------------
    Draws and shows a 5x5 inch ROC figure with the selected point marked.
    """
    y_predict_proba = model.predict_proba(X)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_true, y_predict_proba, pos_label=1)
    auc = roc_auc_score(y_true, y_predict_proba)

    # Use the signed difference: taking the absolute value would also reward
    # points far *below* the diagonal, i.e. the worst operating points of a
    # classifier that is worse than chance.
    youden_idx = np.argmax(tpr - fpr)
    youden_threshold = thresholds[youden_idx]

    fig, ax = plt.subplots(figsize=(5, 5))
    ax.plot(fpr, tpr, color="red", label="ROC curve")
    ax.plot(fpr[youden_idx], tpr[youden_idx], marker="o", color="navy", ms=10,
            label=f"Youden Threshold={youden_threshold:.2f}\nAUC={auc:.3f}")
    # Chance diagonal for reference.
    ax.plot([0, 1], [0, 1], color="black", ls="--")
    ax.set_xlim([-0.01, 1.01])
    ax.set_ylim([-0.01, 1.01])
    ax.set_xlabel('FPR', fontsize=12)
    ax.set_ylabel('TPR', fontsize=12)
    ax.tick_params(axis='both', which='major', labelsize=12)
    ax.legend(prop={'size': 12}, loc=4)
    plt.show()

    return youden_threshold


def get_threshold(model, X, y_true):
    """Select the threshold where precision and recall are closest, and plot.

    Parameters
    ----------
    model : fitted classifier exposing ``predict_proba``; column 1 of its
        output is used as the positive-class score.
    X : features to score.
    y_true : binary ground-truth labels for ``X``.

    Returns
    -------
    The entry of ``precision_recall_curve``'s ``thresholds`` array at which
    ``|precision - recall|`` is smallest (first such entry on ties).

    Side effects
    ------------
    Draws and shows a 5x5 inch figure of precision, recall, F1 and queue
    rate (fraction of samples scored at or above the threshold) against the
    threshold, with the selected threshold marked by a dashed line.
    """
    y_predict_proba = model.predict_proba(X)[:, 1]
    precision, recall, thresholds = precision_recall_curve(y_true, y_predict_proba)

    # precision/recall have one more element than thresholds: element i
    # belongs to thresholds[i] and the *last* element is the fixed
    # (precision=1, recall=0) end point with no threshold. Drop that last
    # element so every array lines up with `thresholds` and the selected
    # index is always valid.
    precision, recall = precision[:-1], recall[:-1]

    idx = np.argmin(np.abs(precision - recall))
    threshold = thresholds[idx]

    # F1 is defined as 0 where precision + recall == 0 (avoids 0/0 -> NaN).
    pr_sum = precision + recall
    f1 = np.divide(2 * precision * recall, pr_sum,
                   out=np.zeros_like(pr_sum), where=pr_sum > 0)

    # Queue rate = share of samples with score >= threshold. One sort plus a
    # binary search per threshold replaces a full scan per threshold.
    sorted_scores = np.sort(y_predict_proba)
    below = np.searchsorted(sorted_scores, thresholds, side="left")
    queue_rate = 1.0 - below / sorted_scores.size

    fig, ax = plt.subplots(figsize=(5, 5))
    ax.plot(thresholds, precision, label="Precision")
    ax.plot(thresholds, recall, label="Recall")
    ax.plot(thresholds, f1, label="F1-Score")
    ax.plot(thresholds, queue_rate, label="Queue Rate")
    ax.legend(loc=0)
    ax.set_xlim([0.025, threshold + 0.2])
    ax.set_xlabel("Threshold", fontsize=12)
    ax.set_ylabel("Score", fontsize=12)
    ax.axvline(threshold, color="k", ls="--")
    ax.set_title(F"Threshold={threshold:.3f}", fontsize=12)
    plt.show()

    return threshold

In [None]:
def evaluate(X_train, X_test, y_train, y_test, features):
    """Train one binary XGBoost model per label and report it at two thresholds.

    For every label in ``features`` a classifier is fitted with
    ``get_xgboost_clf``; a decision threshold is then chosen twice - by
    ``get_youden_threshold`` and by ``get_threshold`` - and a
    ``classification_report`` is printed for each choice.

    Parameters
    ----------
    X_train, X_test : feature matrices.
    y_train, y_test : label tables indexable by label name (``y[f].values``).
    features : iterable of label names to evaluate.

    Returns
    -------
    None. Output is printed, and each threshold helper shows a figure.

    Notes
    -----
    The thresholds are selected on the same ``X_test``/``y_test`` that the
    reports are computed on, so the reported scores are optimistic.
    """
    threshold_pickers = (
        ("Youden's J", get_youden_threshold),
        ("precision = recall", get_threshold),
    )

    for f in features:
        y_train_unit, y_test_unit = y_train[f].values, y_test[f].values
        clf = get_xgboost_clf(X_train, y_train_unit, X_test, y_test_unit)

        # The fitted model's scores do not change between the two reports,
        # so compute the positive-class probabilities once.
        positive_proba = clf.predict_proba(X_test)[:, 1]

        for method, pick_threshold in threshold_pickers:
            threshold = pick_threshold(clf, X_test, y_test_unit)

            # roc_curve / precision_recall_curve count a sample as positive
            # when score >= threshold, so use the same inclusive comparison;
            # a strict '>' would drop the sample sitting on the threshold.
            y_pred = (positive_proba >= threshold).astype(int)

            print('================', f, '================')
            print(f"threshold ({method}): {threshold:.3f}")
            print(classification_report(y_test_unit, y_pred, labels=[0, 1]))

In [None]:
# Feature columns: every column whose name begins with 'ext' or 'package'.
X_columns = [name for name in df.columns if name.startswith(('ext', 'package'))]
# X_columns

In [None]:
# Label columns: every column whose name begins with 'tag' or 'topic'.
y_columns = [name for name in df.columns if name.startswith(('tag', 'topic'))]
# y_columns

In [None]:
# Feature matrix as a plain NumPy array (all rows, feature columns only).
X = df.loc[:, X_columns].to_numpy()
X.shape

In [None]:
# Label table kept as a DataFrame so individual labels can be looked up by name.
y = df.loc[:, y_columns]
y.shape

In [None]:
from sklearn.model_selection import train_test_split

# 70% of the rows go to training, the rest to test; the seed is fixed at 42.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=42,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

## Tags prediction

In [None]:
# Keep only 'tag' labels whose column sum exceeds 50.
tag_columns = [name for name in y.columns if name.startswith('tag')]
tag_totals = y[tag_columns].sum()
tags = tag_totals[tag_totals > 50].index.tolist()
print(len(tags))
# tags

In [None]:
# Train and report one model per selected tag label.
evaluate(X_train, X_test, y_train, y_test, features=tags)