In [1]:
from sklearn import svm
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.utils.multiclass import unique_labels
from sklearn import svm, metrics
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
import scipy.sparse as sp
import numpy as np
import matplotlib.pyplot as plt
import generatevector
from preprocess.parse import getRootSuffix
from argparse import Namespace
from tqdm import tqdm
import time
import CrossValidation
import warnings

In [2]:
def svd(vectors, dim=10):
    """Reduce ``vectors`` to ``dim`` components with ``TruncatedSVD``.

    The sum of the explained-variance ratios of the retained components is
    printed and also returned.

    Returns
    -------
    tuple
        ``(reduced_vectors, cumulative_explained_variance)``.
    """
    # n_iter and random_state are pinned so repeated runs use the same settings.
    reducer = TruncatedSVD(n_components=dim, n_iter=10, random_state=2019)
    reduced = reducer.fit_transform(vectors)
    cumulative_ratio = reducer.explained_variance_ratio_.sum()
    print('Cumulated Explained Variance: {:.8f}'.format(cumulative_ratio))
    return reduced, cumulative_ratio

def splitvector(vectors, labels, uni, testuni):
    """Split vectors and labels into a held-out group and the remainder.

    Rows whose entry in ``uni`` equals ``testuni`` become the test set; every
    other row goes to the training set.

    Parameters
    ----------
    vectors : scipy sparse matrix or numpy.ndarray
        Feature matrix supporting row indexing with a list of integers.
    labels : sequence
        One label per row of ``vectors``.
    uni : sequence
        Group identifier for each row (compared to ``testuni`` with ``==``).
    testuni : object
        Identifier of the group to hold out.

    Returns
    -------
    train_vector : scipy.sparse.csr_matrix
        Rows not belonging to ``testuni``.
    train_label : numpy.ndarray
        Labels of the training rows.
    test_vector : same container type as ``vectors``
        Rows belonging to ``testuni``.
    test_label : numpy.ndarray
        Labels of the test rows.
    """
    idx = [i for i, x in enumerate(uni) if x == testuni]

    test_vector = vectors[idx]
    test_label = np.array(labels)[idx]

    # Positions of the rows that stay in the training set.
    keep = np.ones(vectors.shape[0], dtype=bool)
    keep[np.asarray(idx, dtype=np.intp)] = False
    train_idx = np.flatnonzero(keep)

    if sp.issparse(vectors):
        # Select rows on a CSR view instead of densifying the whole matrix
        # (the previous toarray()/np.delete round trip allocated a full dense
        # copy); this also accepts sparse formats other than CSR.
        train_vector = sp.csr_matrix(vectors)[train_idx]
    else:
        train_vector = sp.csr_matrix(np.asarray(vectors)[train_idx])
    train_label = np.delete(labels, idx)
    return train_vector, train_label, test_vector, test_label


def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    Print and plot the confusion matrix of ``y_pred`` against ``y_true``.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels.
    classes : sequence
        Labels passed to ``confusion_matrix`` as ``labels`` and used as the
        tick labels on both axes.
    normalize : bool, default False
        If True, each row is divided by its sum. Rows that sum to zero
        (a class with no true samples) are shown as 0.0 instead of NaN.
    title : str, optional
        Plot title; a default depending on ``normalize`` is used when empty.
    cmap : matplotlib colormap, default ``plt.cm.Blues``
        Colormap passed to ``imshow``.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the matrix was drawn on.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix over the full, fixed class list.
    cm = confusion_matrix(y_true, y_pred, labels=np.array(classes))
    if normalize:
        # Because the class list is fixed, a class may have no true samples;
        # guard the division so such rows stay 0.0 rather than becoming NaN.
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Annotate every cell; text colour flips at half the maximum so it stays
    # readable against the cell's background.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

def EvaluationModel(test_label, predict, classes):
    """Print summary metrics and show both confusion matrices.

    Prints accuracy plus weighted precision, recall and F1 of ``predict``
    against ``test_label``, then draws the raw and the row-normalized
    confusion matrix over ``classes`` and shows the figures.
    """
    accuracy = metrics.accuracy_score(test_label, predict)
    print('Accuracy: {:.6f}'.format(accuracy))

    # (output template, scoring function) pairs, all averaged with 'weighted'.
    weighted_scores = (
        ('Precision:{:.6f}', precision_score),
        ('Recall:{:.6f}', recall_score),
        ('F1_Score:{:.6f}', f1_score),
    )
    for template, scorer in weighted_scores:
        print(template.format(scorer(test_label, predict, average='weighted')))

    # Applies to the matrices printed by plot_confusion_matrix (global setting).
    np.set_printoptions(precision=2)

    # Raw counts first, then the normalized variant.
    plot_confusion_matrix(test_label, predict, classes=classes,
                          title='Confusion matrix, without normalization')
    plot_confusion_matrix(test_label, predict, classes=classes,
                          normalize=True,
                          title='Normalized confusion matrix')
    plt.show()
    
def exec_time(start, end):
    """Break the elapsed time between two timestamps into clock components.

    Parameters
    ----------
    start, end : float
        Timestamps in seconds (e.g. from ``time.time()``).

    Returns
    -------
    tuple of int
        ``(seconds, minutes, hours)`` — note the order.
    """
    # Round the total once, then split. Rounding each component separately
    # could yield 60 seconds (e.g. 59.6 s -> "00:00:60") instead of carrying
    # over into the minutes.
    total_seconds = int(round(end - start))
    minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(minutes, 60)
    return seconds, minutes, hours

def svmclassfier(train_vector, train_label, test_vector):
    """Fit a default ``svm.LinearSVC`` on the training data and return its
    predictions for ``test_vector``."""
    model = svm.LinearSVC()
    model.fit(train_vector, train_label)
    return model.predict(test_vector)


def lrclassifier(train_vector, train_label, test_vector):
    """Fit a default ``LogisticRegression`` on the training data and return
    its predictions for ``test_vector``."""
    model = LogisticRegression()
    model.fit(train_vector, train_label)
    return model.predict(test_vector)

In [3]:
# if __name__ == '__main__':
#     classes = ["course", "department", "faculty", "other", "project", "staff", "student"]
#     args = Namespace(
#         stop = False, 
#         stem = False, 
#         mime = False, 
#         digit = False, 
#         other = True
#     )
#     vectors, labels, uni, filename, features = generatevector.vectoriser('tfidf', args)
#     lr_clf = LogisticRegression()
#     label_t, label_p = CrossValidation.CrossValidation(lr_clf, vectors, labels, uni, classes, partial_p=False, cfsm=False)




Finished model LogisticRegression on cornell validation set
Execution Time: 00:00:17

Finished model LogisticRegression on texas validation set
Execution Time: 00:00:14

Finished model LogisticRegression on washington validation set
Execution Time: 00:00:14

Finished model LogisticRegression on wisconsin validation set
Execution Time: 00:00:13
Accuracy: 0.815503


  'precision', 'predicted', average, warn_for)


Precision:0.812223
Recall:0.815503


  'precision', 'predicted', average, warn_for)


F1_Score:0.811697


In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn import svm
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
import lightgbm as lgbm
import xgboost as xgb
# Compare several classifiers on the same TF-IDF vectors using the project's
# CrossValidation helper. Relies on `warnings`, `Namespace`, `generatevector`
# and `CrossValidation` imported in the first cell.

# NOTE(review): this silences every warning for the rest of the kernel
# session, including sklearn's undefined-metric warnings.
warnings.filterwarnings("ignore")
seed = 2019

# Each display name is kept next to its estimator so the two cannot drift
# apart. Every sklearn estimator that accepts `random_state` is seeded so
# reruns are reproducible.
models = [
    ("Logistic Regression", LogisticRegression(random_state=seed)),
    ("Nearest Neighbors", KNeighborsClassifier(n_neighbors=7)),
    ("svm.LinearSVC", svm.LinearSVC(random_state=seed)),
    ("Linear SVM", SVC(kernel="linear", probability=False, random_state=seed)),
    ("RBF SVM", SVC(kernel='rbf', probability=False, random_state=seed)),
    ("Decision Tree", DecisionTreeClassifier(max_depth=10, random_state=seed)),
    ("Random Forest", RandomForestClassifier(max_depth=10, n_estimators=50, random_state=seed)),
    ("LightGBM", lgbm.LGBMClassifier()),
    ("XgBoost", xgb.XGBClassifier()),
]
# Kept as separate lists for any later cell that refers to them.
names = [name for name, _ in models]
classifiers = [clf for _, clf in models]

classes = ["course", "department", "faculty", "other", "project", "staff", "student"]
args = Namespace(
        stop = False, 
        stem = False, 
        mime = False, 
        digit = False, 
        other = True
    )
vectors, labels, uni, filename, features = generatevector.vectoriser('tfidf', args)

# Per-model (label_t, label_p) pairs, keyed by display name, so earlier
# results are not lost when the loop moves on to the next classifier.
results = {}
for name, clf in models:
    print('='*100)
    # The two SVC variants share a class name, so print the display name too.
    print(name)
    print('='*100)
    label_t, label_p = CrossValidation.CrossValidation(clf, vectors, labels, uni, classes, partial_p=False, cfsm=False)
    results[name] = (label_t, label_p)


Finished model LogisticRegression on cornell validation set
Execution Time: 00:00:15

Finished model LogisticRegression on texas validation set
Execution Time: 00:00:15

Finished model LogisticRegression on washington validation set
Execution Time: 00:00:13

Finished model LogisticRegression on wisconsin validation set
Execution Time: 00:00:13
Accuracy: 0.815503
Precision:0.812223
Recall:0.815503
F1_Score:0.811697

Finished model KNeighborsClassifier on cornell validation set
Execution Time: 00:00:13

Finished model KNeighborsClassifier on texas validation set
Execution Time: 00:00:12

Finished model KNeighborsClassifier on washington validation set
Execution Time: 00:00:11

Finished model KNeighborsClassifier on wisconsin validation set
Execution Time: 00:00:11
Accuracy: 0.692859
Precision:0.700990
Recall:0.692859
F1_Score:0.688441

Finished model LinearSVC on cornell validation set
Execution Time: 00:00:12

Finished model LinearSVC on texas validation set
Execution Time: 00:00:14

F