In [95]:
import sys
from scipy import io
import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from torch_geometric.utils import from_scipy_sparse_matrix
from sklearn.preprocessing import StandardScaler
import os.path as osp
import os
from tsnecuda import TSNE
import json
import pandas as pd
import matplotlib.pyplot as plt
sys.path.append('../')
from src.models import *

In [67]:
fp = '/datasets/dsc180a-wi20-public/Malware/group_data/group_02/sensitive_data'
g1 = io.loadmat(osp.join(fp, 'interim', 'graph', 'graph_1.mat'))
g2 = io.loadmat(osp.join(fp, 'interim', 'graph', 'graph_2.mat'))
post_indx = io.loadmat(osp.join(fp, 'interim', 'graph', 'graph_1.mat'))['post_indx'].reshape(-1,)

y = g1['post_label'].reshape(-1,)

In [96]:
g1_feature = np.hstack((np.array(g1['P'].sum(1)), np.array(g1['A'].T.sum(1))))
g1_feature = StandardScaler().fit_transform(g1_feature)
g2_feature = np.array(g2['P'].sum(1))
g2_feature = StandardScaler().fit_transform(g2_feature)
baseline_feature = get_baseline_feature(fp)

In [144]:
g1_node2vec = np.load(osp.join(fp, 'processed', 'node2vec', 'graph_1.npy'))
g1_infomax = np.load(osp.join(fp, 'processed', 'infomax', 'graph_1.npy'))
g1_metapath2vec =  np.load(osp.join(fp, 'processed', 'metapath2vec', 'graph_1.npy'))

g2_node2vec = np.load(osp.join(fp, 'processed', 'node2vec', 'graph_2.npy'))
g2_infomax = np.load(osp.join(fp, 'processed', 'infomax', 'graph_2.npy'))
g2_metapath2vec =  np.load(osp.join(fp, 'processed', 'metapath2vec', 'graph_2.npy'))

In [109]:
def evaluate_emb(X_train, X_test, y_train, y_test, clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    y_true = y_test.copy()
    tn, fp, fn, tp = metrics.confusion_matrix(y_true, y_pred).ravel()
    acc = metrics.accuracy_score(y_true, y_pred)
    fpr, tpr, thresholds = metrics.roc_curve(y_true, y_pred, pos_label = 1)
    auc = metrics.auc(fpr, tpr)
    precision = metrics.precision_score(y_true, y_pred)
    recall = metrics.recall_score(y_true, y_pred)
    return tn, fp, fn, tp, acc, auc, precision, recall

In [134]:
def evaluate_with_baseline(train_mask, test_mask, feature_1, feature_2, feature_3, y, clfs):
    if isinstance(feature_3, type(None)):
        X = np.hstack((feature_1, feature_2))
    else:
        X = np.hstack((feature_1, feature_2, feature_3))
    X_train, X_test, y_train, y_test = X[train_mask, :], X[test_mask, :], y[train_mask], y[test_mask]
    res = {}
    for clf in clfs:
        res[clf.__class__.__name__] = list(evaluate_emb(X_train, X_test, y_train, y_test, clf))
    res = pd.DataFrame(res).T
    res.columns = ['tn', 'fp', 'fn', 'tp', 'acc', 'auc', 'precision', 'recall']
    return res

In [135]:
train_mask, test_mask = train_test_split(np.arange(len(y_hateful)), test_size=0.2)

In [136]:
clfs = [LogisticRegression(
        verbose=False, max_iter=2000, class_weight = 'balanced', n_jobs=8, solver = 'lbfgs'),
        RandomForestClassifier(class_weight = 'balanced', n_jobs=8)]

g1 node2vec

In [139]:
res = evaluate_with_baseline(train_mask, test_mask, g1_feature, g1_node2vec, baseline_feature, y, clfs)
display(res)

Unnamed: 0,tn,fp,fn,tp,acc,auc,precision,recall
LogisticRegression,15561.0,4229.0,268.0,625.0,0.782575,0.743097,0.12876,0.699888
RandomForestClassifier,19767.0,23.0,849.0,44.0,0.95784,0.524055,0.656716,0.049272


g1 infomax

In [141]:
res = evaluate_with_baseline(train_mask, test_mask, g1_feature, g1_infomax, None, y, clfs)
display(res)

Unnamed: 0,tn,fp,fn,tp,acc,auc,precision,recall
LogisticRegression,15731.0,4059.0,269.0,624.0,0.790746,0.746832,0.133248,0.698768
RandomForestClassifier,19477.0,313.0,761.0,132.0,0.948073,0.566,0.296629,0.147816


g1 metapath2vec

In [143]:
res = evaluate_with_baseline(train_mask, test_mask, g1_feature, g1_metapath2vec, baseline_feature, y, clfs)
display(res)

Unnamed: 0,tn,fp,fn,tp,acc,auc,precision,recall
LogisticRegression,15575.0,4215.0,275.0,618.0,0.782914,0.739531,0.127871,0.692049
RandomForestClassifier,19763.0,27.0,856.0,37.0,0.957308,0.520035,0.578125,0.041433


g2 node2vec

In [160]:
res = evaluate_with_baseline(train_mask, test_mask, g2_feature, g2_node2vec, baseline_feature, y, clfs)
display(res)

Unnamed: 0,tn,fp,fn,tp,acc,auc,precision,recall
LogisticRegression,15494.0,4296.0,263.0,630.0,0.779577,0.744204,0.127893,0.705487
RandomForestClassifier,19773.0,17.0,848.0,45.0,0.958178,0.524766,0.725806,0.050392


g2 infomax

In [161]:
res = evaluate_with_baseline(train_mask, test_mask, g2_feature, g2_infomax, None, y, clfs)
display(res)

Unnamed: 0,tn,fp,fn,tp,acc,auc,precision,recall
LogisticRegression,15191.0,4599.0,230.0,663.0,0.766523,0.755026,0.125998,0.742441
RandomForestClassifier,19175.0,615.0,727.0,166.0,0.935116,0.577407,0.212548,0.18589


g2 metapath2vec

In [162]:
res = evaluate_with_baseline(train_mask, test_mask, g2_feature, g2_metapath2vec, baseline_feature, y, clfs)
display(res)

Unnamed: 0,tn,fp,fn,tp,acc,auc,precision,recall
LogisticRegression,15489.0,4301.0,267.0,626.0,0.779142,0.741838,0.127055,0.701008
RandomForestClassifier,19760.0,30.0,841.0,52.0,0.957888,0.528357,0.634146,0.058231
