In [1]:
import torch
from scipy import io
from tensorflow import keras
# from tsnecuda import TSNE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [15]:
embedding = np.load('/datasets/dsc180a-wi20-public/Malware/group_data/group_02/sensitive_data/processed/node2vec/graph_1.npy')
g = io.loadmat('/datasets/dsc180a-wi20-public/Malware/group_data/group_02/sensitive_data/interim/graph/graph_1.mat')
y = g['post_label'].reshape(-1,)

post_cate = g['post_cate']
subreddit_cate = post_cate[:, :-1]
community_cate = (~post_cate[:, -1].astype(bool)).astype(int)
cate = np.array(subreddit_cate.argmax(1)).reshape(-1,)

# Graph1

In [4]:
def evaluate(clf, X_train, y_train, X_test, y_test):
    METRICS = [
        keras.metrics.TruePositives(name='tp'),
        keras.metrics.FalsePositives(name='fp'),
        keras.metrics.TrueNegatives(name='tn'),
        keras.metrics.FalseNegatives(name='fn'), 
        keras.metrics.BinaryAccuracy(name='accuracy'),
        keras.metrics.Precision(name='precision'),
        keras.metrics.Recall(name='recall'),
        keras.metrics.AUC(name='auc'),
    ]   
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    res = {}
    for i in METRICS:
        res[i.name] = i(y_test, y_pred).numpy()
    return res

# Node2vec

## Hatefulpost detection

In [5]:
neg, pos = np.bincount(y)
total = neg + pos
print('Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
    total, pos, 100 * pos / total))
weight_for_0 = (1 / neg)*(total)/2.0 
weight_for_1 = (1 / pos)*(total)/2.0
class_weight = {0: weight_for_0, 1: weight_for_1}
print(class_weight)
X_train, X_test, y_train, y_test = train_test_split(embedding, y, test_size=0.2)
clfs = [
        LogisticRegression(
        solver='lbfgs',verbose=False, max_iter=2000, class_weight = class_weight, n_jobs=8
        ),
        RandomForestClassifier(class_weight=class_weight, n_jobs=8)
    ]
res = {}
for clf in clfs:
    res[clf.__class__.__name__] = evaluate(clf, X_train, y_train, X_test, y_test)
res = pd.DataFrame(res).T
display(res)

Examples:
    Total: 103413
    Positive: 4409 (4.26% of total)

{0: 0.5222667770999152, 1: 11.72748922658199}


Unnamed: 0,tp,fp,tn,fn,accuracy,precision,recall,auc
LogisticRegression,488.0,7956.0,11876.0,363.0,0.597786,0.057793,0.573443,0.586137
RandomForestClassifier,0.0,0.0,19832.0,851.0,0.958855,0.0,0.0,0.5


## community: controversial vs. normal

In [17]:
neg, pos = np.bincount(community_cate)
total = neg + pos
print('Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
    total, pos, 100 * pos / total))
weight_for_0 = (1 / neg)*(total)/2.0 
weight_for_1 = (1 / pos)*(total)/2.0
class_weight = {0: weight_for_0, 1: weight_for_1}
print(class_weight)
X_train, X_test, y_train, y_test = train_test_split(embedding, community_cate, test_size=0.2)
clfs = [
        LogisticRegression(
        solver='lbfgs',verbose=False, max_iter=2000, class_weight = class_weight, n_jobs=8
        ),
        RandomForestClassifier(class_weight=class_weight, n_jobs=8)
    ]
res = {}
for clf in clfs:
    res[clf.__class__.__name__] = evaluate(clf, X_train, y_train, X_test, y_test)
res = pd.DataFrame(res).T
display(res)

Examples:
    Total: 103413
    Positive: 80225 (77.58% of total)

{0: 2.22988183543212, 1: 0.644518541601745}


Unnamed: 0,tp,fp,tn,fn,accuracy,precision,recall,auc
LogisticRegression,13318.0,917.0,3767.0,2681.0,0.826041,0.935581,0.832427,0.818327
RandomForestClassifier,15962.0,2929.0,1755.0,37.0,0.856597,0.844953,0.997687,0.686184


## community: subreddits

In [7]:
X_train, X_test, y_train, y_test = train_test_split(embedding, cate, test_size=0.2)
clfs = [
        LogisticRegression(
        verbose=False, max_iter=2000, class_weight = 'balanced', n_jobs=8
        ),
#         LinearSVC(class_weight=class_weight),
        RandomForestClassifier(class_weight='balanced', n_jobs=8)
    ]
res = {}
for clf in clfs:
    clf.fit(X_train, y_train)
    res[clf.__class__.__name__] = clf.score(X_test, y_test)
res

{'LogisticRegression': 0.8065077599961321,
 'RandomForestClassifier': 0.7484407484407485}

# metapath2vec

## Hateful post detection

In [None]:
embedding = np.load('/datasets/dsc180a-wi20-public/Malware/group_data/group_02/sensitive_data/processed/metapath2vec/graph_1.npy')

In [23]:
neg, pos = np.bincount(y)
total = neg + pos
print('Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
    total, pos, 100 * pos / total))
weight_for_0 = (1 / neg)*(total)/2.0 
weight_for_1 = (1 / pos)*(total)/2.0
class_weight = {0: weight_for_0, 1: weight_for_1}
print(class_weight)
X_train, X_test, y_train, y_test = train_test_split(embedding, y, test_size=0.2)
clfs = [
        LogisticRegression(
        solver='lbfgs',verbose=False, max_iter=2000, class_weight = class_weight, n_jobs=8
        ),
        RandomForestClassifier(class_weight=class_weight, n_jobs=8)
    ]
res = {}
for clf in clfs:
    res[clf.__class__.__name__] = evaluate(clf, X_train, y_train, X_test, y_test)
res = pd.DataFrame(res).T
display(res)

Examples:
    Total: 103413
    Positive: 4409 (4.26% of total)

{0: 0.5222667770999152, 1: 11.72748922658199}


Unnamed: 0,tp,fp,tn,fn,accuracy,precision,recall,auc
LogisticRegression,488.0,7750.0,12055.0,390.0,0.60644,0.059238,0.555809,0.582247
RandomForestClassifier,0.0,0.0,19805.0,878.0,0.95755,0.0,0.0,0.5


## Community: controversial vs. normal detection

In [24]:
neg, pos = np.bincount(community_cate)
total = neg + pos
print('Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
    total, pos, 100 * pos / total))
weight_for_0 = (1 / neg)*(total)/2.0 
weight_for_1 = (1 / pos)*(total)/2.0
class_weight = {0: weight_for_0, 1: weight_for_1}
print(class_weight)
X_train, X_test, y_train, y_test = train_test_split(embedding, community_cate, test_size=0.2)
clfs = [
        LogisticRegression(
        solver='lbfgs',verbose=False, max_iter=2000, class_weight = class_weight, n_jobs=8
        ),
        RandomForestClassifier(class_weight=class_weight, n_jobs=8)
    ]
res = {}
for clf in clfs:
    res[clf.__class__.__name__] = evaluate(clf, X_train, y_train, X_test, y_test)
res = pd.DataFrame(res).T
display(res)

Examples:
    Total: 103413
    Positive: 80225 (77.58% of total)

{0: 2.22988183543212, 1: 0.644518541601745}


Unnamed: 0,tp,fp,tn,fn,accuracy,precision,recall,auc
LogisticRegression,12658.0,1308.0,3396.0,3321.0,0.776193,0.906344,0.792165,0.757052
RandomForestClassifier,15915.0,3360.0,1344.0,64.0,0.834453,0.825681,0.995995,0.640854


## Community: subreddits detection

In [25]:
X_train, X_test, y_train, y_test = train_test_split(embedding, cate, test_size=0.2)
clfs = [
        LogisticRegression(
        verbose=False, max_iter=2000, class_weight = 'balanced', n_jobs=8
        ),
#         LinearSVC(class_weight=class_weight),
        RandomForestClassifier(class_weight='balanced', n_jobs=8)
    ]
res = {}
for clf in clfs:
    clf.fit(X_train, y_train)
    res[clf.__class__.__name__] = clf.score(X_test, y_test)
res

{'LogisticRegression': 0.7048300536672629,
 'RandomForestClassifier': 0.6225885993327854}

# Infomax

## Hatefulpost detection

In [19]:
embedding = np.load('/datasets/dsc180a-wi20-public/Malware/group_data/group_02/sensitive_data/processed/infomax/graph_1.npy')
neg, pos = np.bincount(y)
total = neg + pos
print('Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
    total, pos, 100 * pos / total))
weight_for_0 = (1 / neg)*(total)/2.0 
weight_for_1 = (1 / pos)*(total)/2.0
class_weight = {0: weight_for_0, 1: weight_for_1}
print(class_weight)
X_train, X_test, y_train, y_test = train_test_split(embedding, y, test_size=0.2)
clfs = [
        LogisticRegression(
        solver='lbfgs',verbose=False, max_iter=2000, class_weight = class_weight, n_jobs=8
        ),
        RandomForestClassifier(class_weight=class_weight, n_jobs=8)
    ]
res = {}
for clf in clfs:
    res[clf.__class__.__name__] = evaluate(clf, X_train, y_train, X_test, y_test)
res = pd.DataFrame(res).T
display(res)

Examples:
    Total: 103413
    Positive: 4409 (4.26% of total)

{0: 0.5222667770999152, 1: 11.72748922658199}


Unnamed: 0,tp,fp,tn,fn,accuracy,precision,recall,auc
LogisticRegression,540.0,6010.0,13796.0,337.0,0.69313,0.082443,0.615735,0.656146
RandomForestClassifier,235.0,2288.0,17518.0,642.0,0.858338,0.093143,0.267959,0.576219
