In [1]:
import torch
from scipy import io
from tensorflow import keras
# from tsnecuda import TSNE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [3]:
embedding = np.load('/datasets/dsc180a-wi20-public/Malware/group_data/group_02/sensitive_data/processed/node2vec/graph_2.npy')
g = io.loadmat('/datasets/dsc180a-wi20-public/Malware/group_data/group_02/sensitive_data/interim/graph/graph_2.mat')
y = g['post_label'].reshape(-1,)

post_cate = g['post_cate']
subreddit_cate = post_cate[:, :-1]
community_cate = (~post_cate[:, -1].astype(bool)).astype(int)
cate = np.array(subreddit_cate.argmax(1)).reshape(-1,)

# Graph 2

In [7]:
def evaluate(clf, X_train, y_train, X_test, y_test):
    METRICS = [
        keras.metrics.TruePositives(name='tp'),
        keras.metrics.FalsePositives(name='fp'),
        keras.metrics.TrueNegatives(name='tn'),
        keras.metrics.FalseNegatives(name='fn'), 
        keras.metrics.BinaryAccuracy(name='accuracy'),
        keras.metrics.Precision(name='precision'),
        keras.metrics.Recall(name='recall'),
        keras.metrics.AUC(name='auc'),
    ]   
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    res = {}
    for i in METRICS:
        res[i.name] = i(y_test, y_pred).numpy()
    return res

# Node2vec

## Hatefulpost detection

In [8]:
neg, pos = np.bincount(y)
total = neg + pos
print('Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
    total, pos, 100 * pos / total))
weight_for_0 = (1 / neg)*(total)/2.0 
weight_for_1 = (1 / pos)*(total)/2.0
class_weight = {0: weight_for_0, 1: weight_for_1}
print(class_weight)
X_train, X_test, y_train, y_test = train_test_split(embedding, y, test_size=0.2)
clfs = [
        LogisticRegression(
        solver='lbfgs',verbose=False, max_iter=2000, class_weight = class_weight, n_jobs=8
        ),
        RandomForestClassifier(class_weight=class_weight, n_jobs=8)
    ]
res = {}
for clf in clfs:
    res[clf.__class__.__name__] = evaluate(clf, X_train, y_train, X_test, y_test)
res = pd.DataFrame(res).T
display(res)

Examples:
    Total: 103413
    Positive: 4409 (4.26% of total)

{0: 0.5222667770999152, 1: 11.72748922658199}


Unnamed: 0,tp,fp,tn,fn,accuracy,precision,recall,auc
LogisticRegression,549.0,7851.0,11951.0,332.0,0.604361,0.065357,0.623156,0.61334
RandomForestClassifier,0.0,0.0,19802.0,881.0,0.957405,0.0,0.0,0.5


## community: controversial vs. normal

In [9]:
neg, pos = np.bincount(community_cate)
total = neg + pos
print('Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
    total, pos, 100 * pos / total))
weight_for_0 = (1 / neg)*(total)/2.0 
weight_for_1 = (1 / pos)*(total)/2.0
class_weight = {0: weight_for_0, 1: weight_for_1}
print(class_weight)
X_train, X_test, y_train, y_test = train_test_split(embedding, community_cate, test_size=0.2)
clfs = [
        LogisticRegression(
        solver='lbfgs',verbose=False, max_iter=2000, class_weight = class_weight, n_jobs=8
        ),
        RandomForestClassifier(class_weight=class_weight, n_jobs=8)
    ]
res = {}
for clf in clfs:
    res[clf.__class__.__name__] = evaluate(clf, X_train, y_train, X_test, y_test)
res = pd.DataFrame(res).T
display(res)

Examples:
    Total: 103413
    Positive: 80225 (77.58% of total)

{0: 2.22988183543212, 1: 0.644518541601745}


Unnamed: 0,tp,fp,tn,fn,accuracy,precision,recall,auc
LogisticRegression,14057.0,717.0,3997.0,1912.0,0.872891,0.951469,0.880268,0.864084
RandomForestClassifier,15931.0,2256.0,2458.0,38.0,0.889088,0.875955,0.99762,0.759523


## community: subreddits

In [10]:
X_train, X_test, y_train, y_test = train_test_split(embedding, cate, test_size=0.2)
clfs = [
        LogisticRegression(
        verbose=False, max_iter=2000, class_weight = 'balanced', n_jobs=8
        ),
#         LinearSVC(class_weight=class_weight),
        RandomForestClassifier(class_weight='balanced', n_jobs=8)
    ]
res = {}
for clf in clfs:
    clf.fit(X_train, y_train)
    res[clf.__class__.__name__] = clf.score(X_test, y_test)
res

{'LogisticRegression': 0.8407387709713291,
 'RandomForestClassifier': 0.8071846443939468}

# metapath2vec

## Hateful post detection

In [11]:
embedding = np.load('/datasets/dsc180a-wi20-public/Malware/group_data/group_02/sensitive_data/processed/metapath2vec/graph_2.npy')

In [12]:
neg, pos = np.bincount(y)
total = neg + pos
print('Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
    total, pos, 100 * pos / total))
weight_for_0 = (1 / neg)*(total)/2.0 
weight_for_1 = (1 / pos)*(total)/2.0
class_weight = {0: weight_for_0, 1: weight_for_1}
print(class_weight)
X_train, X_test, y_train, y_test = train_test_split(embedding, y, test_size=0.2)
clfs = [
        LogisticRegression(
        solver='lbfgs',verbose=False, max_iter=2000, class_weight = class_weight, n_jobs=8
        ),
        RandomForestClassifier(class_weight=class_weight, n_jobs=8)
    ]
res = {}
for clf in clfs:
    res[clf.__class__.__name__] = evaluate(clf, X_train, y_train, X_test, y_test)
res = pd.DataFrame(res).T
display(res)

Examples:
    Total: 103413
    Positive: 4409 (4.26% of total)

{0: 0.5222667770999152, 1: 11.72748922658199}


Unnamed: 0,tp,fp,tn,fn,accuracy,precision,recall,auc
LogisticRegression,517.0,8220.0,11558.0,388.0,0.583813,0.059174,0.571271,0.577829
RandomForestClassifier,0.0,0.0,19778.0,905.0,0.956244,0.0,0.0,0.5


## Community: controversial vs. normal detection

In [13]:
neg, pos = np.bincount(community_cate)
total = neg + pos
print('Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
    total, pos, 100 * pos / total))
weight_for_0 = (1 / neg)*(total)/2.0 
weight_for_1 = (1 / pos)*(total)/2.0
class_weight = {0: weight_for_0, 1: weight_for_1}
print(class_weight)
X_train, X_test, y_train, y_test = train_test_split(embedding, community_cate, test_size=0.2)
clfs = [
        LogisticRegression(
        solver='lbfgs',verbose=False, max_iter=2000, class_weight = class_weight, n_jobs=8
        ),
        RandomForestClassifier(class_weight=class_weight, n_jobs=8)
    ]
res = {}
for clf in clfs:
    res[clf.__class__.__name__] = evaluate(clf, X_train, y_train, X_test, y_test)
res = pd.DataFrame(res).T
display(res)

Examples:
    Total: 103413
    Positive: 80225 (77.58% of total)

{0: 2.22988183543212, 1: 0.644518541601745}


Unnamed: 0,tp,fp,tn,fn,accuracy,precision,recall,auc
LogisticRegression,12470.0,1280.0,3389.0,3544.0,0.766765,0.906909,0.778694,0.752273
RandomForestClassifier,15971.0,3339.0,1330.0,43.0,0.836484,0.827084,0.997315,0.641086


## Community: subreddits detection

In [14]:
X_train, X_test, y_train, y_test = train_test_split(embedding, cate, test_size=0.2)
clfs = [
        LogisticRegression(
        verbose=False, max_iter=2000, class_weight = 'balanced', n_jobs=8
        ),
#         LinearSVC(class_weight=class_weight),
        RandomForestClassifier(class_weight='balanced', n_jobs=8)
    ]
res = {}
for clf in clfs:
    clf.fit(X_train, y_train)
    res[clf.__class__.__name__] = clf.score(X_test, y_test)
res

{'LogisticRegression': 0.6431852245805734,
 'RandomForestClassifier': 0.598027365469226}

# Infomax

## Hatefulpost detection

In [16]:
embedding = np.load('/datasets/dsc180a-wi20-public/Malware/group_data/group_02/sensitive_data/processed/infomax/graph_2.npy')
neg, pos = np.bincount(y)
total = neg + pos
print('Examples:\n    Total: {}\n    Positive: {} ({:.2f}% of total)\n'.format(
    total, pos, 100 * pos / total))
weight_for_0 = (1 / neg)*(total)/2.0 
weight_for_1 = (1 / pos)*(total)/2.0
class_weight = {0: weight_for_0, 1: weight_for_1}
print(class_weight)
X_train, X_test, y_train, y_test = train_test_split(embedding, y, test_size=0.2)
clfs = [
        LogisticRegression(
        solver='lbfgs',verbose=False, max_iter=2000, class_weight = class_weight, n_jobs=8
        ),
        RandomForestClassifier(class_weight=class_weight, n_jobs=8)
    ]
res = {}
for clf in clfs:
    res[clf.__class__.__name__] = evaluate(clf, X_train, y_train, X_test, y_test)
res = pd.DataFrame(res).T
display(res)

Examples:
    Total: 103413
    Positive: 4409 (4.26% of total)

{0: 0.5222667770999152, 1: 11.72748922658199}


Unnamed: 0,tp,fp,tn,fn,accuracy,precision,recall,auc
LogisticRegression,583.0,7124.0,12691.0,285.0,0.641783,0.075646,0.671659,0.656067
RandomForestClassifier,423.0,5043.0,14772.0,445.0,0.734661,0.077387,0.487327,0.616412
