In [12]:
import cPickle as pickle
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [1]:
# YOU MUST SET THIS TO THE ROOT DATA DIRECTORY FROM THE DOWNLOADED ZIP FILE
# The loading cells below append "/detailed_data/..." and
# "/preprocessed_{train,val,test}_data.pkl" to this value, so it has to be the
# directory that directly contains those entries.
DATA_DIR = "/dfs/scratch0/reddit/conflict/prediction/"

## Data loading and pre-processing....

In [4]:
# loading handcrafted features
# Each non-empty line is split on whitespace: field 0 is the example id,
# field 1 is the class name ("burst" is the positive class) and the last field
# is a comma-separated list of floats.
meta_features = {}  # example id -> 1-D float array of handcrafted features
meta_labels = {}    # example id -> 1 for "burst", 0 for anything else
with open(DATA_DIR+"/detailed_data/handcrafted_features.tsv") as fp:
    for line in fp:
        info = line.split()
        if not info:
            # blank line (e.g. trailing newline at end of file): nothing to parse
            continue
        # Materialise the map: on Python 3 `map` is lazy and np.array(map(...))
        # would give a 0-d object array instead of a float vector.
        meta_features[info[0]] = np.array(list(map(float, info[-1].split(","))))
        meta_labels[info[0]] = 1 if info[1] == "burst" else 0

In [6]:
# loading the user, source, and target community embeddings for all examples
# `ids` maps each stripped line of full_ids.txt to its 0-based line number; the
# feature-building cell uses that number as the row index into `all_embeds`.
with open(DATA_DIR + "/detailed_data/full_ids.txt") as fp:
    ids = {example_id.strip(): row for row, example_id in enumerate(fp)}
# Pass the path instead of an open() handle: numpy then opens the file in
# binary mode itself and closes it again, so no file handle is leaked.
all_embeds = np.load(DATA_DIR + "/detailed_data/full_embeds.npy")

In [9]:
# loading the post embeddings from the LSTM
# Pass the path so numpy opens (binary mode) and closes the file itself.
lstm_embeds = np.load(DATA_DIR + "/detailed_data/lstm_embeds.npy")
# NOTE: unpickling can execute arbitrary code -- only load this file from the
# trusted data download.
with open(DATA_DIR + "/detailed_data/lstm_embeds-ids.pkl", "rb") as fp:
    # The pickle holds a sequence of ids; map each id to its position, which the
    # feature-building cell uses as the row index into `lstm_embeds`.
    lstm_ids = {example_id: row for row, example_id in enumerate(pickle.load(fp))}

In [8]:
# loading preprocessed lstm data to ensure identical train/val/test splits
# NOTE: unpickling can execute arbitrary code -- only load these files from the
# trusted data download.
# Each file is opened in binary mode inside `with`, so the handle is closed as
# soon as the object has been loaded.
with open(DATA_DIR + "/preprocessed_train_data.pkl", "rb") as fp:
    train_data = pickle.load(fp)
with open(DATA_DIR + "/preprocessed_val_data.pkl", "rb") as fp:
    val_data = pickle.load(fp)
with open(DATA_DIR + "/preprocessed_test_data.pkl", "rb") as fp:
    test_data = pickle.load(fp)

In [10]:
# flattening the preprocessed LSTM data (no need for minibatching here....)
def flatten(data):
    """Concatenate minibatched data into flat, per-example lists.

    Args:
        data: iterable of batches. Every batch must unpack into exactly seven
            fields ``(ids, text, users, subreddits, lengths, sfs, labels)``.
            ``text``, ``users`` and ``subreddits`` must provide ``.numpy()``
            whose result provides ``.tolist()``; ``ids``, ``lengths`` and
            ``labels`` must be iterable.

    Returns:
        A 6-tuple of lists ``(ids, text, users, subreddits, lengths, labels)``
        holding the batch contents concatenated in batch order. The ``sfs``
        field is not part of the result.
    """
    ids, text, users, subreddits, lengths, labels = [], [], [], [], [], []
    for batch in data:
        # The sixth field (sfs) is unpacked only to enforce the 7-field batch
        # layout; it was never returned, so it is no longer accumulated.
        bids, btext, busers, bsubreddits, blengths, _bsfs, blabels = batch
        ids.extend(bids)
        text.extend(btext.numpy().tolist())
        users.extend(busers.numpy().tolist())
        subreddits.extend(bsubreddits.numpy().tolist())
        lengths.extend(blengths)
        labels.extend(blabels)
    return (ids, text, users, subreddits, lengths, labels)
# Flatten each split once, in train/val/test order; element 0 of every result
# is that split's list of example ids.
flat_train_data, flat_val_data, flat_test_data = (
    flatten(split) for split in (train_data, val_data, test_data)
)

In [11]:
def _build_split(example_ids):
    """Build the feature matrix and label vector for one data split.

    Only ids present in ``meta_features`` are kept, and the same filtered id
    list drives both outputs, so row k of X always belongs to entry k of Y.
    (Previously ``train_X`` was built without this filter while ``train_Y``
    was built with it.)

    Args:
        example_ids: iterable of example ids (element 0 of a flattened split).

    Returns:
        ``(X, Y)``: X stacks, for every kept id, the concatenation of its
        handcrafted features, its row of ``all_embeds`` and its row of
        ``lstm_embeds`` (in that order); Y stacks the matching labels from
        ``meta_labels``.
    """
    kept = [i for i in example_ids if i in meta_features]
    X = np.stack([
        np.concatenate([meta_features[i], all_embeds[ids[i]], lstm_embeds[lstm_ids[i]]])
        for i in kept
    ])
    Y = np.stack([meta_labels[i] for i in kept])
    return X, Y


train_X, train_Y = _build_split(flat_train_data[0])
val_X, val_Y = _build_split(flat_val_data[0])
test_X, test_Y = _build_split(flat_test_data[0])

## Running the models

#### Baseline model 

In [20]:
# First we run the Random Forest with only the metadata/handcrafted features...
# random_state=0 seeds the forest; n_jobs=100 is the number of parallel jobs
# requested from scikit-learn.
baseline_mod = RandomForestClassifier(n_estimators=500, n_jobs=100, random_state=0)
# note that the first 263 features are the handcrafted ones: every row of
# train_X is built as [meta_features, all_embeds row, lstm_embeds row], so the
# handcrafted block comes first.
baseline_mod.fit(train_X[:, :263], train_Y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=100,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [21]:
# For reference, on the authors' server we get 0.682
# Validation ROC AUC of the handcrafted-features-only baseline; column 1 of
# predict_proba is the predicted probability of label 1 ("burst").
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(roc_auc_score(val_Y, baseline_mod.predict_proba(val_X[:, :263])[:,1]))

0.6823965206409768


In [22]:
# For reference, on the authors' server we get 0.667
# Test ROC AUC of the handcrafted-features-only baseline; column 1 of
# predict_proba is the predicted probability of label 1 ("burst").
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(roc_auc_score(test_Y, baseline_mod.predict_proba(test_X[:, :263])[:,1]))

0.6656650084718871


In [23]:
# Now we run the Random Forest on all features: the handcrafted ones plus the
# user/community embeddings and the LSTM post embeddings...
ensemble_mod = RandomForestClassifier(n_estimators=500, n_jobs=100, random_state=0)
# note that train_X[:, :] selects every column (no 263-column cut-off here)
ensemble_mod.fit(train_X[:, :], train_Y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=100,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [24]:
# For reference, on the authors' server we get 0.765
# Validation ROC AUC of the model trained on all features; column 1 of
# predict_proba is the predicted probability of label 1 ("burst").
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(roc_auc_score(val_Y, ensemble_mod.predict_proba(val_X[:, :])[:,1]))

0.7648275951540929


In [25]:
# For reference, on the authors' server we get 0.756
# Test ROC AUC of the model trained on all features; column 1 of
# predict_proba is the predicted probability of label 1 ("burst").
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(roc_auc_score(test_Y, ensemble_mod.predict_proba(test_X[:, :])[:,1]))

0.7564078921461626
