In [2]:
%matplotlib inline
from gensim.models import Word2Vec
import sys
import random
from sklearn.linear_model import SGDClassifier
from sklearn.dummy import DummyClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import KFold, cross_val_score
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from sklearn import tree
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import itertools
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score

because the backend has already been chosen;
matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.



In [3]:
# Read the pre-trained node2vec embeddings from disk. `binary=False` means the
# file is parsed as the plain-text word2vec format. The resulting object is
# queried by node id further down to obtain per-node feature vectors.
node2vecmodel = Word2Vec.load_word2vec_format(
    "node2vec-model-p=10000.0-q=10000.0-l=144-r=3-d=206-k=59-weighted=False-directed=False",
    binary=False
)

In [4]:
# Accumulates one (features, label) tuple per usable training pair.
# To end up with a 50:50 positive:negative distribution, the list is shuffled
# first and the larger class is then cut down to the size of the smaller one.
all_pairs = []

def handle_features(source, target):
    """Return the combined feature vector for the pair (source, target).

    Both identifiers are looked up in the module-level ``node2vecmodel``
    (source first, then target) and the two results are multiplied with
    ``*`` -- the Hadamard (element-wise) product when the lookups return
    arrays.
    """
    source_vec, target_vec = [node2vecmodel[node] for node in (source, target)]
    # Hadamard product of the two embeddings
    return source_vec * target_vec

# Seed for the shuffle that decides which samples of the majority class are
# kept; fixing it makes the balanced dataset identical on every full re-run.
SHUFFLE_SEED = 7

# Number of training pairs skipped because at least one endpoint is not
# contained in node2vecmodel.
i = 0
# load all training pairs
with open("training_data.csv") as training_data:
    for line in training_data:
        # A blank line (e.g. a trailing newline at the end of the file)
        # carries no pair; skip it instead of failing on the unpacking below.
        if not line.strip():
            continue
        # Each record holds three whitespace-separated fields.
        source, target, label = line.split()
        if source in node2vecmodel and target in node2vecmodel:
            features = handle_features(source, target)
            label = int(label)
            all_pairs.append((features, label))
        else:
            # count the number of pairs not in the model
            i += 1

# Shuffle so that the truncation below drops a random subset of the larger
# class. A dedicated seeded generator keeps the result reproducible without
# touching the state of the global `random` module.
random.Random(SHUFFLE_SEED).shuffle(all_pairs)

# Split by label; the *_len values are the class sizes before balancing.
neg_list = [pair for pair in all_pairs if pair[1] == 0]
neg_len = len(neg_list)
pos_list = [pair for pair in all_pairs if pair[1] == 1]
pos_len = len(pos_list)
print("Not in the model: " + str(i))

# make sure that we have 50:50 distribution between pos:neg by cutting both
# classes to the size of the smaller one (a no-op for the smaller class)
balanced_len = min(pos_len, neg_len)
pos_list = pos_list[:balanced_len]
neg_list = neg_list[:balanced_len]

assert len(pos_list) == len(neg_list)

# Negatives come first, followed by the positives.
balanced_pairs = neg_list + pos_list
X = [pair[0] for pair in balanced_pairs]
Y = [pair[1] for pair in balanced_pairs]

print("X size: " + str(len(X)))
print("Y size: " + str(len(Y)))

X = np.array(X)
Y = np.array(Y)



Not in the model: 80164
X size: 65474
Y size: 65474


In [None]:
# Cross-validation configuration: number of folds and the seed used for
# every source of randomness in the evaluation.
num_folds = 10
seed = 7

# Length of a single feature vector.
num_features = len(X[0])

# prepare models as (display name, estimator) tuples
models = []
# random_state pins the sample shuffling done by SGD so scores are repeatable.
models.append(('SGD', SGDClassifier(loss="hinge", penalty="l2", random_state=seed)))
# sanity check: uniformly random predictions. `strategy` is passed by keyword
# because newer scikit-learn releases no longer accept it positionally; the
# seed makes the random baseline repeatable as well.
models.append(('Dummy', DummyClassifier(strategy="uniform", random_state=seed)))
def keras_baseline_model():
    """Build and compile a small feed-forward network for binary labels.

    Architecture: one hidden Dense layer with 32 ReLU units that takes
    ``num_features`` inputs (module-level variable), followed by a single
    sigmoid output unit.

    Returns:
        The compiled ``Sequential`` model.
    """
    # create model
    model = Sequential()
    model.add(Dense(32, input_dim=num_features, init='normal', activation='relu'))
    # A sigmoid keeps the single output in [0, 1] so it can be read as the
    # probability of the positive class; a ReLU output is unbounded and can
    # get stuck at exactly 0.
    model.add(Dense(1, init='normal', activation='sigmoid'))
    # Compile model: binary cross-entropy is the loss matching a sigmoid
    # output for 0/1 labels; accuracy is tracked alongside the loss.
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
#models.append(('Keras', KerasClassifier(build_fn=keras_baseline_model, nb_epoch=100, batch_size=128, verbose=0)))

# Seeded so that repeated runs of the notebook report the same scores.
models.append(('Gradient Boosting', GradientBoostingClassifier(random_state=seed)))


# evaluate each model in turn
results = []  # one array of per-fold scores per model
names = []    # model names, in the same order as `results`
scoring = 'f1'
# The splitter is configured identically for every model, so build it once
# outside the loop; the fixed random_state gives every model the same folds.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
for name, model in models:
    print("-------------------------" + name + "-----------------------------------")
    cv_results = cross_val_score(model, X, y=Y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # mean and standard deviation of the score across the folds
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
    

-------------------------SGD-----------------------------------
SGD: 0.784990 (0.009752)
-------------------------Dummy-----------------------------------
Dummy: 0.498493 (0.008196)
-------------------------Gradient Boosting-----------------------------------


In [6]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print and plot a confusion matrix on the current matplotlib figure.

    Parameters:
        cm: 2-D numpy array; rows are drawn along the 'True label' axis and
            columns along the 'Predicted label' axis.
        classes: tick labels, used for both axes.
        normalize: if True, every row is divided by its sum before the
            matrix is printed, drawn and annotated.
        title: title of the plot.
        cmap: colormap passed to ``imshow``.

    Nothing is returned; the matrix is printed and the plot is drawn through
    ``pyplot``.
    """
    # Normalize *before* drawing so that the heat map, the colorbar, the
    # printed matrix and the cell annotations all show the same values.
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cells above half of the maximum get white text so the annotation stays
    # readable on the darker end of the colormap.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # Normalized values are shortened to two decimals; raw values are
        # passed through unchanged.
        cell_text = format(cm[i, j], '.2f') if normalize else cm[i, j]
        plt.text(j, i, cell_text,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')