In [1]:
import csv
import json
import pandas as pd
import numpy as np
import random
from collections import defaultdict

In [2]:
# Load the training split from disk.
train_set = pd.read_csv('../data/final/train_reconstructed.csv')

# Pick the label by name and take the features as "every column except the
# label". A positional slice (iloc[:, :-1]) only works while 'edge' happens to
# be the last column; if the CSV layout ever changes it would silently leak
# the label into X.
X_train = train_set.drop(columns='edge').values
y_train = train_set['edge'].values

# Show the first two rows as a quick sanity check of what was loaded.
print('Training set X: {}'.format(X_train[:2]))
print('Training set Y: {}'.format(y_train[:2]))

Training set X: [[ 7.         15.          7.          5.          2.          2.
   2.          0.          0.         12.          0.46153846  3.
   0.375       0.          0.        ]
 [ 7.          7.          7.          7.          2.          1.
   0.          1.          0.         13.          0.56521739  2.
   0.33333333  0.          0.        ]]
Training set Y: [1 1]


In [3]:
# Load the held-out dev/test split from disk.
test_set = pd.read_csv('../data/final/dev-test.csv')

# Pick the label by name and take the features as "every column except the
# label". A positional slice (iloc[:, :-1]) only works while 'edge' happens to
# be the last column; if the CSV layout ever changes it would silently leak
# the label into X.
X_test = test_set.drop(columns='edge').values
y_test = test_set['edge'].values

# Show the first two rows as a quick sanity check of what was loaded.
print('Test set X: {}'.format(X_test[:2]))
print('Test set Y: {}'.format(y_test[:2]))

Test set X: [[4.         5.         3.         3.         1.         0.
  0.         1.         0.         6.         0.3        0.
  0.         0.         0.        ]
 [8.         4.         2.         2.         0.         0.
  0.         0.         0.         8.         0.34782609 1.
  0.14285714 0.         0.        ]]
Test set Y: [0 1]


In [4]:
from sklearn.dummy import DummyClassifier

# Baseline: a classifier configured with the "most_frequent" strategy, used as
# the reference point for the models trained in the cells below.
ds_clf = DummyClassifier(strategy="most_frequent")
ds_clf.fit(X_train, y_train)

# Hard label predictions on the held-out split.
Y_predict = ds_clf.predict(X_test)

# Mean accuracy on each split, computed before reporting.
dummy_train_acc = ds_clf.score(X_train, y_train)
dummy_dev_acc = ds_clf.score(X_test, y_test)

print("Prediction :", Y_predict[:10])
print("Accuracy for train set:", dummy_train_acc)
print("Accuracy for dev set:", dummy_dev_acc)

Prediction : [1 1 1 1 1 1 1 1 1 1]
Accuracy for train set: 0.5012030975822467
Accuracy for dev set: 0.5


In [5]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli naive Bayes with the library's default settings.
# NOTE(review): BernoulliNB models binary features; the inputs here look like
# counts and ratios — confirm that relying on its default binarisation is
# intended.
bnb = BernoulliNB()
bnb.fit(X_train, y_train)

# predict_proba returns per-class probabilities (not hard labels).
Y_proba = bnb.predict_proba(X_test)

bnb_train_acc = bnb.score(X_train, y_train)
bnb_dev_acc = bnb.score(X_test, y_test)

# Column index 1 is the probability of the label being one.
print("Probabilities :", Y_proba[:10, 1])
print("Accuracy for train set:", bnb_train_acc)
print("Accuracy for dev set: ", bnb_dev_acc)


Probabilities : [7.06494086e-06 1.08507418e-02 6.05664564e-03 5.90119628e-03
 5.57597558e-01 5.57597558e-01 5.57597558e-01 5.57597558e-01
 1.08507418e-02 2.31106212e-05]
Accuracy for train set: 0.9594729486923538
Accuracy for dev set:  0.7986025482942869


In [67]:
from sklearn.ensemble import RandomForestClassifier

# NOTE(review): the original cell carried the bare note "2 0.80" — presumably
# a record of a tuning run (dev accuracy of about 0.80); confirm what the "2"
# referred to.
#
# random_state pins the forest's internal randomness so that a
# Restart & Run All reproduces the same scores; without it every run prints
# slightly different numbers. The seed matches the one used for the
# LogisticRegression cell.
#
# A binary tree with at most 5 leaves cannot be deeper than 4, so
# max_leaf_nodes is the binding size limit here; max_depth=9 is kept as-is
# for parity with the original configuration.
clf = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=10,
    bootstrap=False,
    max_leaf_nodes=5,
    max_depth=9,
    random_state=0,
)
clf.fit(X_train, y_train)

# Column index 1 of predict_proba is the probability of the label being one.
print("Probabilities :",  clf.predict_proba(X_test)[:10,1])
print("Accuracy for train set:", clf.score(X_train,y_train))
print("Accuracy for dev set: ", clf.score(X_test, y_test))

Probabilities : [0.05434232 0.10237368 0.14717173 0.23653314 0.60564735 0.61282509
 0.68590436 0.61669096 0.32421626 0.09030745]
Accuracy for train set: 0.9613852959409904
Accuracy for dev set:  0.8070283600493219


In [22]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with a fixed seed and an iteration cap of 200.
# Note: this rebinds `clf`, replacing whatever model the name held before.
clf = LogisticRegression(max_iter=200, random_state=0)
clf.fit(X_train, y_train)

# Mean accuracy on each split, computed before reporting.
logreg_train_acc = clf.score(X_train, y_train)
logreg_dev_acc = clf.score(X_test, y_test)

print("Accuracy for train set:", logreg_train_acc)
print("Accuracy for dev set: ", logreg_dev_acc)


Accuracy for train set: 0.9613958033434554
Accuracy for dev set:  0.7858610768598439


In [24]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.losses import BinaryCrossentropy
# Flatten and SGD are not used in this cell; the imports are kept in case
# other cells rely on them.

# Small feed-forward binary classifier: two hidden Dense layers (64 units with
# selu, 128 units with sigmoid) followed by a single sigmoid output unit.
model = Sequential([
    Dense(64, activation='selu'),
    Dense(128, activation='sigmoid'),
    Dense(1, activation='sigmoid'),
])

# Train with the Adam optimiser on binary cross-entropy, tracking accuracy.
model.compile(
    loss=BinaryCrossentropy(),
    optimizer='adam',
    metrics=['accuracy'],
)

# Fit the model (this may take some time).
model.fit(X_train, y_train, batch_size=2000, epochs=30)

# Evaluate on the held-out split; the result is indexed as [loss, accuracy]
# in the report below.
score = model.evaluate(X_test, y_test, batch_size=2000)
print("On the test set: the loss is {} and the accuracy is {}".format(score[0], score[1]))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
On the test set: the loss is 0.7180258631706238 and the accuracy is 0.7909987568855286
