In [None]:
import csv
import json
import pandas as pd
import numpy as np
import random
from collections import defaultdict

In [3]:
# Load the training data from CSV.
train_set = pd.read_csv('../data/final/train_reconstructed.csv')

# The label is selected by name, so the features are built by dropping that
# same column by name instead of slicing off "the last column". When 'edge'
# is the final column of the CSV the result is unchanged; when it is not,
# this prevents the label from leaking into the feature matrix.
X_train = train_set.drop(columns='edge').values
y_train = train_set['edge'].values

# Preview the first two rows as a quick sanity check on the load.
print('Training set X: {}'.format(X_train[:2]))
print('Training set Y: {}'.format(y_train[:2]))

Training set X: [[22.          2.          0.          3.         12.          0.46153846
   3.          0.375       0.          0.        ]
 [14.          0.          1.          5.         13.          0.56521739
   2.          0.33333333  0.          0.        ]]
Training set Y: [1 1]


In [4]:
# Load the held-out dev data from CSV.
test_set = pd.read_csv('../data/final/dev-test.csv')

# The label is selected by name, so the features are built by dropping that
# same column by name instead of slicing off "the last column". When 'edge'
# is the final column of the CSV the result is unchanged; when it is not,
# this prevents the label from leaking into the feature matrix.
X_test = test_set.drop(columns='edge').values
y_test = test_set['edge'].values

# Preview the first two rows as a quick sanity check on the load.
print('Test set X: {}'.format(X_test[:2]))
print('Test set Y: {}'.format(y_test[:2]))

Test set X: [[ 9.          0.          1.          2.          6.          0.3
   0.          0.          0.          0.        ]
 [12.          0.          0.          2.          8.          0.34782609
   1.          0.14285714  0.          0.        ]]
Test set Y: [0 1]


In [5]:
from sklearn.dummy import DummyClassifier

# Majority-class baseline: the "most_frequent" strategy always predicts the
# most common training label, which gives a floor for judging other models.
ds_clf = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)
Y_predict = ds_clf.predict(X_test)

# Score both splits up front, then report everything together.
baseline_train_acc = ds_clf.score(X_train, y_train)
baseline_dev_acc = ds_clf.score(X_test, y_test)

print("Prediction :", Y_predict[:10])
print("Accuracy for train set:", baseline_train_acc)
print("Accuracy for dev set:", baseline_dev_acc)

Prediction : [1 1 1 1 1 1 1 1 1 1]
Accuracy for train set: 0.5013927576601671
Accuracy for dev set: 0.5


In [6]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli naive Bayes with default settings, fitted on the training split.
bnb = BernoulliNB().fit(X_train, y_train)

# predict_proba yields class probabilities rather than hard labels;
# column 1 holds the probability of the label being one.
Y_proba = bnb.predict_proba(X_test)
first_positive_probas = Y_proba[:10, 1]

print("Probabilities :", first_positive_probas)
print("Accuracy for train set:", bnb.score(X_train, y_train))
print("Accuracy for dev set: ", bnb.score(X_test, y_test))


Probabilities : [1.98339377e-05 1.99907224e-02 4.20573320e-04 5.90976879e-03
 6.60444015e-01 6.60444015e-01 6.60444015e-01 6.60444015e-01
 1.99907224e-02 4.50153082e-05]
Accuracy for train set: 0.9596468176801387
Accuracy for dev set:  0.7640772708590218


In [7]:
from sklearn.ensemble import RandomForestClassifier

# Shallow random forest: 100 trees, each limited to depth 2.
# NOTE(review): the previous comment here read "2 0.80" -- presumably a record
# that max_depth=2 reached ~0.80 dev accuracy; confirm before relying on it.
# random_state is pinned so that re-running this cell rebuilds the same forest
# and therefore reports the same probabilities and accuracies.
clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
clf.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the label being one.
print("Probabilities :",  clf.predict_proba(X_test)[:10,1])
print("Accuracy for train set:", clf.score(X_train,y_train))
print("Accuracy for dev set: ", clf.score(X_test, y_test))

Probabilities : [0.04991666 0.06684381 0.11988967 0.16334668 0.74430083 0.74430083
 0.77337858 0.75021855 0.25037957 0.05300882]
Accuracy for train set: 0.9613811951437431
Accuracy for dev set:  0.8070283600493219


In [8]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression baseline with a fixed seed (random_state=0) and the
# solver's iteration cap set to 200.
clf = LogisticRegression(max_iter=200, random_state=0).fit(X_train, y_train)

# Mean accuracy on each split.
logreg_train_acc = clf.score(X_train, y_train)
logreg_dev_acc = clf.score(X_test, y_test)

print("Accuracy for train set:", logreg_train_acc)
print("Accuracy for dev set: ", logreg_dev_acc)


Accuracy for train set: 0.9612025017080991
Accuracy for dev set:  0.7879161528976573


In [13]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.losses import BinaryCrossentropy

# Small feed-forward binary classifier: two 128-unit hidden layers (tanh,
# then sigmoid) feeding a single sigmoid output unit.
# Flatten and SGD are imported above but not used in this cell.
model = Sequential([
    Dense(128, activation='tanh'),
    Dense(128, activation='sigmoid'),
    Dense(1, activation='sigmoid'),
])

# Adam optimizer with binary cross-entropy loss; accuracy is tracked as the
# only extra metric.
model.compile(loss=BinaryCrossentropy(), optimizer='adam', metrics=['accuracy'])

# Train for 30 epochs in batches of 1250 rows (this may take some time).
model.fit(X_train, y_train, batch_size=1250, epochs=30)

# evaluate returns the loss followed by the compiled metrics, so index 0 is
# the loss and index 1 is the accuracy.
score = model.evaluate(X_test, y_test, batch_size=1250)
eval_loss, eval_accuracy = score[0], score[1]
print("On the test set: the loss is {} and the accuracy is {}".format(eval_loss, eval_accuracy))

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
On the test set: the loss is 0.6925538778305054 and the accuracy is 0.7922317981719971


In [45]:
# Build the full training set by stacking the train and dev splits row-wise,
# so the final model can be fitted on every labelled example.
X_big = np.concatenate([X_train, X_test], axis=0)
y_big = np.concatenate([y_train, y_test], axis=0)

# Load the final test file and keep it as a plain NumPy array.
test_final = pd.read_csv('../data/final/test-final.csv').values


In [50]:
# Final model: random forest (100 trees, depth 2) fitted on X_big / y_big.
# random_state is pinned so the submission file can be regenerated exactly.
clf = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0)
clf.fit(X_big, y_big)

# Predict once and reuse the result for both the preview and the submission
# (previously predict_proba was run twice on the same data).
# Column 1 of predict_proba is the probability of the label being one.
pred = clf.predict_proba(test_final)[:,1]
print("Probabilities :", pred)

# Submission format: 1-based row Id alongside the predicted probability.
submission = {
    'Id': range(1,len(pred)+1),
    'Predicted': pred
}

submission_df = pd.DataFrame(data=submission)
submission_df.to_csv('../data/final/sub.csv', index=False)

Probabilities : [0.06548474 0.81886548 0.31918873 ... 0.13307536 0.2042267  0.73967939]
