In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
import random
import copy

In [None]:
# Load the raw test/train CSV files as DataFrames.
# NOTE(review): in the cells visible here these two frames are never read
# again -- transform_data() re-reads the same CSV files itself.
df_test = pd.read_csv('test1.csv', sep=',')
df_train = pd.read_csv('train1.csv', sep=',')

In [None]:
def encode_categorical(data):
    """One-hot encode every column of ``data``.

    Each column is expanded with ``pd.get_dummies`` into indicator columns
    named ``"<column>: <value>"``. The indicator groups appear in the same
    order as the source columns, and the source columns themselves are not
    kept in the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are all treated as categorical. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        New frame holding only the indicator columns (a copy of ``data``
        when it has no columns at all).
    """
    if len(data.columns) == 0:
        # pd.concat rejects an empty list of frames, so short-circuit here.
        return data.copy()

    # Build every indicator group first and concatenate once, instead of
    # re-concatenating and dropping from a growing frame on each iteration.
    indicator_groups = [
        pd.get_dummies(data[column], prefix=column, prefix_sep=': ')
        for column in data.columns
    ]
    return pd.concat(indicator_groups, axis=1)


In [None]:
def transform_data(name, target='V10'):
    """Load a CSV file and turn it into a binary feature matrix plus labels.

    Parameters
    ----------
    name : str
        Path of the comma-separated file to read.
    target : str, optional
        Name of the label column (default ``'V10'``). The labels
        ``'positive'`` / ``'negative'`` are mapped to ``1`` / ``0``; any
        other label value is passed through unchanged.

    Returns
    -------
    X : numpy.ndarray
        Integer matrix obtained by one-hot encoding every non-target column
        with ``encode_categorical``.
    y : numpy.ndarray
        Labels, one per row of ``X``.
    """
    df = pd.read_csv(name, sep=',')

    # Map the labels on the target column only: a frame-wide replace would
    # also rewrite any feature cell that happens to equal 'positive' or
    # 'negative'.
    label_codes = {'positive': 1, 'negative': 0}
    y = np.array([label_codes.get(label, label) for label in df[target]])

    # drop() without inplace leaves the freshly read frame untouched.
    features = df.drop(columns=[target])
    return np.array(encode_categorical(features)).astype(int), y

In [None]:
# Build the one-hot feature matrices and 0/1 label vectors for the
# training and test files via transform_data().
X_train, y_train = transform_data('train1.csv')
X_test, y_test = transform_data('test1.csv')

In [None]:
# Partition the training matrix by label: rows labelled 1 (positive)
# and rows labelled 0 (negative).
X_train_pos, X_train_neg = X_train[y_train == 1], X_train[y_train == 0]

Algorithm 1: Lazy classification with majority rule

In [None]:
def algorithm1(X_train_pos, X_train_neg, X_test, rng=None):
  """Lazy classification by majority vote of non-falsified intersections.

  For every test row, a training example "votes" for its own class when the
  set of features on which it agrees with the test row is not reproduced by
  any example of the opposite class. The class with more votes wins; ties
  are broken at random.

  Parameters
  ----------
  X_train_pos, X_train_neg : sequence of 1-D numpy arrays (e.g. a 2-D array)
      Positive / negative training examples.
  X_test : sequence of 1-D numpy arrays
      Rows to classify, with the same feature layout as the training rows.
  rng : object with a ``randint(a, b)`` method, optional
      Randomness source used for tie-breaking, e.g. ``random.Random(seed)``
      for reproducible runs. Defaults to the global ``random`` module.

  Returns
  -------
  numpy.ndarray
      Predicted labels (1 = positive, 0 = negative), one per test row.
  """
  rng = random if rng is None else rng

  def count_votes(test_row, own_class, other_class):
    """Count own-class examples whose agreement with test_row is not falsified."""
    votes = 0
    for own in own_class:
      # Boolean mask of the features on which the test row and `own` agree.
      desc = test_row == own
      projected = test_row[desc]
      # Only the existence of a counter-example matters, so any() stops at
      # the first opposite-class example that matches on all masked features.
      if not any(np.all(projected == other[desc]) for other in other_class):
        votes += 1
    return votes

  y_pred = []
  for test_row in X_test:
    pos_value = count_votes(test_row, X_train_pos, X_train_neg)
    neg_value = count_votes(test_row, X_train_neg, X_train_pos)

    if pos_value > neg_value:
      y_pred.append(1)
    elif pos_value < neg_value:
      y_pred.append(0)
    else:
      # Tie: pick a label at random (parity of a draw from 0..9).
      y_pred.append(rng.randint(0, 9) % 2)
  return np.array(y_pred)

In [None]:
# Classify the test rows with algorithm1 using the positive/negative training split.
y_pred1 = algorithm1(X_train_pos, X_train_neg, X_test)

In [None]:
# Report accuracy, precision, recall and F1 of the algorithm1 predictions on the test labels.
print(f'accuracy: {accuracy_score(y_test, y_pred1)}\nprecision: {precision_score(y_test, y_pred1)} \nrecall: {recall_score(y_test, y_pred1)} \nF1 score: {f1_score(y_test, y_pred1)}')

accuracy: 0.8279569892473119
precision: 0.7922077922077922 
recall: 1.0 
F1 score: 0.8840579710144928


Algorithm 2: Algorithm 1 + normalized comparison

In [None]:
def algorithm2(X_train_pos, X_train_neg, X_test, rng=None):
  """Lazy classification comparing class-size-normalized vote shares.

  Votes are counted as follows: a training example votes for its own class
  when the set of features on which it agrees with the test row is not
  reproduced by any example of the opposite class. Each class's vote count
  is then divided by the number of training examples of that class before
  the two are compared; ties are broken at random.

  Parameters
  ----------
  X_train_pos, X_train_neg : sequence of 1-D numpy arrays (e.g. a 2-D array)
      Positive / negative training examples. An empty class contributes a
      vote share of 0 instead of raising ``ZeroDivisionError``.
  X_test : sequence of 1-D numpy arrays
      Rows to classify, with the same feature layout as the training rows.
  rng : object with a ``randint(a, b)`` method, optional
      Randomness source used for tie-breaking, e.g. ``random.Random(seed)``
      for reproducible runs. Defaults to the global ``random`` module.

  Returns
  -------
  list of int
      Predicted labels (1 = positive, 0 = negative), one per test row.
  """
  rng = random if rng is None else rng
  pos_total = len(X_train_pos)
  neg_total = len(X_train_neg)

  def count_votes(test_row, own_class, other_class):
    """Count own-class examples whose agreement with test_row is not falsified."""
    votes = 0
    for own in own_class:
      # Boolean mask of the features on which the test row and `own` agree.
      desc = test_row == own
      projected = test_row[desc]
      # any() stops at the first opposite-class example that matches on all
      # masked features; only its existence matters.
      if not any(np.all(projected == other[desc]) for other in other_class):
        votes += 1
    return votes

  y_pred = []
  for test_row in X_test:
    pos_value = count_votes(test_row, X_train_pos, X_train_neg)
    neg_value = count_votes(test_row, X_train_neg, X_train_pos)

    # Normalize by class size; guard against an empty class.
    pos_share = pos_value / pos_total if pos_total else 0.0
    neg_share = neg_value / neg_total if neg_total else 0.0

    if pos_share > neg_share:
      y_pred.append(1)
    elif pos_share < neg_share:
      y_pred.append(0)
    else:
      # Tie: pick a label at random (parity of a draw from 0..9).
      y_pred.append(rng.randint(0, 9) % 2)

  return y_pred

In [None]:
# Classify the test rows with algorithm2 using the positive/negative training split.
y_pred2 = algorithm2(X_train_pos, X_train_neg, X_test)

In [None]:
# Report accuracy, precision, recall and F1 of the algorithm2 predictions on the test labels.
print(f'accuracy: {accuracy_score(y_test, y_pred2)}\nprecision: {precision_score(y_test, y_pred2)} \nrecall: {recall_score(y_test, y_pred2)} \nF1 score: {f1_score(y_test, y_pred2)}')

accuracy: 0.8602150537634409
precision: 0.875 
recall: 0.9180327868852459 
F1 score: 0.8959999999999999


Algorithm 3: Algorithm 2 + cardinality of intersection

In [None]:
def algorithm3(X_train_pos, X_train_neg, X_test, c, rng=None):
  """Lazy classification with normalized votes and a minimum-overlap threshold.

  A training example is considered only when the fraction of features on
  which it agrees with the test row is strictly greater than ``c``. Such an
  example votes for its own class when that set of agreeing features is not
  reproduced by any example of the opposite class. Vote counts are divided
  by the class sizes before being compared; ties are broken at random.

  Parameters
  ----------
  X_train_pos, X_train_neg : sequence of 1-D numpy arrays (e.g. a 2-D array)
      Positive / negative training examples. An empty class contributes a
      vote share of 0 instead of raising ``ZeroDivisionError``.
  X_test : sequence of 1-D numpy arrays
      Rows to classify, with the same feature layout as the training rows.
  c : float
      Overlap threshold, compared against (number of agreeing features) /
      (number of features).
  rng : object with a ``randint(a, b)`` method, optional
      Randomness source used for tie-breaking, e.g. ``random.Random(seed)``
      for reproducible runs. Defaults to the global ``random`` module.

  Returns
  -------
  list of int
      Predicted labels (1 = positive, 0 = negative), one per test row.
  """
  rng = random if rng is None else rng
  pos_total = len(X_train_pos)
  neg_total = len(X_train_neg)

  def count_votes(test_row, own_class, other_class):
    """Count sufficiently overlapping own-class examples that are not falsified."""
    votes = 0
    for own in own_class:
      # Boolean mask of the features on which the test row and `own` agree.
      desc = test_row == own
      # Skip examples that share too few features with the test row.
      if np.count_nonzero(desc) / len(own) <= c:
        continue
      projected = test_row[desc]
      # any() stops at the first opposite-class example that matches on all
      # masked features; only its existence matters.
      if not any(np.all(projected == other[desc]) for other in other_class):
        votes += 1
    return votes

  y_pred = []
  for test_row in X_test:
    pos_value = count_votes(test_row, X_train_pos, X_train_neg)
    neg_value = count_votes(test_row, X_train_neg, X_train_pos)

    # Normalize by class size; guard against an empty class.
    pos_share = pos_value / pos_total if pos_total else 0.0
    neg_share = neg_value / neg_total if neg_total else 0.0

    if pos_share > neg_share:
      y_pred.append(1)
    elif pos_share < neg_share:
      y_pred.append(0)
    else:
      # Tie: pick a label at random (parity of a draw from 0..9).
      y_pred.append(rng.randint(0, 9) % 2)
  return y_pred

In [None]:
# Sweep the overlap threshold of algorithm3 over 8 evenly spaced values
# from 0.6 to 0.95 and print the metrics for each value.
# NOTE(review): the metrics here are computed on X_test / y_test, i.e. the
# threshold is being compared on the test split itself.
c = np.linspace(0.6, 0.95, 8)
for param in c:
  y_pred3 = algorithm3(X_train_pos, X_train_neg, X_test, param)
  print(f'parameter: {param} \naccuracy: {accuracy_score(y_test, y_pred3)}\nprecision: {precision_score(y_test, y_pred3)} \nrecall: {recall_score(y_test, y_pred3)} \nF1 score: {f1_score(y_test, y_pred3)}\n')

parameter: 0.6 
accuracy: 0.8494623655913979
precision: 0.873015873015873 
recall: 0.9016393442622951 
F1 score: 0.8870967741935485

parameter: 0.65 
accuracy: 0.8709677419354839
precision: 0.9622641509433962 
recall: 0.8360655737704918 
F1 score: 0.8947368421052632

parameter: 0.7 
accuracy: 0.8709677419354839
precision: 0.9622641509433962 
recall: 0.8360655737704918 
F1 score: 0.8947368421052632

parameter: 0.75 
accuracy: 0.8172043010752689
precision: 0.9782608695652174 
recall: 0.7377049180327869 
F1 score: 0.8411214953271028

parameter: 0.7999999999999999 
accuracy: 0.9354838709677419
precision: 1.0 
recall: 0.9016393442622951 
F1 score: 0.9482758620689655

parameter: 0.85 
accuracy: 0.9354838709677419
precision: 1.0 
recall: 0.9016393442622951 
F1 score: 0.9482758620689655

parameter: 0.8999999999999999 
accuracy: 0.4838709677419355
precision: 0.6756756756756757 
recall: 0.4098360655737705 
F1 score: 0.5102040816326531

parameter: 0.95 
accuracy: 0.5268817204301075
precision: 0.6

Algorithm 4: mean cardinality of the 20 largest intersections

In [None]:
def algorithm4(X_train_pos, X_train_neg, X_test, top_k=20, rng=None):
  """Classify by the mean size of the largest overlaps with each class.

  For every test row, the number of agreeing features is computed against
  each training example. Per class, the ``top_k`` largest counts are
  averaged, and the class with the larger mean wins; ties are broken at
  random.

  The mean is taken over the number of counts actually used, so a class
  with fewer than ``top_k`` examples is not penalized for its size.

  Parameters
  ----------
  X_train_pos, X_train_neg : sequence of 1-D numpy arrays (e.g. a 2-D array)
      Positive / negative training examples. An empty class has mean 0.
  X_test : sequence of 1-D numpy arrays
      Rows to classify, with the same feature layout as the training rows.
  top_k : int, optional
      How many of the largest overlaps per class are averaged (default 20).
  rng : object with a ``randint(a, b)`` method, optional
      Randomness source used for tie-breaking, e.g. ``random.Random(seed)``
      for reproducible runs. Defaults to the global ``random`` module.

  Returns
  -------
  list of int
      Predicted labels (1 = positive, 0 = negative), one per test row.

  Raises
  ------
  ValueError
      If ``top_k`` is smaller than 1.
  """
  if top_k < 1:
    raise ValueError('top_k must be at least 1')
  rng = random if rng is None else rng

  def mean_top_overlap(test_row, examples):
    """Mean of the top_k largest agreement counts between test_row and examples."""
    overlaps = sorted(
      (np.count_nonzero(test_row == example) for example in examples),
      reverse=True,
    )[:top_k]
    # Divide by how many counts were actually kept, not by a fixed constant.
    return sum(overlaps) / len(overlaps) if overlaps else 0.0

  y_pred = []
  for test_row in X_test:
    mean_pos = mean_top_overlap(test_row, X_train_pos)
    mean_neg = mean_top_overlap(test_row, X_train_neg)

    if mean_pos > mean_neg:
      y_pred.append(1)
    elif mean_pos < mean_neg:
      y_pred.append(0)
    else:
      # Tie: pick a label at random (parity of a draw from 0..9).
      y_pred.append(rng.randint(0, 9) % 2)
  return y_pred

In [None]:
# Classify the test rows with algorithm4 using the positive/negative training split.
y_pred4 = algorithm4(X_train_pos, X_train_neg, X_test)

In [None]:
# Report accuracy, precision, recall and F1 of the algorithm4 predictions on the test labels.
print(f'accuracy: {accuracy_score(y_test, y_pred4)}\nprecision: {precision_score(y_test, y_pred4)} \nrecall: {recall_score(y_test, y_pred4)} \nF1 score: {f1_score(y_test, y_pred4)}')

accuracy: 1.0
precision: 1.0 
recall: 1.0 
F1 score: 1.0


5-fold cross-validation

In [None]:
# Load the raw frames of the 'test4.csv' / 'train4.csv' files.
# NOTE(review): the cross-validation cells that follow (as visible here)
# split X_train / y_train, which were built from 'train1.csv' by
# transform_data(); these two frames are not used by them. Confirm whether
# the folds were meant to come from train4.csv.
df_test = pd.read_csv('test4.csv', sep=',')
df_train = pd.read_csv('train4.csv', sep=',')

In [None]:
# Stratified 5-fold splitter with shuffling. random_state is fixed so that
# every skf.split(...) call yields the same folds: all classifiers are then
# scored on identical splits and a re-run reproduces the numbers.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=346)
skf.get_n_splits(X_train, y_train)

5

In [None]:
# 5-fold evaluation of algorithm1 on the training data; enumerate() supplies
# the 1-based fold number.
for i, (train_i, test_i) in enumerate(skf.split(X_train, y_train), start=1):
  X_train_, X_test_ = X_train[train_i], X_train[test_i]
  y_train_, y_test_ = y_train[train_i], y_train[test_i]
  # Fold-local names, so the full-training-set X_train_pos / X_train_neg
  # are not overwritten by a fold subset.
  X_fold_pos = X_train_[y_train_ == 1]
  X_fold_neg = X_train_[y_train_ == 0]
  y_pred = algorithm1(X_fold_pos, X_fold_neg, X_test_)
  print(f'fold: {i}\naccuracy: {accuracy_score(y_test_, y_pred)}\nprecision: {precision_score(y_test_, y_pred)} \nrecall: {recall_score(y_test_, y_pred)} \nF1 score: {f1_score(y_test_, y_pred)}\n')

fold: 1
accuracy: 0.7745664739884393
precision: 0.743421052631579 
recall: 1.0 
F1 score: 0.8528301886792453

fold: 2
accuracy: 0.7803468208092486
precision: 0.7483443708609272 
recall: 1.0 
F1 score: 0.8560606060606061

fold: 3
accuracy: 0.7572254335260116
precision: 0.7290322580645161 
recall: 1.0 
F1 score: 0.8432835820895522

fold: 4
accuracy: 0.7687861271676301
precision: 0.738562091503268 
recall: 1.0 
F1 score: 0.849624060150376

fold: 5
accuracy: 0.7572254335260116
precision: 0.7290322580645161 
recall: 1.0 
F1 score: 0.8432835820895522



In [None]:
# 5-fold evaluation of algorithm2 on the training data; enumerate() supplies
# the 1-based fold number.
for i, (train_i, test_i) in enumerate(skf.split(X_train, y_train), start=1):
  X_train_, X_test_ = X_train[train_i], X_train[test_i]
  y_train_, y_test_ = y_train[train_i], y_train[test_i]
  # Fold-local names, so the full-training-set X_train_pos / X_train_neg
  # are not overwritten by a fold subset.
  X_fold_pos = X_train_[y_train_ == 1]
  X_fold_neg = X_train_[y_train_ == 0]
  y_pred = algorithm2(X_fold_pos, X_fold_neg, X_test_)
  print(f'fold: {i}\naccuracy: {accuracy_score(y_test_, y_pred)}\nprecision: {precision_score(y_test_, y_pred)} \nrecall: {recall_score(y_test_, y_pred)} \nF1 score: {f1_score(y_test_, y_pred)}\n')

fold: 1
accuracy: 0.8554913294797688
precision: 0.860655737704918 
recall: 0.9292035398230089 
F1 score: 0.8936170212765957

fold: 2
accuracy: 0.8554913294797688
precision: 0.8384615384615385 
recall: 0.9646017699115044 
F1 score: 0.8971193415637859

fold: 3
accuracy: 0.884393063583815
precision: 0.8549618320610687 
recall: 0.9911504424778761 
F1 score: 0.9180327868852459

fold: 4
accuracy: 0.838150289017341
precision: 0.8148148148148148 
recall: 0.9734513274336283 
F1 score: 0.8870967741935484

fold: 5
accuracy: 0.8786127167630058
precision: 0.8650793650793651 
recall: 0.9646017699115044 
F1 score: 0.9121338912133892



In [None]:
# 5-fold evaluation of algorithm3 (overlap threshold 0.85) on the training
# data; enumerate() supplies the 1-based fold number.
for i, (train_i, test_i) in enumerate(skf.split(X_train, y_train), start=1):
  X_train_, X_test_ = X_train[train_i], X_train[test_i]
  y_train_, y_test_ = y_train[train_i], y_train[test_i]
  # Fold-local names, so the full-training-set X_train_pos / X_train_neg
  # are not overwritten by a fold subset.
  X_fold_pos = X_train_[y_train_ == 1]
  X_fold_neg = X_train_[y_train_ == 0]
  y_pred = algorithm3(X_fold_pos, X_fold_neg, X_test_, 0.85)
  print(f'fold: {i}\naccuracy: {accuracy_score(y_test_, y_pred)}\nprecision: {precision_score(y_test_, y_pred)} \nrecall: {recall_score(y_test_, y_pred)} \nF1 score: {f1_score(y_test_, y_pred)}\n')

fold: 1
accuracy: 0.9826589595375722
precision: 1.0 
recall: 0.9734513274336283 
F1 score: 0.9865470852017937

fold: 2
accuracy: 0.9653179190751445
precision: 1.0 
recall: 0.9469026548672567 
F1 score: 0.9727272727272728

fold: 3
accuracy: 0.9653179190751445
precision: 1.0 
recall: 0.9469026548672567 
F1 score: 0.9727272727272728

fold: 4
accuracy: 0.976878612716763
precision: 1.0 
recall: 0.9646017699115044 
F1 score: 0.9819819819819819

fold: 5
accuracy: 0.9884393063583815
precision: 1.0 
recall: 0.9823008849557522 
F1 score: 0.9910714285714286



In [None]:
# 5-fold evaluation of algorithm4 on the training data; enumerate() supplies
# the 1-based fold number.
for i, (train_i, test_i) in enumerate(skf.split(X_train, y_train), start=1):
  X_train_, X_test_ = X_train[train_i], X_train[test_i]
  y_train_, y_test_ = y_train[train_i], y_train[test_i]
  # Fold-local names, so the full-training-set X_train_pos / X_train_neg
  # are not overwritten by a fold subset.
  X_fold_pos = X_train_[y_train_ == 1]
  X_fold_neg = X_train_[y_train_ == 0]
  y_pred = algorithm4(X_fold_pos, X_fold_neg, X_test_)
  print(f'fold: {i}\naccuracy: {accuracy_score(y_test_, y_pred)}\nprecision: {precision_score(y_test_, y_pred)} \nrecall: {recall_score(y_test_, y_pred)} \nF1 score: {f1_score(y_test_, y_pred)}\n')

fold: 1
accuracy: 0.8728323699421965
precision: 0.8421052631578947 
recall: 0.9911504424778761 
F1 score: 0.9105691056910569

fold: 2
accuracy: 0.8959537572254336
precision: 0.8625954198473282 
recall: 1.0 
F1 score: 0.9262295081967213

fold: 3
accuracy: 0.930635838150289
precision: 0.904 
recall: 1.0 
F1 score: 0.9495798319327732

fold: 4
accuracy: 0.9075144508670521
precision: 0.8818897637795275 
recall: 0.9911504424778761 
F1 score: 0.9333333333333333

fold: 5
accuracy: 0.9248554913294798
precision: 0.8968253968253969 
recall: 1.0 
F1 score: 0.9456066945606695



## Classical ML methods:

Logistic Regression

In [None]:
# 5-fold evaluation of logistic regression on the one-hot features;
# enumerate() supplies the 1-based fold number.
for i, (train_i, test_i) in enumerate(skf.split(X_train, y_train), start=1):
  X_train_, X_test_ = X_train[train_i], X_train[test_i]
  y_train_, y_test_ = y_train[train_i], y_train[test_i]
  clf = LogisticRegression(random_state=346).fit(X_train_, y_train_)
  y_pred = clf.predict(X_test_)
  print(f'fold: {i}\naccuracy: {accuracy_score(y_test_, y_pred)}\nprecision: {precision_score(y_test_, y_pred)} \nrecall: {recall_score(y_test_, y_pred)} \nF1 score: {f1_score(y_test_, y_pred)}\n')

fold: 1
accuracy: 0.976878612716763
precision: 0.9658119658119658 
recall: 1.0 
F1 score: 0.9826086956521739

fold: 2
accuracy: 0.976878612716763
precision: 0.9739130434782609 
recall: 0.9911504424778761 
F1 score: 0.9824561403508772

fold: 3
accuracy: 0.9710982658959537
precision: 0.9736842105263158 
recall: 0.9823008849557522 
F1 score: 0.9779735682819383

fold: 4
accuracy: 0.9653179190751445
precision: 0.9495798319327731 
recall: 1.0 
F1 score: 0.9741379310344828

fold: 5
accuracy: 0.9884393063583815
precision: 0.9826086956521739 
recall: 1.0 
F1 score: 0.9912280701754386



Naive Bayes

In [None]:
# 5-fold evaluation of Gaussian naive Bayes on the one-hot features;
# enumerate() supplies the 1-based fold number.
# NOTE(review): the features are 0/1 indicator columns (see transform_data);
# GaussianNB models them as continuous values -- a Bernoulli naive Bayes
# model may be the more natural baseline. Confirm the intended comparison.
for i, (train_i, test_i) in enumerate(skf.split(X_train, y_train), start=1):
  X_train_, X_test_ = X_train[train_i], X_train[test_i]
  y_train_, y_test_ = y_train[train_i], y_train[test_i]
  clf = GaussianNB()
  clf.fit(X_train_, y_train_)
  y_pred = clf.predict(X_test_)
  print(f'fold: {i}\naccuracy: {accuracy_score(y_test_, y_pred)}\nprecision: {precision_score(y_test_, y_pred)} \nrecall: {recall_score(y_test_, y_pred)} \nF1 score: {f1_score(y_test_, y_pred)}\n')

fold: 1
accuracy: 0.630057803468208
precision: 0.7130434782608696 
recall: 0.7256637168141593 
F1 score: 0.719298245614035

fold: 2
accuracy: 0.6763005780346821
precision: 0.7394957983193278 
recall: 0.7787610619469026 
F1 score: 0.7586206896551726

fold: 3
accuracy: 0.6763005780346821
precision: 0.7317073170731707 
recall: 0.7964601769911505 
F1 score: 0.7627118644067795

fold: 4
accuracy: 0.6647398843930635
precision: 0.7272727272727273 
recall: 0.7787610619469026 
F1 score: 0.752136752136752

fold: 5
accuracy: 0.7052023121387283
precision: 0.7672413793103449 
recall: 0.7876106194690266 
F1 score: 0.777292576419214

