In [1]:
# library used
from utils import import_data
import numpy as np
import logging
logging.basicConfig(level=logging.DEBUG)

In [2]:
# get the dataset
dataset_path = "data"
dataset_version = "fake-v1.0"

fake_dataset = import_data(dataset_path, dataset_version).T
fake_dataset.head()

Unnamed: 0,user_media_count,user_follower_count,user_following_count,user_has_profil_pic,user_is_private,follower_following_ratio,user_biography_length,username_length,username_digit_count,is_fake
0,0.0,25.0,1937.0,1.0,1.0,0.012907,0.0,10.0,0.0,1.0
1,0.0,324.0,4122.0,1.0,0.0,0.078603,0.0,15.0,4.0,1.0
2,0.0,15.0,399.0,0.0,0.0,0.037594,0.0,12.0,3.0,1.0
3,1.0,14.0,107.0,1.0,0.0,0.130841,0.0,10.0,1.0,1.0
4,0.0,264.0,4651.0,1.0,0.0,0.056762,0.0,14.0,0.0,1.0


In [7]:
# drop unused feature for data training
X = fake_dataset.drop(['is_fake', 'follower_following_ratio'], axis=1)

# label
y = fake_dataset.is_fake

# change the format to integer to make it faster
X = X.astype(int) 

# change the format to numpy array
X_num = X.to_numpy()
y_num  = y.to_numpy()
print(X_num)
print(y_num)

def train_test_split(X, y, test_size=0.2):
    i = int((1 - test_size) * X.shape[0]) 
    o = np.random.permutation(X.shape[0])
    
    X_train, X_test = np.split(np.take(X,o,axis=0), [i])
    y_train, y_test = np.split(np.take(y,o), [i])
    return X_train, X_test, y_train, y_test

# slice the data into variable for training
X_train, X_test, y_train, y_test = train_test_split(X_num, y_num, test_size=0.2)

[[   0   25 1937 ...    0   10    0]
 [   0  324 4122 ...    0   15    4]
 [   0   15  399 ...    0   12    3]
 ...
 [   1   78  783 ...   28   11    0]
 [   4  135  517 ...   44   11    4]
 [   4  203  823 ...   71   12    0]]
[1. 1. 1. ... 0. 0. 0.]


In [4]:
class SVM:
  def __init__(self, kernel='linear', C=10000.0, max_iter=100, degree=2, gamma=1):
    self.kernel = {'poly'  : lambda x,y: np.dot(x, y.T)**degree,
                   'rbf'   : lambda x,y: np.exp(-gamma*np.sum((y - x[:,np.newaxis])**2, axis=-1)),
                   'linear': lambda x,y: np.dot(x, y.T)}[kernel]
    self.C = C
    self.max_iter = max_iter

  def restrict_to_square(self, t, v0, u):
    t = (np.clip(v0 + t*u, 0, self.C) - v0)[1]/u[1]
    return (np.clip(v0 + t*u, 0, self.C) - v0)[0]/u[0]

  def fit(self, X, y):
    self.X = X.copy() # store for decision function
    self.y = y * 2 - 1 # convert classes 0 and 1 to -1 and +1 , store
    self.lambdas = np.zeros_like(self.y, dtype=float) # dual variables , all zeros satisfy eq.(1 b)
    self.K = self.kernel(self.X, self.X) * self.y[:,np.newaxis] * self.y # eq.(4)
    
    iters = 0
    for _ in range(self.max_iter):
      iters += 1
      for idxM in range(len(self.lambdas)):                                       # iterate all lambda_M
        idxL = np.random.randint(0, len(self.lambdas))                            # choose randomly lambda_L
        Q = self.K[[[idxM, idxM], [idxL, idxL]], [[idxM, idxL], [idxM, idxL]]]    # eq.(5c)
        v0 = self.lambdas[[idxM, idxL]]                                           # eq.(5a)
        k0 = 1 - np.sum(self.lambdas * self.K[[idxM, idxL]], axis=1)              # eq.(5b)
        u = np.array([-self.y[idxL], self.y[idxM]])                               # eq.(6b) 
        t_max = np.dot(k0, u) / (np.dot(np.dot(Q, u), u) + 1E-15) # eq.(7) , +1E -15 if idxM == idxL                
        self.lambdas[[idxM, idxL]] = v0 + u * self.restrict_to_square(t_max, v0, u) # eq.(8)
    
    logging.info("Convergence has reached after %s." % iters)
    idx, = np.nonzero(self.lambdas > 1E-15) # select indexes of support vectors
    self.b = np.mean((1.0 - np.sum(self.K[idx] * self.lambdas, axis=1)) * self.y[idx])
  
  def decision_function(self, X):
    return np.sum(self.kernel(X, self.X) * self.y * self.lambdas, axis=1) + self.b  # f from eq .(2)

  def predict(self, X):
    return (np.sign(self.decision_function(X)) + 1) // 2
  
# Convert one-hot representation into one column
def unhot(function): 

  def wrapper(actual, predicted):
      if len(actual.shape) > 1 and actual.shape[1] > 1:
          actual = actual.argmax(axis=1)
      if len(predicted.shape) > 1 and predicted.shape[1] > 1:
          predicted = predicted.argmax(axis=1)
      return function(actual, predicted)

  return wrapper

@unhot
def classification_error(actual, predicted):
    return (actual != predicted).sum() / float(actual.shape[0])

@unhot
def accuracy(actual, predicted):
    return 1.0 - classification_error(actual, predicted)

In [9]:
model = SVM(kernel= 'rbf', max_iter=100, C=0.5)
model.fit(X_train, y_train) 
predictions = model.predict(X_test)
print("Classification accuracy with RBF kernel: %s" % (accuracy(y_test, predictions)))

model = SVM(kernel= 'linear', max_iter=100, C=0.5)
model.fit(X_train, y_train) 
predictions = model.predict(X_test)
print("Classification accuracy with linear kernel: %s" % (accuracy(y_test, predictions)))

model = SVM(kernel= 'poly', max_iter=100, C=0.5)
model.fit(X_train, y_train) 
predictions = model.predict(X_test)
print("Classification accuracy with poly kernel: %s" % (accuracy(y_test, predictions)))

INFO:root:Convergence has reached after 100.


Classification accuracy with RBF kernel: [0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 1. 1. 1. 0. 0. 1. 0. 1. 1. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0.
 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1.
 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0. 0.]


## With PCA

In [8]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

pca = PCA(n_components=2)
scaler = StandardScaler()

X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

model = SVM(kernel= 'rbf', max_iter=100, C=0.5)
model.fit(X_train, y_train) 
predictions = model.predict(X_test)
print("Classification accuracy with RBF kernel + PCA: %s" % (accuracy(y_test, predictions)))

model = SVM(kernel= 'linear', max_iter=100, C=0.5)
model.fit(X_train, y_train) 
predictions = model.predict(X_test)
print("Classification accuracy with linear kernel + PCA: %s" % (accuracy(y_test, predictions)))

model = SVM(kernel= 'poly', max_iter=100, C=0.5)
model.fit(X_train, y_train) 
predictions = model.predict(X_test)
print("Classification accuracy with poly kernel + PCA: %s" % (accuracy(y_test, predictions)))

INFO:root:Convergence has reached after 100.


Classification accuracy with RBF kernel + PCA: 0.9163179916317992


INFO:root:Convergence has reached after 100.


Classification accuracy with linear kernel + PCA: 0.895397489539749


INFO:root:Convergence has reached after 100.


Classification accuracy with poly kernel + PCA: 0.8786610878661087
