In [8]:
from utils import import_data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('whitegrid')

In [9]:
dataset_path = "data"
dataset_version = "fake-v1.0"

fake_dataset = import_data(dataset_path, dataset_version).T

In [10]:
fake_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1194 entries, 0 to 1193
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   user_media_count          1194 non-null   float64
 1   user_follower_count       1194 non-null   float64
 2   user_following_count      1194 non-null   float64
 3   user_has_profil_pic       1194 non-null   float64
 4   user_is_private           1194 non-null   float64
 5   follower_following_ratio  1194 non-null   float64
 6   user_biography_length     1194 non-null   float64
 7   username_length           1194 non-null   float64
 8   username_digit_count      1194 non-null   float64
 9   is_fake                   1194 non-null   float64
dtypes: float64(10)
memory usage: 93.4 KB


In [11]:
fake_dataset.describe()

Unnamed: 0,user_media_count,user_follower_count,user_following_count,user_has_profil_pic,user_is_private,follower_following_ratio,user_biography_length,username_length,username_digit_count,is_fake
count,1194.0,1194.0,1194.0,1194.0,1194.0,1194.0,1194.0,1194.0,1194.0,1194.0
mean,57.596315,369.095477,744.261307,0.922948,0.657454,0.812195,22.847571,11.123953,0.495812,0.167504
std,107.402157,372.775741,1032.52642,0.266786,0.47476,0.804559,33.492868,2.998135,1.21201,0.373582
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0
25%,3.0,152.0,267.0,1.0,0.0,0.481885,0.0,9.0,0.0,0.0
50%,20.0,304.0,449.0,1.0,1.0,0.839351,7.0,11.0,0.0,0.0
75%,67.0,481.0,711.0,1.0,1.0,1.003664,33.0,13.0,0.0,0.0
max,1058.0,4492.0,7497.0,1.0,1.0,16.8,150.0,30.0,10.0,1.0


In [12]:
from sklearn.model_selection import train_test_split

X = fake_dataset.drop('is_fake', axis=1)
y = fake_dataset.is_fake

# Convert y to {-1, 1}
y = (y * 2) - 1
print(y)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1111)

0       1.0
1       1.0
2       1.0
3       1.0
4       1.0
       ... 
1189   -1.0
1190   -1.0
1191   -1.0
1192   -1.0
1193   -1.0
Name: is_fake, Length: 1194, dtype: float64


In [13]:
import logging
import scipy.spatial.distance as dist

class Linear(object):
    def __call__(self, x, y):
        return np.dot(x, y.T)

    def __repr__(self):
        return "Linear kernel"


class Poly(object):
    def __init__(self, degree=2):
        self.degree = degree

    def __call__(self, x, y):
        return np.dot(x, y.T) ** self.degree

    def __repr__(self):
        return "Poly kernel"


class RBF(object):
    def __init__(self, gamma=0.1):
        self.gamma = gamma

    def __call__(self, x, y):
        x = np.atleast_2d(x)
        y = np.atleast_2d(y)
        return np.exp(-self.gamma * dist.cdist(x, y) ** 2).flatten()

    def __repr__(self):
        return "RBF kernel"
    
class BaseEstimator:
    y_required = True
    fit_required = True

    def _setup_input(self, X, y=None):
        """Ensure inputs to an estimator are in the expected format.

        Ensures X and y are stored as numpy ndarrays by converting from an
        array-like object if necessary. Enables estimators to define whether
        they require a set of y target values or not with y_required, e.g.
        kmeans clustering requires no target labels and is fit against only X.

        Parameters
        ----------
        X : array-like
            Feature dataset.
        y : array-like
            Target values. By default is required, but if y_required = false
            then may be omitted.
        """
        if not isinstance(X, np.ndarray):
            X = np.array(X)

        if X.size == 0:
            raise ValueError("Got an empty matrix.")

        if X.ndim == 1:
            self.n_samples, self.n_features = 1, X.shape
        else:
            self.n_samples, self.n_features = X.shape[0], np.prod(X.shape[1:])

        self.X = X

        if self.y_required:
            if y is None:
                raise ValueError("Missed required argument y")

            if not isinstance(y, np.ndarray):
                y = np.array(y)

            if y.size == 0:
                raise ValueError("The targets array must be no-empty.")

        self.y = y

    def fit(self, X, y=None):
        self._setup_input(X, y)

    def predict(self, X=None):
        if not isinstance(X, np.ndarray):
            X = np.array(X)

        if self.X is not None or not self.fit_required:
            return self._predict(X)
        else:
            raise ValueError("You must call `fit` before `predict`")

    def _predict(self, X=None):
        raise NotImplementedError()


np.random.seed(9999)
class SVM(BaseEstimator):
    def __init__(self, C=1.0, kernel=None, tol=1e-3, max_iter=100):
        """Support vector machines implementation using simplified SMO optimization.

        Parameters
        ----------
        C : float, default 1.0
        kernel : Kernel object
        tol : float , default 1e-3
        max_iter : int, default 100
        """
        self.C = C
        self.tol = tol
        self.max_iter = max_iter
        if kernel is None:
            self.kernel = Linear()
        else:
            self.kernel = kernel

        self.b = 0
        self.alpha = None
        self.K = None

    def fit(self, X, y=None):
        self._setup_input(X, y)
        self.K = np.zeros((self.n_samples, self.n_samples))
        for i in range(self.n_samples):
            self.K[:, i] = self.kernel(self.X, self.X[i, :])
        self.alpha = np.zeros(self.n_samples)
        self.sv_idx = np.arange(0, self.n_samples)
        return self._train()

    def _train(self):
        iters = 0
        while iters < self.max_iter:
            iters += 1
            alpha_prev = np.copy(self.alpha)

            for j in range(self.n_samples):
                # Pick random i
                i = self.random_index(j)

                eta = 2.0 * self.K[i, j] - self.K[i, i] - self.K[j, j]
                if eta >= 0:
                    continue
                L, H = self._find_bounds(i, j)

                # Error for current examples
                e_i, e_j = self._error(i), self._error(j)

                # Save old alphas
                alpha_io, alpha_jo = self.alpha[i], self.alpha[j]

                # Update alpha
                self.alpha[j] -= (self.y[j] * (e_i - e_j)) / eta
                self.alpha[j] = self.clip(self.alpha[j], H, L)

                self.alpha[i] = self.alpha[i] + self.y[i] * self.y[j] * (alpha_jo - self.alpha[j])

                # Find intercept
                b1 = (
                    self.b - e_i - self.y[i] * (self.alpha[i] - alpha_io) * self.K[i, i]
                    - self.y[j] * (self.alpha[j] - alpha_jo) * self.K[i, j]
                )
                b2 = (
                    self.b - e_j - self.y[j] * (self.alpha[j] - alpha_jo) * self.K[j, j]
                    - self.y[i] * (self.alpha[i] - alpha_io) * self.K[i, j]
                )
                if 0 < self.alpha[i] < self.C:
                    self.b = b1
                elif 0 < self.alpha[j] < self.C:
                    self.b = b2
                else:
                    self.b = 0.5 * (b1 + b2)

            # Check convergence
            diff = np.linalg.norm(self.alpha - alpha_prev)
            if diff < self.tol:
                break
        logging.info("Convergence has reached after %s." % iters)

        # Save support vectors index
        self.sv_idx = np.where(self.alpha > 0)[0]

    def _predict(self, X=None):
        n = X.shape[0]
        result = np.zeros(n)
        for i in range(n):
            result[i] = np.sign(self._predict_row(X[i, :]))
        return result

    def _predict_row(self, X):
        k_v = self.kernel(self.X[self.sv_idx], X)
        return np.dot((self.alpha[self.sv_idx] * self.y[self.sv_idx]).T, k_v.T) + self.b

    def clip(self, alpha, H, L):
        if alpha > H:
            alpha = H
        if alpha < L:
            alpha = L
        return alpha

    def _error(self, i):
        """Error for single example."""
        return self._predict_row(self.X[i]) - self.y[i]

    def _find_bounds(self, i, j):
        """Find L and H such that L <= alpha <= H.
        Also, alpha must satisfy the constraint 0 <= αlpha <= C.
        """
        if self.y[i] != self.y[j]:
            L = max(0, self.alpha[j] - self.alpha[i])
            H = min(self.C, self.C - self.alpha[i] + self.alpha[j])
        else:
            L = max(0, self.alpha[i] + self.alpha[j] - self.C)
            H = min(self.C, self.alpha[i] + self.alpha[j])
        return L, H

    def random_index(self, z):
        i = z
        while i == z:
            i = np.random.randint(0, self.n_samples - 1)
        return i
    
def unhot(function):
    """Convert one-hot representation into one column."""

    def wrapper(actual, predicted):
        if len(actual.shape) > 1 and actual.shape[1] > 1:
            actual = actual.argmax(axis=1)
        if len(predicted.shape) > 1 and predicted.shape[1] > 1:
            predicted = predicted.argmax(axis=1)
        return function(actual, predicted)

    return wrapper

@unhot
def classification_error(actual, predicted):
    return (actual != predicted).sum() / float(actual.shape[0])


@unhot
def accuracy(actual, predicted):
    return 1.0 - classification_error(actual, predicted)

In [14]:
logging.basicConfig(level=logging.DEBUG)
from sklearn.datasets import make_classification

def classification():
    # Generate a random binary classification problem.
    # X, y = make_classification(
    #     n_samples=1200, n_features=10, n_informative=5, random_state=1111, n_classes=2, class_sep=1.75
    # )
  
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1111)

    for kernel in [RBF(gamma=0.1), Linear()]:
        model = SVM(max_iter=500, kernel=kernel, C=0.6)
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        print("Classification accuracy (%s): %s" % (kernel, accuracy(y_test, predictions)))


if __name__ == "__main__":
    classification()

INFO:root:Convergence has reached after 20.
INFO:root:Convergence has reached after 1.


Classification accuracy (RBF kernel): 0.8440111420612814
Classification accuracy (Linear kernel): 0.21727019498607247
