In [None]:
# Mount Google Drive at /gdrive; the dataset is later read from /gdrive/MyDrive.
from google.colab import drive
drive.mount('/gdrive')
# Numerical / tabular libraries and the stdlib CSV reader used by the loader.
# NOTE(review): `pd` and `read_csv` are not referenced in the visible cells — confirm before removing.
import numpy as np
import pandas as pd
import csv
from pandas import read_csv
# Train/test splitting, the baseline classifier, plotting, and feature scaling.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
from sklearn import preprocessing

In [None]:
def load_dataset_online_news():
  """Load the Online News Popularity CSV and return a standardized train/test split.

  The file is read from the mounted Google Drive, its header row is skipped,
  and every field of every row is parsed as a float. Columns 0-57 form the
  feature vector and column 58 is the target value.

  Returns:
    X_train, X_test: feature arrays standardized with a StandardScaler that
      was fitted on the training split only.
    y_train, y_test: lists of float targets (the raw column-58 values).

  Raises:
    ValueError: if a field cannot be parsed as a float.
    IndexError: if a data row has fewer than 59 columns.
  """
  n_features = 58
  X = []
  y = []
  # newline='' is what the csv module documents for files passed to csv.reader.
  with open('/gdrive/MyDrive/OnlineNewsPopularity.csv', 'r', newline='') as online_news_sharing:
    online_news_sharing_reader = csv.reader(online_news_sharing)
    next(online_news_sharing_reader)  # skip the header row
    for row in online_news_sharing_reader:
      values = [float(field) for field in row]
      target = values[n_features]  # fails loudly on rows that are too short
      X.append(values[:n_features])
      y.append(target)

  # Fixed seed: repeated calls return the same 80/20 partition.
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

  # Scaling statistics come from the training split only and are then applied
  # unchanged to the test split.
  scaler = preprocessing.StandardScaler().fit(X_train)
  X_train = scaler.transform(X_train)
  X_test = scaler.transform(X_test)

  return X_train, X_test, y_train, y_test

In [None]:
def data_preprocessing(X_train, X_test, y_train, y_test, threshold, left_trunc_point = float("-inf"), right_trunc_point = float("inf")):
  """Truncate the training set, binarize the targets, and prepend a bias column.

  A training sample is kept only when its target lies strictly between
  left_trunc_point and right_trunc_point; the test set is never truncated.
  Each retained target becomes 1 when it is greater than `threshold` and 0
  otherwise. A leading column of ones is added to both feature matrices.

  Returns:
    (X_train_trunc, y_train_trunc, X_test_processed, y_test_processed): the
    feature matrices are numpy arrays, the labels are lists of 0/1 ints.
  """
  # Positions of the training samples that survive truncation
  kept = [idx for idx in range(len(y_train))
          if left_trunc_point < y_train[idx] < right_trunc_point]

  train_features = np.array([X_train[idx] for idx in kept])
  train_labels = [int(y_train[idx] > threshold) for idx in kept]

  # Bias column for the truncated training set
  train_bias = np.ones(train_features.shape[0])
  train_features = np.column_stack((train_bias, train_features))

  # The test set is only labeled and given a bias column
  test_labels = [int(y_test[idx] > threshold) for idx in range(len(y_test))]
  test_features = np.array(X_test)
  test_bias = np.ones(test_features.shape[0])
  test_features = np.column_stack((test_bias, test_features))

  return train_features, train_labels, test_features, test_labels

In [None]:
def sigmoid(x):
  """Logistic function 1 / (1 + exp(-x)), evaluated without overflow.

  Uses the identity sigmoid(x) = exp(-log(1 + exp(-x))) with np.logaddexp,
  so large negative inputs return 0.0 instead of triggering an overflow
  warning inside np.exp. Infinite inputs map to exactly 0.0 / 1.0.

  Args:
    x: scalar or numpy array.

  Returns:
    A numpy float or array of the same shape with values in [0, 1].
  """
  return np.exp(-np.logaddexp(0, -x))

def cost_func(x, y, theta):
  """Binary cross-entropy of the logistic model sigmoid(x . theta).

  The per-sample terms y*log(p) + (1-y)*log(1-p) are summed, negated, and
  divided by the number of predictions.

  Args:
    x: feature matrix.
    y: labels; must expose `.T` (e.g. a numpy array).
    theta: parameter vector.
  """
  prob = sigmoid(np.dot(x, theta))
  labels = y.T
  log_likelihood = labels * np.log(prob) + (1 - labels) * np.log(1 - prob)
  return -log_likelihood.sum() / len(prob)

def compute_params(theta, x, left_trunc = float("-inf"), right_trunc = float("inf")):
  """Shift the truncation bounds by the linear score theta . x.

  Returns:
    (exp_noise, upper_lim, lower_lim) where
      upper_lim = right_trunc - theta . x,
      lower_lim = left_trunc - theta . x,
      exp_noise = sigmoid(upper_lim) + sigmoid(lower_lim).
  """
  score = np.dot(theta, x)
  lower_lim = left_trunc - score
  upper_lim = right_trunc - score
  exp_noise = sigmoid(upper_lim) + sigmoid(lower_lim)
  return exp_noise, upper_lim, lower_lim

def gradient(theta, x, y, left_trunc = float("-inf"), right_trunc = float("inf")):
  """Per-sample update direction for the truncated logistic model.

  The scalar coefficient is built from sigmoids of the shifted truncation
  bounds (see compute_params) and of -theta . x, and then scales x:
    y == 1:  sigmoid(upper) + sigmoid(max(lower, -theta . x)) - exp_noise
    else:    sigmoid(lower) + sigmoid(min(upper, -theta . x)) - exp_noise

  Returns:
    The coefficient multiplied by x (same shape as x).
  """
  exp_noise, upper_lim, lower_lim = compute_params(theta, x, left_trunc, right_trunc)
  neg_score = -np.dot(theta, x)

  if y == 1:
    bound_terms = sigmoid(upper_lim) + sigmoid(np.max((lower_lim, neg_score)))
  else:
    bound_terms = sigmoid(lower_lim) + sigmoid(np.min((upper_lim, neg_score)))

  return np.dot(bound_terms - exp_noise, x)

def left_trunc_percentile(y_train, y_train_trunc):
  """Return the logit of the fraction of training samples removed by truncation.

  With p = 1 - len(y_train_trunc) / len(y_train), the result is
  -log((1 - p) / p), i.e. log(p / (1 - p)).

  Boundary cases are handled explicitly instead of dividing by zero or
  taking log(0):
    * nothing was truncated (p == 0)    -> -inf
    * everything was truncated (p == 1) -> +inf

  Args:
    y_train: targets before truncation.
    y_train_trunc: targets that survived truncation.

  Raises:
    ValueError: if y_train is empty.
  """
  n_total = len(y_train)
  if n_total == 0:
    raise ValueError("y_train must not be empty")

  p = 1-(len(y_train_trunc)/n_total)
  if p <= 0:
    # No sample removed: the matching bound is "no left truncation".
    return float("-inf")
  if p >= 1:
    return float("inf")
  return -np.log((1-p)/p)


def test_accuracy(theta_cur, X_test_processed, y_test_processed, left_trunc_cur):
  """Fraction of test samples classified correctly by the truncated model.

  For each sample the score theta . x + 0.5 * exp_noise is passed through
  sigmoid, where exp_noise comes from compute_params called with only the
  left truncation bound (the right bound keeps its default). A sample is
  predicted as class 1 when that value exceeds 0.5.

  Returns:
    1 - (number of mismatches) / (number of samples).
  """
  probs = np.array([
    sigmoid(np.dot(theta_cur, sample)
            + 0.5 * compute_params(theta_cur, sample, left_trunc_cur)[0])
    for sample in X_test_processed
  ])
  predicted = (probs > 0.5).astype(int)
  mismatches = abs(predicted - y_test_processed)
  return 1 - np.sum(mismatches) / len(mismatches)

def Logistic_SGD(x, y, max_iter = 500, learning_rate = 0.0003, left_trunc = float("-inf"), right_trunc = float("inf"), random_state = None):
  """Fit the truncated logistic model with per-sample stochastic updates.

  Each epoch visits every sample once in a fresh random order and applies
  theta <- theta + lr * gradient(theta, x_i, y_i, left_trunc, right_trunc).

  Args:
    x: 2-D feature matrix (samples x features); not modified.
    y: labels aligned with the rows of x; not modified.
    max_iter: number of epochs.
    learning_rate: base step size. It is used unchanged for the first 20
      epochs and then decays as learning_rate * 20 / epoch.
    left_trunc, right_trunc: truncation bounds forwarded to `gradient`.
    random_state: optional seed for the initial weights and the shuffling.
      When None, numpy's global random state is used.

  Returns:
    The fitted parameter vector (one entry per feature column).
  """
  x = np.asarray(x)
  y = np.asarray(y)
  rng = np.random if random_state is None else np.random.RandomState(random_state)

  # Random initial weights drawn uniformly from [0, 1)
  theta = rng.rand(x.shape[1])

  for epoch in range(max_iter):
    # Always derive the step from the base rate. Multiplying the running rate
    # by 20/epoch every epoch would compound and shrink it to effectively zero
    # within a few dozen epochs.
    lr = learning_rate if epoch < 20 else learning_rate * 20 / epoch

    # A single permutation keeps samples and labels aligned without copying
    # or shuffling the arrays themselves.
    for index in rng.permutation(len(x)):
      G = gradient(theta, x[index], y[index], left_trunc, right_trunc)
      theta = theta + lr * G

  return theta

In [None]:
# Test accuracy of both models for each left truncation point.
SGD_acc = []
tradition_acc = []

# Settings shared by every run.
threshold = 10000
max_iter = 100
lr = 0.0003

# The split is deterministic, so the data is loaded and scaled once instead of
# once per truncation point.
X_train, X_test, y_train, y_test = load_dataset_online_news()

for i in range(7):
  left_trunc_point = i*1000+1000
  X_train_trunc, y_train_trunc, X_test_processed, y_test_processed = data_preprocessing(X_train, X_test, y_train, y_test, threshold, left_trunc_point)
  l_trunc = left_trunc_percentile(y_train, y_train_trunc)

  # Truncation-aware model
  Theta = Logistic_SGD(X_train_trunc, y_train_trunc, max_iter = max_iter, learning_rate = lr, left_trunc = l_trunc)
  print(Theta)
  sgd_accuracy = test_accuracy(Theta, X_test_processed, y_test_processed, l_trunc)
  print(sgd_accuracy)
  SGD_acc.append(sgd_accuracy)

  # Baseline: scikit-learn logistic regression fitted on the same truncated data
  logi_model = LogisticRegression()
  pred = logi_model.fit(X_train_trunc, y_train_trunc).predict(X_test_processed)
  baseline_accuracy = 1-np.sum(abs(pred-y_test_processed))/len(X_test_processed)
  print(baseline_accuracy)
  tradition_acc.append(baseline_accuracy)


In [None]:
# Accuracy of both models against the left truncation points 1000, 2000, ..., 7000.
x_axis = np.arange(1000, 8000, 1000)
y1 = SGD_acc
y2 = tradition_acc

fig, ax = plt.subplots()
ax.plot(x_axis, y1, label='Truncated Logistic Regression', color='red', marker='s')
ax.plot(x_axis, y2, label='Standard Logistic Regression', color='blue', marker='o')
ax.set_xlabel('Truncation Parameter C', fontsize=14)
ax.set_ylabel(r'Testset Accuracy', fontsize=14)
ax.grid(alpha=0.4, linestyle=':')
ax.legend()

plt.show()