In [1]:
def read_vocabulary_aclImdb(file_name):
    ID = 0
    file = open(file_name, "r", encoding='utf8')
    vocabulary = []

    for word in file:
        vocabulary.append(ID)
        ID += 1

    file.close()
    return vocabulary
    
def read_dataset_aclImdb(file_name):
    pos_examples = []
    neg_examples = []

    file = open(file_name, "r", encoding='utf8')
    
    content = file.read()
    content_list = content.split("\n")
    content_list.remove(content_list[len(content_list)-1])
    
    file.close()
    
    for review in tqdm(content_list, desc = "Loading..."):

        IDs = []

        index = int(review.index(" "))
        # first number is the rank of the current review
        rank = int(review[0:index])
        review = review[index + 1:]

        data = review.split(" ")
        for d in data:
            sep = d.split(":")
            IDs.append(int(sep[0]))

        if rank >= 7:
            pos_examples.append(IDs)
        elif rank <= 4:
            neg_examples.append(IDs)
       
    return pos_examples, neg_examples

def convert_data(data, c):
    x = []
    y = []
    for example in tqdm(data):
        ex = []
        for ID in vocabulary:
            if ID in example:
                ex.append(1)
            else:
                ex.append(0)
        x.append(ex)
        y.append(c)
        
        
    return x, y

In [2]:
from tqdm import tqdm
from math import log2

class Information_Gain:

    def __init__(self, vocabulary, positive_examples, negative_examples):
        self.vocabulary = vocabulary
        self.positive_examples = positive_examples
        self.negative_examples = negative_examples
        self.H_C = 1.0
        self.features = []


    def IG(self, X):
        """pretend that each word exists at least once in both positive
        and negative examples, so it will not appear a math domain error in 
        log 2 function while calculating entropy"""
        
        X_0 = 2
        X_1 = 2
        C_0_X_0 = 1
        C_1_X_0 = 1
        C_0_X_1 = 1
        C_1_X_1 = 1
        
        for example in self.positive_examples:
            if X in example:
                X_1 += 1
                C_1_X_1 += 1
            else:
                X_0 += 1
                C_1_X_0 += 1
                
        for example in self.negative_examples:
            if X in example:
                X_1 += 1
                C_0_X_1 += 1
            else:
                X_0 += 1
                C_0_X_0 += 1
                
        p_X_1 = X_1 / (len(self.positive_examples)+len(self.negative_examples))
        p_X_0 = 1.0 - p_X_1
        
        p_C_0_X_1 = C_0_X_1 / X_1
        p_C_1_X_1 = C_1_X_1 / X_1
        p_C_0_X_0 = C_0_X_0 / X_0
        p_C_1_X_0 = C_1_X_0 / X_0
        
        H_C_X_1 = -(p_C_0_X_1*log2(p_C_0_X_1) + p_C_1_X_1*log2(p_C_1_X_1))
        H_C_X_0 = -(p_C_0_X_0*log2(p_C_0_X_0) + p_C_1_X_0*log2(p_C_1_X_0))
        
        S = p_X_0*H_C_X_0 + p_X_1*H_C_X_1
        
        return self.H_C - S
    
    def myFunc(self, e):
        return e[1]
    
    def calculate_IG(self, n):
        for X in tqdm(self.vocabulary[:n], desc = "Calculating IG..."):
            IG_X = [X, self.IG(X)]
            self.features.append(IG_X)

        self.features.sort(key=self.myFunc, reverse=True)
    
    def get_m_features(self, m): 
        return [feature[0] for feature in self.features[:m]]

In [3]:
vocabulary_file = "aclImdb//imdb.vocab"
vocabulary = read_vocabulary_aclImdb(vocabulary_file)

In [4]:
training_file = "aclImdb//train//labeledBow.feat"

positive_examples, negative_examples = read_dataset_aclImdb(training_file)


Loading...: 100%|██████| 25000/25000 [00:04<00:00, 5494.34it/s]


In [5]:
testing_file = "aclImdb//test//labeledBow.feat"

testing_positive_examples, testing_negative_examples = read_dataset_aclImdb(testing_file)

Loading...: 100%|██████| 25000/25000 [00:04<00:00, 5461.12it/s]


In [6]:
I_G = Information_Gain(vocabulary, positive_examples, negative_examples)

In [7]:
I_G.calculate_IG(5000)

Calculating IG...: 100%|███| 5000/5000 [08:50<00:00,  9.42it/s]


In [8]:
vocabulary = I_G.get_m_features(1000)

In [9]:
x_pos, y_pos = convert_data(positive_examples, 1)
x_neg, y_neg = convert_data(negative_examples, 0)

    
x_train = x_pos[:int(len(positive_examples)*80/100)] + x_neg[:int(len(positive_examples)*80/100)]
y_train = y_pos[:int(len(positive_examples)*80/100)] + y_neg[:int(len(positive_examples)*80/100)]
x_dev = x_pos[int(len(positive_examples)*80/100):] + x_neg[int(len(positive_examples)*80/100):]
y_dev = y_pos[int(len(positive_examples)*80/100):] + y_neg[int(len(positive_examples)*80/100):]

x_pos, y_pos = convert_data(testing_positive_examples, 1)
x_neg, y_neg = convert_data(testing_negative_examples, 0)

x_test = x_pos + x_neg
y_test = y_pos + y_neg

100%|███████████████████| 12500/12500 [00:47<00:00, 265.73it/s]
100%|███████████████████| 12500/12500 [00:47<00:00, 264.14it/s]
100%|███████████████████| 12500/12500 [00:44<00:00, 279.51it/s]
100%|███████████████████| 12500/12500 [00:45<00:00, 276.75it/s]


In [16]:
import numpy as np
import random
random.seed = 1

class LogisticRegression:
    def __init__(self):  
        self.w = None
        self.best_normalization_term = 0
        self.threshold = 0.5
        self.max_epochs = 5
        self.lr_rates = [0.01, 0.001, 0.0001, 0.00001]
        
    
    # is basically the probability of review to be positive
    def __sigmoid(self, x, weights):
        s = 0
        for i in range(len(x)):
            s += weights[i]*x[i]
            
        s += weights[len(weights)-1]
        return 1.0 / (1.0 + np.exp(-s))
        
        
    def __gradient_descent(self, weights, lr_rate, normalization_term, x, y):
        norm_sum = 0
        
        for i in range(len(weights)):
            norm_sum += weights[i]**2
            
        accs =[]
        for j in range(len(x)):
            probCPlus = self.__sigmoid(x[j], weights)
            accs.append(y[j] - probCPlus)
    
        for l in range(len(weights)):
            if l == len(weights) - 1:
                weights[l] = (1-2*normalization_term*lr_rate)*weights[l] + lr_rate*sum(accs)
            else:
                anadelta = 0
                for j in range(len(x)):
                    anadelta += accs[j] * x[j][l]
                weights[l] = (1-2*normalization_term*lr_rate)*weights[l] + lr_rate*anadelta
            
        return weights
    
    
    def __calculate_w(self, x, y, n_t, lr):
        batch_size = 64
        total_batches = len(x) // batch_size
        
        batch_no = 0
        s = 0
        
        for epoch in tqdm(range(self.max_epochs), desc = "Norm_Term=" + str(n_t) + 
                                 " Learn_rate=" + str(lr)):
            
            for batch_no in range(total_batches):
                x_batch = x[batch_no*batch_size:(batch_no+1)*batch_size]
                y_batch = y[batch_no*batch_size:(batch_no+1)*batch_size]
                
                self.w = self.__gradient_descent(self.w, lr, n_t, x_batch, y_batch)
    
            if (batch_no+1)*batch_size < len(x):
                self.w = self.__gradient_descent(self.w, lr, n_t,
                                     x[(batch_no+1)*batch_size:len(x)], 
                                     y[(batch_no+1)*batch_size:len(x)])
            
            norm_sum = 0
            s = 0
            
            for i in range(len(self.w)):
                norm_sum += self.w[i]**2
            
            for i in range(len(x)):
                p = self.__sigmoid(x[i], self.w)
                s += y[i]*np.log(p) + (1-y[i])*np.log(1 - p) - n_t*norm_sum
            
            if s == 0:
                break
    
    
    def fit(self, x, y):
        
        self.w  = [random.uniform(-1,1) for i in range(len(x_train[0]) + 1)]
        for lr in self.lr_rates:
            self.__calculate_w(x, y, self.best_normalization_term, lr)
    
                
    def fit_hyperparameters(self, x_train, y_train, x_dev, y_dev):
        max_epochs = 10
        
        normalization_terms = [0.05, 0.1, 0.15, 0.2]
        
        best_dev_acc = -1

        for n_t in normalization_terms:
            self.w  = [random.uniform(-1,1) for i in range(len(x_train[0]) + 1)]
            for lr in self.lr_rates:
                self.__calculate_w(x_train, y_train, n_t, lr)
                    
            current_dev_acc = self.accurate(x_dev, y_dev, False)
                    
            if current_dev_acc > best_dev_acc:
                best_dev_acc = current_dev_acc
                self.best_normalization_term = n_t
                
                
            
    def accurate(self, x, y, All = False):
        total = 0
        TP = 0
        FP = 0
        FN = 0
        for i in range(len(x)):
            prob = self.__sigmoid(x[i], self.w)
            clf = -1
            if prob >= self.threshold:
                clf = 1
            elif prob <= 1 - self.threshold:
                clf = 0
            if int(y[i]) == clf:
                total+=1
            
            if clf == 1:
                if int(y[i]) == 1:
                    TP += 1
                else:
                    FP += 1
            else:
                if int(y[i]) == 1:
                    FN += 1        
        
        if All:
            Precision = TP / (TP + FP)
            Recall = TP / (TP + FN)
            F1 = 2*Precision*Recall / (Precision+Recall)
            return total/len(x), Precision, Recall, F1
        else: 
            return total/len(x)
        
        
    def predict(self, x):
        predictions = []
        
        for i in range(len(x)):
            prob = self.__sigmoid(x[i], self.w)
            clf = -1
            if prob >= self.threshold:
                clf = 1
            elif prob <= 1 - self.threshold:
                clf = 0
            
            predictions.append(clf)
        
        return predictions

In [17]:
lr = LogisticRegression()
lr.fit_hyperparameters(x_train, y_train, x_dev, y_dev)


Norm_Term=0.05 Learn_rate=0.01: 100%|█| 5/5 [01:58<00:00, 23.72
Norm_Term=0.05 Learn_rate=0.001: 100%|█| 5/5 [02:01<00:00, 24.2
Norm_Term=0.05 Learn_rate=0.0001: 100%|█| 5/5 [02:09<00:00, 25.
Norm_Term=0.05 Learn_rate=1e-05: 100%|█| 5/5 [03:28<00:00, 41.7
Norm_Term=0.1 Learn_rate=0.01: 100%|█| 5/5 [03:49<00:00, 45.81s
Norm_Term=0.1 Learn_rate=0.001: 100%|█| 5/5 [03:47<00:00, 45.58
Norm_Term=0.1 Learn_rate=0.0001: 100%|█| 5/5 [02:58<00:00, 35.7
Norm_Term=0.1 Learn_rate=1e-05: 100%|█| 5/5 [02:14<00:00, 26.91
Norm_Term=0.15 Learn_rate=0.01: 100%|█| 5/5 [02:43<00:00, 32.79
Norm_Term=0.15 Learn_rate=0.001: 100%|█| 5/5 [03:39<00:00, 43.9
Norm_Term=0.15 Learn_rate=0.0001: 100%|█| 5/5 [03:37<00:00, 43.
Norm_Term=0.15 Learn_rate=1e-05: 100%|█| 5/5 [03:34<00:00, 42.9
Norm_Term=0.2 Learn_rate=0.01: 100%|█| 5/5 [03:29<00:00, 41.98s
Norm_Term=0.2 Learn_rate=0.001: 100%|█| 5/5 [03:45<00:00, 45.17
Norm_Term=0.2 Learn_rate=0.0001: 100%|█| 5/5 [03:47<00:00, 45.5
Norm_Term=0.2 Learn_rate=1e-05: 100%|█| 

In [21]:
print(lr.best_normalization_term)

0.1


In [25]:
x_train_length = np.arange(2000, len(x_train)+1, 2000)

train_accuracy = []
dev_accuracy = []
test_accuracy = []
precision = []
recall = []
F1 = []

for length in x_train_length:
    lr.fit(x_train[:length], y_train[:length])
    
    train_accuracy.append(lr.accurate(x_train[:length], y_train[:length]))
    dev_accuracy.append(lr.accurate(x_dev, y_dev))
    
    curr_test_acc, curr_precision, curr_recall, curr_F1 = lr.accurate(x_test, y_test, True)
    
    test_accuracy.append(curr_test_acc)
    precision.append(curr_precision)
    recall.append(curr_recall)
    F1.append(curr_F1)

Norm_Term=0.1 Learn_rate=0.01: 100%|█| 5/5 [00:22<00:00,  4.53s
Norm_Term=0.1 Learn_rate=0.001: 100%|█| 5/5 [00:21<00:00,  4.38
Norm_Term=0.1 Learn_rate=0.0001: 100%|█| 5/5 [00:22<00:00,  4.5
Norm_Term=0.1 Learn_rate=1e-05: 100%|█| 5/5 [00:22<00:00,  4.59
Norm_Term=0.1 Learn_rate=0.01: 100%|█| 5/5 [00:45<00:00,  9.14s
Norm_Term=0.1 Learn_rate=0.001: 100%|█| 5/5 [00:45<00:00,  9.12
Norm_Term=0.1 Learn_rate=0.0001: 100%|█| 5/5 [00:45<00:00,  9.1
Norm_Term=0.1 Learn_rate=1e-05: 100%|█| 5/5 [00:43<00:00,  8.63
Norm_Term=0.1 Learn_rate=0.01: 100%|█| 5/5 [01:08<00:00, 13.65s
Norm_Term=0.1 Learn_rate=0.001: 100%|█| 5/5 [01:08<00:00, 13.72
Norm_Term=0.1 Learn_rate=0.0001: 100%|█| 5/5 [01:08<00:00, 13.7
Norm_Term=0.1 Learn_rate=1e-05: 100%|█| 5/5 [01:08<00:00, 13.70
Norm_Term=0.1 Learn_rate=0.01: 100%|█| 5/5 [01:30<00:00, 18.19s
Norm_Term=0.1 Learn_rate=0.001: 100%|█| 5/5 [01:30<00:00, 18.18
Norm_Term=0.1 Learn_rate=0.0001: 100%|█| 5/5 [01:31<00:00, 18.2
Norm_Term=0.1 Learn_rate=1e-05: 100%|█| 

In [26]:
import matplotlib.pyplot as plt

plt.figure(figsize=(12,8))
ax = plt.subplot()
plt.plot(x_train_length, train_accuracy, 'g-o', label="TRAIN_ACCURACY")
plt.plot(x_train_length, test_accuracy, 'r-o', label="TEST_ACCURACY")

plt.ylabel("ACCURACY\n")
plt.xlabel("\n NUM OF TRAINING EXAMPLES")

plt.ylim(0,1)

plt.legend()
plt.savefig("curves//train_test_accuracy.png")
plt.show()

In [28]:
import matplotlib.pyplot as plt

plt.figure(figsize=(12,8))

ax = plt.subplot()

plt.plot(x_train_length, precision, 'g-o', label = "PRECISION")
plt.plot(x_train_length, recall, 'r-o', label = "RECALL")
plt.plot(x_train_length, F1, 'b-o', label = "F1")

plt.xlabel("\n NUM OF TRAINING EXAMPLES")

plt.ylim(0,1)

plt.legend()
plt.savefig("curves//Recall_Precision_F1.png")
plt.show()

In [29]:
from prettytable import PrettyTable

accuracy_table = PrettyTable(["Training examples", "Train Accuracy", "Dev Accuracy", "Test Accuracy"])

for i in range(len(x_train_length)):
    accuracy_table.add_row([x_train_length[i], train_accuracy[i], dev_accuracy[i], test_accuracy[i]])

print(accuracy_table)

file = open("tables//accuracy_table.txt", "w")
file.write(str(accuracy_table))
file.close()

+-------------------+--------------------+--------------+---------------+
| Training examples |   Train Accuracy   | Dev Accuracy | Test Accuracy |
+-------------------+--------------------+--------------+---------------+
|        2000       |        1.0         |     0.5      |      0.5      |
|        4000       |        1.0         |     0.5      |    0.50004    |
|        6000       |        1.0         |     0.5      |      0.5      |
|        8000       |        1.0         |     0.5      |      0.5      |
|       10000       |        1.0         |     0.5      |      0.5      |
|       12000       |        0.89        |    0.728     |    0.72916    |
|       14000       | 0.8582142857142857 |    0.7984    |     0.8062    |
|       16000       |      0.842875      |    0.827     |    0.82652    |
|       18000       | 0.8353333333333334 |    0.8318    |     0.8276    |
|       20000       |       0.8311       |    0.8242    |    0.82088    |
+-------------------+-----------------

In [30]:
from prettytable import PrettyTable

precision_recall_F1_table = PrettyTable(["Training examples", "Precision", "Recall", "F1"])

for i in range(len(x_train_length)):
    precision_recall_F1_table.add_row([x_train_length[i], precision[i], recall[i], F1[i]])

print(precision_recall_F1_table)

file = open("tables//precision_recall_F1_table.txt", "w")
file.write(str(precision_recall_F1_table))
file.close()

+-------------------+--------------------+---------+--------------------+
| Training examples |     Precision      |  Recall |         F1         |
+-------------------+--------------------+---------+--------------------+
|        2000       |        0.5         | 0.99984 | 0.6666311073181139 |
|        4000       | 0.500020000800032  |   1.0   | 0.6666844449185311 |
|        6000       |        0.5         |   1.0   | 0.6666666666666666 |
|        8000       |        0.5         |   1.0   | 0.6666666666666666 |
|       10000       |        0.5         |   1.0   | 0.6666666666666666 |
|       12000       | 0.6545954989475956 | 0.97032 | 0.7817847819781495 |
|       14000       | 0.7498531235720347 | 0.91896 | 0.8258384557316942 |
|       16000       | 0.8056158742044178 | 0.86072 | 0.832256816863276  |
|       18000       | 0.8430222817892444 | 0.80512 | 0.8236353220394467 |
|       20000       | 0.8739511467462241 | 0.74992 | 0.8071988288986481 |
+-------------------+-----------------

In [31]:
# fit our training data to sklearn.LogisticRegression model, in order to compare the results with ours.

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

log = LogisticRegression()
log.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [32]:
# sklearn.LogisticRegression report 
print(classification_report(y_test, log.predict(x_test)))

              precision    recall  f1-score   support

           0       0.88      0.87      0.87     12500
           1       0.87      0.88      0.87     12500

    accuracy                           0.87     25000
   macro avg       0.87      0.87      0.87     25000
weighted avg       0.87      0.87      0.87     25000



In [33]:
# our report 
print(classification_report(y_test, lr.predict(x_test)))

              precision    recall  f1-score   support

           0       0.78      0.89      0.83     12500
           1       0.87      0.75      0.81     12500

    accuracy                           0.82     25000
   macro avg       0.83      0.82      0.82     25000
weighted avg       0.83      0.82      0.82     25000

