In [None]:
# Attach Google Drive to the Colab VM's filesystem; a later cell reads the
# review corpus from a path under this mount point.
from google.colab import drive

drive.mount(
    '/content/drive'
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pickle
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

# Corpora needed by the stop-word list and the WordNet lemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')


def _clean_review(raw, lemmatizer):
    """Normalise one raw review into a lower-case, lemmatised, space-joined string.

    Parameters
    ----------
    raw : bytes or str
        The review as loaded from disk. Bytes are decoded as UTF-8 (undecodable
        bytes become U+FFFD, which the next step strips as a non-word char).
    lemmatizer : object with a ``lemmatize(word)`` method
        Applied to every whitespace-separated token.

    Returns
    -------
    str
        The cleaned document.
    """
    # Decode instead of calling str() on bytes: str(b"...") yields the repr
    # ("b'...'") in which newlines are the two characters '\' 'n', so after
    # stripping non-word characters a stray 'n' ends up glued to the next word.
    if isinstance(raw, bytes):
        text = raw.decode('utf-8', errors='replace')
    else:
        text = str(raw)

    # Replace every non-word character with a space.
    text = re.sub(r'\W', ' ', text)

    # Drop single letters surrounded by whitespace.
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)

    # Drop a single letter at the very start. The caret must be an unescaped
    # anchor; the escaped form r'\^' only matches a literal '^', which cannot
    # occur here because the \W substitution above already removed it.
    text = re.sub(r'^[a-zA-Z]\s+', ' ', text)

    # Collapse runs of whitespace.
    text = re.sub(r'\s+', ' ', text)

    text = text.lower()

    return ' '.join(lemmatizer.lemmatize(word) for word in text.split())


review_data = load_files(r"/content/drive/My Drive/movie_review")
raw_reviews, y = review_data.data, review_data.target

# Name kept for compatibility with the rest of the notebook; this is a
# WordNet lemmatizer, not a stemmer.
stemmer = WordNetLemmatizer()

documents = [_clean_review(raw, stemmer) for raw in raw_reviews]

# Bag-of-words counts restricted to the 1500 most frequent terms that occur in
# at least 5 documents and in at most 70% of them, with English stop words removed.
vectorizer = CountVectorizer(max_features=1500, min_df=5, max_df=0.7, stop_words=stopwords.words('english'))
term_counts = vectorizer.fit_transform(documents)

# Re-weight the counts as TF-IDF; the transformer accepts the sparse count
# matrix directly, so only the final feature matrix is densified.
tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(term_counts).toarray()

# NOTE(review): the vocabulary and IDF weights above are fitted on all
# documents before the split, so test documents influence the features.
# Rows are split in their existing order (no additional shuffling here).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
# L2-regularised logistic regression trained with full-batch gradient descent.
#
# Objective (sigma is the logistic function, m the number of training samples):
#   J(u) = (1/m) * sum_i [ -y_i*log(sigma(x_i.u)) - (1 - y_i)*log(1 - sigma(x_i.u)) ]
#          + (ramda/2) * ||u||^2
# Its gradient is (1/m) * X^T (sigma(X u) - y) + ramda * u, which gives the update
#   u <- (1 - learning_rate*ramda) * u - learning_rate * (1/m) * X^T (sigma(X u) - y)

ramda = 10            # L2 regularisation strength
learning_rate = 0.01
n_iterations = 2000   # number of full-batch weight updates

train_features = np.asarray(X_train, dtype=float)
train_labels = np.asarray(y_train, dtype=float)

# m normalises the summed per-sample gradient, so it must be the number of
# training samples (rows), not the number of features (columns).
m, n_features = train_features.shape

u0 = np.ones(n_features)                        # current weights, initialised to 1
u_space = np.empty((n_iterations, n_features))  # row t = weights after update t

for step in range(n_iterations):
    # Predicted probability of class 1 for every training sample.
    predicted = 1 / (1 + np.exp(-(train_features @ u0)))

    # Mean gradient of the cross-entropy term over the whole batch.
    gradient_u0 = train_features.T @ (predicted - train_labels) / m

    # Weight decay from the L2 penalty, then the gradient step.
    u0 = (1 - learning_rate * ramda) * u0 - learning_rate * gradient_u0
    u_space[step] = u0

# Flat history of every weight after every update (kept for compatibility);
# its length follows the actual feature count instead of a hard-coded 1500.
u0_space = u_space.ravel()

# The gradient accumulator is left zeroed, as it was after each epoch before.
gradient_u0 = np.zeros(n_features)

In [None]:
def _regularized_log_loss(weights, features, labels, reg_strength):
    """Regularised cross-entropy cost for each weight vector in a history.

    Parameters
    ----------
    weights : ndarray, shape (n_iterations, n_features)
        One weight vector per row.
    features : ndarray, shape (n_samples, n_features)
    labels : ndarray, shape (n_samples,)
        Binary targets (0 or 1).
    reg_strength : float
        Coefficient of the (reg_strength / 2) * ||w||^2 penalty.

    Returns
    -------
    ndarray, shape (n_iterations,)
        Mean cross-entropy over the samples plus the L2 penalty, per row of
        ``weights``.
    """
    logits = features @ weights.T          # (n_samples, n_iterations)
    targets = labels[:, np.newaxis]

    # -log(sigmoid(z)) = logaddexp(0, -z) and -log(1 - sigmoid(z)) = logaddexp(0, z);
    # this form never evaluates log(0).
    per_sample = (targets * np.logaddexp(0, -logits)
                  + (1 - targets) * np.logaddexp(0, logits))

    # Average over the samples of *this* data set, then add the penalty once
    # per weight vector.
    penalty = (reg_strength / 2) * np.sum(weights ** 2, axis=1)
    return per_sample.mean(axis=0) + penalty


weight_history = np.asarray(u_space, dtype=float)

# Cost after every update, on the training and on the test split.
result_space = _regularized_log_loss(
    weight_history,
    np.asarray(X_train, dtype=float),
    np.asarray(y_train, dtype=float),
    ramda,
).tolist()

result1_space = _regularized_log_loss(
    weight_history,
    np.asarray(X_test, dtype=float),
    np.asarray(y_test, dtype=float),
    ramda,
).tolist()

x = range(len(weight_history))
plt.xlabel('t(iteration')
plt.ylabel('J(Cost function')
plt.plot(x, result_space, 'blue', label='training_loss')
plt.plot(x, result1_space, 'red', label='test_loss')
plt.legend(loc = 0)
plt.show()

In [None]:
def _accuracy_percent(weights, features, labels):
    """Classification accuracy (in percent) for each weight vector in a history.

    Parameters
    ----------
    weights : ndarray, shape (n_iterations, n_features)
        One weight vector per row.
    features : ndarray, shape (n_samples, n_features)
    labels : ndarray, shape (n_samples,)
        Binary targets (0 or 1).

    Returns
    -------
    ndarray, shape (n_iterations,)
        Percentage of samples whose predicted label equals the target, per
        row of ``weights``.
    """
    logits = features @ weights.T          # (n_samples, n_iterations)

    # Predict 1 when sigmoid(z) >= 1/2, which mathematically is z >= 0.
    predicted = (logits >= 0).astype(int)

    # Divide by the number of samples in *this* data set (mean over rows).
    return (predicted == labels[:, np.newaxis]).mean(axis=0) * 100


weight_history = np.asarray(u_space, dtype=float)

# Accuracy after every update, on the training and on the test split.
correct_space = _accuracy_percent(
    weight_history,
    np.asarray(X_train, dtype=float),
    np.asarray(y_train),
).tolist()

correct1_space = _accuracy_percent(
    weight_history,
    np.asarray(X_test, dtype=float),
    np.asarray(y_test),
).tolist()