In [1]:
from google.colab import drive
drive.mount('/content/drive')

%cd /content/drive/MyDrive/IMDBText

import pandas as pd
imdb_train_data = pd.read_csv('train.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/IMDBText


In [6]:
pip install afinn

Collecting afinn
  Downloading afinn-0.1.tar.gz (52 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/52.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.6/52.6 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: afinn
  Building wheel for afinn (setup.py) ... [?25l[?25hdone
  Created wheel for afinn: filename=afinn-0.1-py3-none-any.whl size=53429 sha256=abe43e22fe830c4382f4f0573f7853ff1521addd8a34d5bfe1b4abd9eb5dfae5
  Stored in directory: /root/.cache/pip/wheels/b0/05/90/43f79196199a138fb486902fceca30a2d1b5228e6d2db8eb90
Successfully built afinn
Installing collected packages: afinn
Successfully installed afinn-0.1


In [22]:
import nltk
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('sentiwordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/sentiwordnet.zip.


True

In [19]:
import pandas as pd
import time
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import sentiwordnet as swn
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from nltk.corpus import wordnet
from afinn import Afinn
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
import string
import re

# Pre-Processing Text

In [9]:
# Function for text preprocessing
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Remove stopwords and punctuation
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in lemmatized_tokens if token.lower() not in stop_words and token not in string.punctuation]
    # Join tokens back into text
    preprocessed_text = ' '.join(filtered_tokens)
    return preprocessed_text



In [10]:
# Function to get wordnet POS tag from Penn Treebank POS tag
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return 'a'  # Adjective
    elif treebank_tag.startswith('V'):
        return 'v'  # Verb
    elif treebank_tag.startswith('N'):
        return 'n'  # Noun
    elif treebank_tag.startswith('R'):
        return 'r'  # Adverb
    else:
        return None

In [11]:

# Function to get sentiment score using SentiWordNet
def get_sentiment_score(word, pos_tag):
    if pos_tag:
        synsets = list(swn.senti_synsets(word, pos_tag))
        if synsets:
            # Take the average of positive and negative scores
            return (synsets[0].pos_score() - synsets[0].neg_score())
    return 0

#SentiWordNet

In [16]:
# Preprocess the text data
imdb_train_data['review'] = imdb_train_data['review'].apply(preprocess_text)


In [17]:
# Tokenize and tag parts of speech for each review
imdb_train_data['tokens'] = imdb_train_data['review'].apply(word_tokenize)
imdb_train_data['pos_tags'] = imdb_train_data['tokens'].apply(pos_tag)
imdb_train_data['pos_tags'] = imdb_train_data['pos_tags'].apply(lambda x: [(word, get_wordnet_pos(tag)) for (word, tag) in x])


In [29]:
# Start time
start_time = time.time()

# Calculate sentiment score for each review using SentiWordNet

imdb_train_data['sentiment_score'] = imdb_train_data['pos_tags'].apply(lambda x: sum(get_sentiment_score(word, pos_tag) for word, pos_tag in x))

# Convert sentiment score to binary sentiment label
imdb_train_data['predicted_sentiment'] = imdb_train_data['sentiment_score'].apply(lambda x: 'positive' if x > 0 else 'negative' if x < 0 else 'neutral')

# End time
end_time = time.time()

# Calculate computational time
training_time = end_time - start_time


In [24]:
# Load IMDb test data
imdb_test_data = pd.read_csv('test.csv')


In [25]:
# Preprocess the test data
imdb_test_data['review'] = imdb_test_data['review'].apply(preprocess_text)



In [26]:
# Tokenize and tag parts of speech for each review in the test data
imdb_test_data['tokens'] = imdb_test_data['review'].apply(word_tokenize)
imdb_test_data['pos_tags'] = imdb_test_data['tokens'].apply(pos_tag)
imdb_test_data['pos_tags'] = imdb_test_data['pos_tags'].apply(lambda x: [(word, get_wordnet_pos(tag)) for (word, tag) in x])



In [30]:
# Start time
start_time = time.time()

# Calculate sentiment score for each review in the test data using SentiWordNet
imdb_test_data['sentiment_score'] = imdb_test_data['pos_tags'].apply(lambda x: sum(get_sentiment_score(word, pos_tag) for word, pos_tag in x))

# Convert sentiment score to binary sentiment label for the test data
imdb_test_data['predicted_sentiment'] = imdb_test_data['sentiment_score'].apply(lambda x: 'positive' if x > 0 else 'negative' if x < 0 else 'neutral')
# End time
end_time = time.time()
# Calculate computational time
testing_time = end_time - start_time


In [31]:
# Evaluate the accuracy of the SentiWordNet lexicon-based approach
accuracy = accuracy_score(imdb_test_data['sentiment'], imdb_test_data['predicted_sentiment'])
print("Accuracy of SentiWordNet lexicon-based approach on IMDb test data:", accuracy)

print("Train time:", training_time, "seconds")

print("test time:", testing_time, "seconds")


Accuracy of SentiWordNet lexicon-based approach on IMDb test data: 0.63005
Train time: 108.77739357948303 seconds
test time: 70.90979218482971 seconds


#### **Accuracy of SentiWordNet on test data: 0.63005**

#AFINN

In [41]:
from afinn import Afinn
from sklearn.metrics import accuracy_score



# Instantiate the AFINN lexicon
afinn = Afinn()
# Start time
start_time = time.time()
# Calculate sentiment scores for each review in the test data using AFINN
imdb_test_data['sentiment_score'] = imdb_test_data['review'].apply(afinn.score)

# Convert sentiment scores to binary sentiment labels
imdb_test_data['predicted_sentiment'] = imdb_test_data['sentiment_score'].apply(lambda score: 'positive' if score > 0 else 'negative' if score < 0 else 'neutral')
# End time
end_time = time.time()
# Calculate computational time
test_time = end_time - start_time

In [42]:
# Evaluate the accuracy of the AFINN lexicon approach
accuracy = accuracy_score(imdb_test_data['sentiment'], imdb_test_data['predicted_sentiment'])
print("Accuracy of AFINN lexicon approach on IMDb test data:", accuracy)
print("test time:", test_time, "seconds")

Accuracy of AFINN lexicon approach on IMDb test data: 0.6829
test time: 41.70845556259155 seconds


#### **Accuracy of AFIN on test data: 0.68275**

#Semi-Supervised Learning Using Lexicon Method

In [34]:
# Calculate sentiment scores for labeled data
imdb_train_data['sentiment_score'] = imdb_train_data['review'].apply(afinn.score)


In [35]:
# Create TF-IDF vectors for labeled data
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(imdb_train_data['review'])
y_train = imdb_train_data['sentiment']

In [36]:
# Start time
start_time = time.time()
# Train a logistic regression classifier using labeled data and sentiment scores
classifier = LogisticRegression()
classifier.fit(X_train_tfidf, y_train)
# End time
end_time = time.time()
# Calculate computational time
training_time = end_time - start_time

In [37]:
# Preprocess the test data
imdb_test_data['review'] = imdb_test_data['review'].apply(preprocess_text)

# Create TF-IDF vectors for test data
X_test_tfidf = tfidf_vectorizer.transform(imdb_test_data['review'])

In [38]:
# Start time
start_time = time.time()
# Predict sentiment labels for test data
y_pred = classifier.predict(X_test_tfidf)
# End time
end_time = time.time()
# Calculate computational time
test_time = end_time - start_time


In [40]:
# Evaluate the accuracy of the classifier on test data
accuracy = accuracy_score(imdb_test_data['sentiment'], y_pred)
print("Accuracy of semi-supervised learning using lexicon-based methods on IMDb test data:", accuracy)
print("Train time:", training_time, "seconds")
print("test time:", test_time, "seconds")


Accuracy of semi-supervised learning using lexicon-based methods on IMDb test data: 0.88685
Train time: 0.536574125289917 seconds
test time: 0.004080295562744141 seconds


#### **Accuracy of semi-supervised learning using lexicon on test data: 0.88685**