In [1]:
# Mount Google Drive at /content/drive so the CSV files read later in this
# notebook (under /content/drive/MyDrive/...) are reachable by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
# Download the 'punkt' tokenizer data; word_tokenize (used in the
# preprocessing function of this notebook) depends on it.
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [11]:
import nltk

# Corpora needed by the preprocessing step: the English stop-word list and
# the WordNet data backing WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [13]:
# Download the averaged-perceptron tagger model; nltk's pos_tag (used in the
# preprocessing function of this notebook) depends on it.
import nltk
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [14]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from gensim.models import Word2Vec

# Read the training CSV from the mounted Drive folder into a DataFrame.
# The rest of this cell reads its 'review' and 'sentiment' columns.
TRAIN_CSV_PATH = '/content/drive/MyDrive/IMDB/train.csv'
df = pd.read_csv(TRAIN_CSV_PATH)

# Preprocessing text data
# Lazily-built NLTK resources shared by every preprocess_text call, so the
# stop-word set and the lemmatizer are created once instead of once per row.
_STOP_WORDS = None
_LEMMATIZER = None


def _get_text_resources():
    """Return the shared (stop-word set, lemmatizer) pair, creating it on first use."""
    global _STOP_WORDS, _LEMMATIZER
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    if _LEMMATIZER is None:
        _LEMMATIZER = WordNetLemmatizer()
    return _STOP_WORDS, _LEMMATIZER


def preprocess_text(text):
    """Tokenize a review, drop English stop words, lemmatize, then POS-tag.

    Parameters
    ----------
    text : str
        Raw review text. ``None``, a float NaN (what pandas yields for an
        empty CSV cell) and blank strings are treated as an empty document.
        Any other non-string value is passed to ``word_tokenize`` unchanged
        and fails there, exactly as before.

    Returns
    -------
    list of tuple
        The ``(token, tag)`` pairs produced by ``pos_tag``. Note that the
        tagger runs on the stop-word-filtered, lemmatized tokens, not on the
        original sentence.
    """
    # A missing review would make word_tokenize raise and abort the whole
    # DataFrame.apply; map it (and blank text) to an empty token list instead.
    if text is None or (isinstance(text, float) and text != text):
        return []
    if isinstance(text, str) and not text.strip():
        return []

    stop_words, lemmatizer = _get_text_resources()

    # Tokenization
    tokens = word_tokenize(text)

    # Remove stopwords (case-insensitive match against the lowercase list)
    tokens = [word for word in tokens if word.lower() not in stop_words]

    # Lemmatization (default part of speech; no POS hint is passed)
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # POS tagging of the remaining tokens
    return pos_tag(tokens)

# Seed shared by the split and by the stochastic model so that the reported
# accuracies are reproducible from run to run.
RANDOM_STATE = 42

# Apply preprocessing to each row in the dataframe.
# After this line df['review'] holds lists of (token, tag) pairs instead of
# the raw text.
df['review'] = df['review'].apply(preprocess_text)

# Customized word embeddings
# NOTE(review): the model is trained on every row (before the train/test
# split) and its "words" are the (token, tag) tuples. It is not used by the
# classifiers in this cell.
sentences = df['review'].tolist()
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Train-test split
X = df['review']
y = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)


def _identity_analyzer(tokens):
    """Return an already-tokenized document unchanged.

    Used as the CountVectorizer analyzer because the documents are token
    lists, not strings. A named function (unlike a lambda) keeps the fitted
    vectorizer picklable.
    """
    return tokens


# Convert text data to vectors using Bag-of-Words
vectorizer = CountVectorizer(analyzer=_identity_analyzer)
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

# Train models
models = {
    "Naive Bayes": MultinomialNB(),
    # Seeded: an unseeded forest gives a different accuracy on every run.
    "Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "k-NN": KNeighborsClassifier()
}

# Fit each model on the training split and report accuracy on the held-out
# 20% split.
for name, model in models.items():
    model.fit(X_train_bow, y_train)
    y_pred = model.predict(X_test_bow)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {accuracy}")


Naive Bayes Accuracy: 0.8541666666666666
Random Forest Accuracy: 0.8405
k-NN Accuracy: 0.5856666666666667


In [15]:
# Score the already-fitted models on the separate test file.
# The reviews go through the same preprocessing and the same fitted
# vectorizer (transform only, no refit) as the training data.
test_data = pd.read_csv("/content/drive/MyDrive/IMDB/test.csv")
test_data['review'] = test_data['review'].apply(preprocess_text)
y_test_final = test_data['sentiment']
X_test_final = vectorizer.transform(test_data['review'])

for name in models:
    model = models[name]
    y_pred_test = model.predict(X_test_final)
    acc_test = accuracy_score(y_test_final, y_pred_test)
    print(f'{name} Test Accuracy: {acc_test}')

Naive Bayes Test Accuracy: 0.85485
Random Forest Test Accuracy: 0.8434
k-NN Test Accuracy: 0.59035
