### Import Libraries

In [17]:
import nltk
import numpy as np
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer



###  NLTK Resources installation 

In [18]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')


[nltk_data] Downloading package stopwords to C:\Users\E N V
[nltk_data]     Y\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\E N V
[nltk_data]     Y\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\E N V
[nltk_data]     Y\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Dataset Loading

In [19]:
dataset_path = r"C:\Users\E N V Y\Desktop\aclImdb"

reviews = load_files(dataset_path + "/train", categories=['pos', 'neg'], shuffle=True)
X, y = reviews.data, reviews.target




###  Preprocessing the Text

In [20]:
import nltk
nltk.download('punkt_tab')  # This is the missing resource mentioned in the error


stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.decode('utf-8', errors='ignore')
    tokens = nltk.word_tokenize(text)
    tokens = [word.lower() for word in tokens if word.isalpha() and word.lower() not in stop_words]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return " ".join(tokens)

X_cleaned = [preprocess_text(review) for review in X]



[nltk_data] Downloading package punkt_tab to C:\Users\E N V
[nltk_data]     Y\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


### Convert cleaned text to TF-IDF features

In [21]:
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(X_cleaned)


### Spliting data into train and test sets

In [22]:
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=42
)


### Training of a Logistic Regression model

In [23]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)


### Predict and evaluate the model

In [24]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=['Negative', 'Positive']))


              precision    recall  f1-score   support

    Negative       0.88      0.86      0.87      2482
    Positive       0.86      0.89      0.88      2518

    accuracy                           0.87      5000
   macro avg       0.87      0.87      0.87      5000
weighted avg       0.87      0.87      0.87      5000

