In [1]:
# 1. Imports and Data Loading
import nltk
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# Download necessary resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Load dataset
# You can get IMDB dataset from Kaggle or use sklearn's sample:
from sklearn.datasets import load_files
reviews = load_files('aclImdb/train/', categories=['pos', 'neg'], encoding='utf-8')
X, y = reviews.data, reviews.target

# 2. Text Preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    # Lowercase
    text = text.lower()
    # Remove special characters
    text = re.sub(r'\W', ' ', text)
    # Tokenize
    tokens = nltk.word_tokenize(text)
    # Remove stopwords and lemmatize
    cleaned = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and word.isalpha()]
    return ' '.join(cleaned)

X_cleaned = [preprocess(text) for text in X]

# 3. Feature Engineering
vectorizer = TfidfVectorizer(max_features=5000)
X_features = vectorizer.fit_transform(X_cleaned)

# 4. Model Training
X_train, X_test, y_train, y_test = train_test_split(X_features, y, test_size=0.2, random_state=42)
model = LogisticRegression()
model.fit(X_train, y_train)

# 5. Model Evaluation
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=['Negative', 'Positive']))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mg\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Mg\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Mg\AppData\Roaming\nltk_data...


FileNotFoundError: [WinError 3] The system cannot find the path specified: 'aclImdb/train/'