In [1]:
pip install numpy nltk scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Looking in links: /usr/share/pip-wheels
Note: you may need to restart the kernel to use updated packages.


In [3]:
# Import nltk in this cell as well: it runs before the main imports cell,
# so without this line a fresh kernel fails here with NameError.
import nltk

# Fetch the NLTK resource 'omw-1.4' (Open Multilingual WordNet);
# presumably required by the WordNet lemmatizer used further down.
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/c91183a3-2950-452f-877d-df9c01a8c533/nltk_data..
[nltk_data]     .


True

In [None]:
import numpy as np
import re
import nltk
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

# Fetch the NLTK resources used below: the review corpus, the stopword lists, and WordNet
for resource in ('movie_reviews', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Read every document of the NLTK movie_reviews corpus:
# X collects the raw review text, y the first category assigned to each file
X = []
y = []
for fileid in movie_reviews.fileids():
    X.append(movie_reviews.raw(fileid))
    y.append(movie_reviews.categories(fileid)[0])

# Text preprocessing function
_PREPROCESS_DEFAULTS = {}  # lazily filled cache for the default stopword set and lemmatizer


def _default_stop_words():
    """Return the English stopwords as a frozenset, loading them only on first use."""
    if 'stop_words' not in _PREPROCESS_DEFAULTS:
        # stopwords.words() hands back a list; asking for it once and storing a
        # set avoids reloading it and scanning it linearly for every token.
        _PREPROCESS_DEFAULTS['stop_words'] = frozenset(stopwords.words('english'))
    return _PREPROCESS_DEFAULTS['stop_words']


def _default_lemmatizer():
    """Return one shared WordNetLemmatizer instead of building a new one per document."""
    if 'lemmatizer' not in _PREPROCESS_DEFAULTS:
        _PREPROCESS_DEFAULTS['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_DEFAULTS['lemmatizer']


def preprocess_text(text, stop_words=None, lemmatizer=None):
    """Normalise one document into a space-separated string of lemmatized tokens.

    Steps, in order: replace every non-word character with a space, drop
    isolated single ASCII letters, lowercase, split on whitespace, discard
    stopwords, and lemmatize what remains.

    Args:
        text: The raw document as a string.
        stop_words: Optional container of lowercase words to discard (anything
            supporting ``in``). Defaults to NLTK's English stopword list.
        lemmatizer: Optional object with a ``lemmatize(word)`` method. Defaults
            to a shared ``WordNetLemmatizer``.

    Returns:
        The cleaned tokens joined by single spaces ('' if nothing survives).
    """
    if stop_words is None:
        stop_words = _default_stop_words()
    if lemmatizer is None:
        lemmatizer = _default_lemmatizer()

    # Punctuation and other non-word characters become spaces.
    text = re.sub(r'\W', ' ', text)
    # Remove single letters surrounded by whitespace. Matches cannot overlap,
    # so in a run of consecutive single letters every other one survives.
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    # Remove a single letter at the very start of the text.
    text = re.sub(r'^[a-zA-Z]\s+', ' ', text)

    # Lowercase before the stopword test so capitalised stopwords are caught.
    words = text.lower().split()
    return ' '.join(
        lemmatizer.lemmatize(word) for word in words if word not in stop_words
    )

# Settings a reader may want to tune, named once so the split and the model share one seed
SEED = 42
TEST_SIZE = 0.2
MAX_FEATURES = 1000

# Run every review through preprocess_text
X = list(map(preprocess_text, X))

# Encode the labels as integers: 'pos' -> 1, anything else -> 0
y = [int(label == 'pos') for label in y]

# Hold out 20% of the reviews for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=SEED
)

# Bag of Words: learn a vocabulary (capped at MAX_FEATURES terms) from the training
# reviews only, then count those same terms in the test reviews
count_vectorizer = CountVectorizer(max_features=MAX_FEATURES)
X_train_counts = count_vectorizer.fit_transform(X_train)
X_test_counts = count_vectorizer.transform(X_test)

# TF-IDF: fit the weighting on the training counts and apply it to both splits
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

# Fit a logistic regression classifier on the TF-IDF training features
clf = LogisticRegression(random_state=SEED).fit(X_train_tfidf, y_train)

# Predict labels for the held-out reviews
y_pred = clf.predict(X_test_tfidf)

# Report the confusion matrix, the per-class metrics, and the overall accuracy
print('Confusion Matrix:', confusion_matrix(y_test, y_pred), sep='\n')
print('\nClassification Report:', classification_report(y_test, y_pred), sep='\n')
print('Accuracy Score:', accuracy_score(y_test, y_pred))


[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/c91183a3-2950-452f-877d-df9c01a8c533/nltk_data..
[nltk_data]     .
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/c91183a3-2950-452f-877d-df9c01a8c533/nltk_data..
[nltk_data]     .
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/c91183a3-2950-452f-877d-df9c01a8c533/nltk_data..
[nltk_data]     .
[nltk_data]   Package wordnet is already up-to-date!
