In [3]:
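# Install required libraries (run once in a fresh environment).
# scikit-learn, nltk and joblib are imported by this notebook, so they are listed too.
# !pip install datasets transformers torch flask scikit-learn nltk joblib
# preprocess_text also needs NLTK data for word_tokenize and stopwords, e.g.:
#   import nltk; nltk.download('punkt'); nltk.download('stopwords')
#   (newer NLTK releases ask for 'punkt_tab' instead of 'punkt')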

# Import necessary libraries
import re
import numpy as np
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Fetch the IMDb reviews dataset
dataset = load_dataset("imdb")

# Pull the review texts and their labels out of the train and test splits
train_split, test_split = dataset['train'], dataset['test']
X_train, y_train = train_split['text'], train_split['label']
X_test, y_test = test_split['text'], test_split['label']

# Preprocess text function
# The stopword set and the stemmer are identical for every review, so they are
# built once on first use and shared by all later calls instead of being
# recreated per review.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return the shared (stop_words, stemmer) pair, creating it on first use."""
    if not _PREPROCESS_CACHE:
        # Build both objects before storing either, so a failure (e.g. missing
        # NLTK corpus) leaves the cache empty and the next call retries.
        stop_words = frozenset(stopwords.words('english'))
        stemmer = PorterStemmer()
        _PREPROCESS_CACHE['stop_words'] = stop_words
        _PREPROCESS_CACHE['stemmer'] = stemmer
    return _PREPROCESS_CACHE['stop_words'], _PREPROCESS_CACHE['stemmer']


def preprocess_text(text):
    """Clean one review and return it as a single space-joined string of stems.

    Steps, in order:
      1. replace anything that looks like an HTML tag (``<...>``) with a space;
      2. replace every non-word character with a space;
      3. collapse runs of whitespace into one space;
      4. tokenize with ``word_tokenize``;
      5. drop tokens whose lowercase form is an English stopword;
      6. stem the remaining tokens with the Porter stemmer.

    Args:
        text: Raw review text (a ``str``).

    Returns:
        The stemmed, stopword-free tokens joined by single spaces.
    """
    stop_words, ps = _preprocess_resources()
    text = re.sub(r'<.*?>', ' ', text)  # Remove HTML tags
    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    tokens = word_tokenize(text)  # Tokenization
    # Stopword removal is case-insensitive: compare the lowercase form.
    tokens = [word for word in tokens if word.lower() not in stop_words]
    tokens = [ps.stem(word) for word in tokens]  # Stemming
    return ' '.join(tokens)

# Clean every review in both splits (the names are rebound to the cleaned text)
X_train = list(map(preprocess_text, X_train))
X_test = list(map(preprocess_text, X_test))

# Turn the cleaned reviews into TF-IDF feature matrices.
# The vectorizer is fitted on the training reviews only; the test reviews are
# transformed with that same fitted vectorizer.
TFIDF_MAX_FEATURES = 5000
vectorizer = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [7]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Train Neural Network model
# random_state is fixed so that weight initialisation and the adam solver's
# batch sampling are repeatable; without it every run yields slightly
# different metrics and the reported numbers cannot be reproduced.
model_nn = MLPClassifier(
    hidden_layer_sizes=(100, 50),
    max_iter=300,
    activation='relu',
    solver='adam',
    random_state=42,
)
model_nn.fit(X_train_tfidf, y_train)

# Score the trained network on the held-out test features
y_pred_nn = model_nn.predict(X_test_tfidf)

# Compute the four metrics in the same order they are reported below
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred_nn)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

# Emit the report, one metric per line
print(
    "Neural Network Performance:",
    f"Accuracy: {accuracy}",
    f"Precision: {precision}",
    f"Recall: {recall}",
    f"F1-Score: {f1}",
    sep="\n",
)


Neural Network Performance:
Accuracy: 0.84044
Precision: 0.8520142278104061
Recall: 0.824
F1-Score: 0.8377729879214283


In [8]:
import joblib

# Save the neural network model and the fitted TF-IDF vectorizer.
# The model only accepts features produced by this exact vectorizer, so both
# files are needed to score new raw text later.
# NOTE: these are pickle-based files; only load them from a trusted source.
joblib.dump(vectorizer, 'vectorizer.pkl')
# The model is dumped last so the cell's displayed value stays ['model_nn.pkl'].
joblib.dump(model_nn, 'model_nn.pkl')


['model_nn.pkl']