In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
import re
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus and keep the English list as a set, which
# preprocess_text uses for membership checks.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

def preprocess_text(text):
    """Normalize a raw text value into a space-separated string of tokens.

    The text is lower-cased, every character that is not an ASCII letter or
    whitespace is removed, and tokens found in the module-level ``stop_words``
    set are dropped.

    Args:
        text: The raw text. Non-string values (for example the float NaN
            pandas uses for a missing cell, or None) are treated as empty.

    Returns:
        The remaining tokens joined by single spaces; ``""`` when nothing is
        left or when ``text`` is not a string.
    """
    if not isinstance(text, str):
        # Missing cells arrive as NaN/None; calling .lower() on them would
        # raise AttributeError and abort the whole DataFrame.apply.
        return ""
    # NOTE(review): apostrophes are stripped before the stop-word check, so a
    # contraction such as "don't" is compared as "dont" - confirm intended.
    cleaned = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    return ' '.join(word for word in cleaned.split() if word not in stop_words)

# Load dataset
# Keep only the text and label columns and drop rows where either is missing.
# .copy() yields an independent frame so the column assignments below do not
# trigger pandas' SettingWithCopyWarning.
df = pd.read_csv("sentimentdataset.csv")
df = df[["Text", "Sentiment"]].dropna().copy()
df["Sentiment"] = df["Sentiment"].str.strip()  # merge labels that differ only by padding
df["ProcessedText"] = df["Text"].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df["SentimentEncoded"] = label_encoder.fit_transform(df["Sentiment"])

# Remove rare classes (classes with only 1 sample); the stratified split below
# rejects classes that have a single member.
class_counts = df["SentimentEncoded"].value_counts()
rare_classes = class_counts[class_counts < 2].index
df = df[~df["SentimentEncoded"].isin(rare_classes)]

# Split dataset
X = df["ProcessedText"]
y = df["SentimentEncoded"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Convert text to numerical features
# The vectorizer is fitted on the training split only and then applied to the
# test split, so the test vocabulary does not influence the features.
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=10000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train model
mlp_model = MLPClassifier(
    hidden_layer_sizes=(256, 128, 64),  # Three hidden layers
    activation='relu',
    solver='adam',
    alpha=0.001,  # L2 regularization strength
    learning_rate='adaptive',  # NOTE: only used by solver='sgd'; no effect with 'adam'
    max_iter=1000,
    early_stopping=True,  # Hold out part of the training data; stop when its score stops improving
    random_state=42
)
mlp_model.fit(X_train_tfidf, y_train)

# Predict
y_pred = mlp_model.predict(X_test_tfidf)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
# Show the original sentiment names instead of integer codes. Only labels that
# occur in y_test or y_pred are listed, so the rows and averages are the ones
# the report would show by default.
report_labels = sorted(set(y_test) | set(y_pred))
print("Classification Report:\n", classification_report(
    y_test,
    y_pred,
    labels=report_labels,
    target_names=label_encoder.inverse_transform(report_labels),
    zero_division=0,  # classes that are never predicted get 0 instead of a warning
))

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the tokenizer data for word_tokenize from
# 'punkt_tab' rather than 'punkt'; fetch both so either version works.
nltk.download('punkt_tab')
nltk.download('wordnet')

# Initialize NLP tools
stop_words = set(stopwords.words('english'))  # set for fast membership checks
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def preprocess_text(text):
    """Normalize a raw text value into a space-separated string of tokens.

    The text is lower-cased, stripped of every character that is not an ASCII
    letter or whitespace, and tokenized with ``word_tokenize``. Tokens that
    are in the module-level ``stop_words`` set or have two characters or
    fewer are dropped; each remaining token is passed through
    ``stemmer.stem`` and then ``lemmatizer.lemmatize``.

    Args:
        text: The raw text. Non-string values (for example the float NaN
            pandas uses for a missing cell, or None) are treated as empty.

    Returns:
        The processed tokens joined by single spaces; ``""`` when nothing is
        left or when ``text`` is not a string.
    """
    if not isinstance(text, str):
        # Missing cells arrive as NaN/None; calling .lower() on them would
        # raise AttributeError and abort the whole DataFrame.apply.
        return ""
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    words = [
        lemmatizer.lemmatize(stemmer.stem(word))
        for word in word_tokenize(text)
        if word not in stop_words and len(word) > 2
    ]
    return ' '.join(words)

# Load dataset
# Keep only the text and label columns and drop rows where either is missing.
# .copy() yields an independent frame so the column assignments below do not
# trigger pandas' SettingWithCopyWarning.
df = pd.read_csv("sentimentdataset.csv")
df = df[["Text", "Sentiment"]].dropna().copy()
df["Sentiment"] = df["Sentiment"].str.strip()  # merge labels that differ only by padding
df["ProcessedText"] = df["Text"].apply(preprocess_text)

# Encode labels
label_encoder = LabelEncoder()
df["SentimentEncoded"] = label_encoder.fit_transform(df["Sentiment"])

# Remove rare classes (classes with fewer than 5 samples)
class_counts = df["SentimentEncoded"].value_counts()
rare_classes = class_counts[class_counts < 5].index
df = df[~df["SentimentEncoded"].isin(rare_classes)]

# Ensure dataset is balanced: down-sample every class to the size of the
# smallest remaining one. GroupBy.sample keeps all columns (including the
# grouping column), unlike groupby().apply(), whose handling of the grouping
# column is deprecated in recent pandas releases.
min_class_count = df["SentimentEncoded"].value_counts().min()
df_balanced = (
    df.groupby("SentimentEncoded")
    .sample(n=min_class_count, random_state=42)
    .reset_index(drop=True)
)

# Use a fixed proportion for test_size to avoid errors
test_size = 0.2  # 20% of data for testing

# Split dataset
X = df_balanced["ProcessedText"]
y = df_balanced["SentimentEncoded"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42, stratify=y)

# Convert text to numerical features
# The vectorizer is fitted on the training split only and then applied to the
# test split, so the test vocabulary does not influence the features.
vectorizer = TfidfVectorizer(ngram_range=(1, 4), max_features=20000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train model
mlp_model = MLPClassifier(
    hidden_layer_sizes=(1024, 512, 256, 128),  # Four hidden layers
    activation='relu',
    solver='adam',
    alpha=0.0003,  # L2 regularization strength
    learning_rate='adaptive',  # NOTE: only used by solver='sgd'; no effect with 'adam'
    batch_size=128,  # Minibatch size
    max_iter=3000,  # Upper bound on training epochs
    early_stopping=True,  # Hold out part of the training data; stop when its score stops improving
    n_iter_no_change=15,  # Epochs without improvement tolerated before stopping
    shuffle=True,  # Reshuffle samples each epoch
    tol=1e-4,  # Minimum improvement that counts as progress
    random_state=42,
    verbose=True
)
mlp_model.fit(X_train_tfidf, y_train)

# Predict
y_pred = mlp_model.predict(X_test_tfidf)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
# Show the original sentiment names instead of integer codes. Only labels that
# occur in y_test or y_pred are listed, so the rows and averages are the ones
# the report would show by default.
report_labels = sorted(set(y_test) | set(y_pred))
print("Classification Report:\n", classification_report(
    y_test,
    y_pred,
    labels=report_labels,
    target_names=label_encoder.inverse_transform(report_labels),
    zero_division=0,
))