In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# Example dataset
data = {
    'sentence': ["Food is mediocre, except for Mushroom Custard and Foie Gras in Fire Ice."],
    'aspect': ["food", "Mushroom Custard"],
    'sentiment': ["negative", "positive"]
}
df = pd.DataFrame(data)

# Preprocessing function
def preprocess_text(text):
    text = text.lower()  # Lowercase text
    text = re.sub(r'\W', ' ', text)  # Remove all non-word characters
    text = re.sub(r'\s+', ' ', text)  # Replace all runs of whitespaces with a single space
    lemmatizer = WordNetLemmatizer()
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in set(stopwords.words('english'))]
    return ' '.join(words)

# Applying preprocessing
df['processed_sentence'] = df['sentence'].apply(preprocess_text)

# Feature extraction
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df['processed_sentence'])

# Label encoding for sentiments
le = LabelEncoder()
y = le.fit_transform(df['sentiment'])

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Training the model
model = LogisticRegression()
model.fit(X_train, y_train)

# Predicting sentiment
y_pred = model.predict(X_test)

# Evaluating the model
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')

# Example prediction
new_sentence = preprocess_text("The Mushroom Custard was a delightful surprise.")
new_features = tfidf.transform([new_sentence])
print(f'Predicted sentiment: {le.inverse_transform(model.predict(new_features))[0]}')

ValueError: All arrays must be of the same length