In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

class SentimentAnalyzer:
    """Sentiment classifier for short texts, built on a bidirectional LSTM.

    Bundles the three pieces needed end to end: a Keras ``Tokenizer`` that
    maps words to integer ids, a scikit-learn ``LabelEncoder`` that maps
    sentiment labels to class indices, and the Keras model itself.

    Typical use: ``train(df)`` on a DataFrame with ``text`` and ``sentiment``
    columns, then ``predict(texts)``.
    """

    def __init__(self, max_length=100, max_words=10000):
        """Create an untrained analyzer.

        Args:
            max_length: Length every token sequence is padded/truncated to.
            max_words: Vocabulary cap given to the tokenizer; also used as
                the embedding layer's input dimension.
        """
        self.max_length = max_length
        self.max_words = max_words
        self.tokenizer = Tokenizer(num_words=self.max_words)
        self.label_encoder = LabelEncoder()
        # Populated by train(); predict() refuses to run while this is None.
        self.model = None

    def _vectorize(self, texts):
        """Convert an iterable of strings into a post-padded id matrix."""
        sequences = self.tokenizer.texts_to_sequences(texts)
        return pad_sequences(sequences, maxlen=self.max_length, padding='post')

    def prepare_data(self, df):
        """Encode labels, fit the tokenizer and build a train/test split.

        Rows with a missing ``text`` or ``sentiment`` value are dropped.
        The split is 80/20 with a fixed ``random_state`` of 42.

        Args:
            df: DataFrame with ``text`` and ``sentiment`` columns.

        Returns:
            ``[X_train, X_test, y_train, y_test]`` where the ``X`` entries
            are padded id matrices and the ``y`` entries are encoded labels.
        """
        df = df.dropna(subset=['text', 'sentiment'])  # Drop rows with missing values
        texts = df['text'].astype(str)
        y = self.label_encoder.fit_transform(df['sentiment'])  # Encode labels

        # Split the raw texts BEFORE fitting the tokenizer: fitting on the
        # full corpus would leak held-out vocabulary into the training
        # features and make the validation score optimistic.
        texts_train, texts_test, y_train, y_test = train_test_split(
            texts, y, test_size=0.2, random_state=42
        )
        self.tokenizer.fit_on_texts(texts_train)

        return [
            self._vectorize(texts_train),
            self._vectorize(texts_test),
            y_train,
            y_test,
        ]

    def create_model(self):
        """Build and compile the embedding -> BiLSTM -> dense classifier.

        The size of the softmax output layer is the number of classes the
        label encoder has seen, so labels must be fitted first.

        Returns:
            A compiled ``tf.keras.Sequential`` model.

        Raises:
            RuntimeError: If the label encoder has not been fitted yet.
        """
        if not hasattr(self.label_encoder, 'classes_'):
            raise RuntimeError(
                "Label encoder is not fitted; call prepare_data() or train() first."
            )
        num_classes = len(self.label_encoder.classes_)

        model = tf.keras.Sequential([
            # An explicit Input layer replaces Embedding's `input_length`
            # argument, which newer Keras releases deprecate.
            tf.keras.Input(shape=(self.max_length,)),
            tf.keras.layers.Embedding(self.max_words, 128),
            tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dropout(0.5),
            tf.keras.layers.Dense(num_classes, activation='softmax')
        ])
        # Sparse loss because the labels are integer class indices, not one-hot.
        model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
        return model

    def train(self, df, epochs=1, batch_size=32):
        """Prepare the data, build a fresh model and fit it.

        Args:
            df: DataFrame with ``text`` and ``sentiment`` columns.
            epochs: Number of training epochs.
            batch_size: Mini-batch size.

        Returns:
            The history object returned by ``model.fit``.
        """
        X_train, X_test, y_train, y_test = self.prepare_data(df)
        self.model = self.create_model()
        history = self.model.fit(X_train, y_train, validation_data=(X_test, y_test),
                                 epochs=epochs, batch_size=batch_size, verbose=1)
        return history

    def predict(self, texts):
        """Predict a sentiment label and confidence for each text.

        Args:
            texts: A single string or an iterable of strings.

        Returns:
            A list of ``(label, confidence)`` tuples, one per input text, in
            input order. ``confidence`` is the highest softmax probability.
            An empty input yields an empty list.

        Raises:
            RuntimeError: If called before the model has been trained.
        """
        if self.model is None:
            raise RuntimeError("Model has not been trained; call train() before predict().")
        if isinstance(texts, str):
            texts = [texts]
        # Materialise so generators work and emptiness can be tested safely
        # (truth-testing a Series or ndarray directly would raise).
        texts = list(texts)
        if not texts:
            return []

        seq = self._vectorize(texts)
        preds = self.model.predict(seq)
        labels = self.label_encoder.inverse_transform(preds.argmax(axis=1))
        conf = preds.max(axis=1)
        return list(zip(labels, conf))

# Read the labelled posts; the .zip archive is handed to read_csv directly.
df = pd.read_csv(r'C:\Users\felin\Downloads\Sentiment analysis_Social media post.zip')
analyzer = SentimentAnalyzer(max_length=100, max_words=10000)

# Fit the classifier for a single epoch.
analyzer.train(df, epochs=1)

# Classify a few sample sentences and report label plus confidence.
texts = [
    "This game is amazing!",
    "The service was terrible",
    "It's okay, nothing special",
]
predictions = analyzer.predict(texts)
for text, prediction in zip(texts, predictions):
    sentiment, confidence = prediction
    print(f"Text: {text}")
    print(f"Sentiment: {sentiment} (Confidence: {confidence:.2f})\n")




13/13 ━━━━━━━━━━━━━━━━━━━━ 9s 214ms/step - accuracy: 0.3486 - loss: 1.0972 - val_accuracy: 0.3000 - val_loss: 1.1128
1/1 ━━━━━━━━━━━━━━━━━━━━ 1s 515ms/step
Text: This game is amazing!
Sentiment: neutral (Confidence: 0.40)

Text: The service was terrible
Sentiment: neutral (Confidence: 0.40)

Text: It's okay, nothing special
Sentiment: neutral (Confidence: 0.40)

