# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin
from wordcloud import WordCloud
import string
import nltk
from nltk.corpus import stopwords
# --- Data loading and first look ---------------------------------------------
# Make sure the NLTK stop-word corpus is available, then keep the English
# list as a set; `stop_words` is reused by the word-cloud code further down.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# The CSV is decoded as latin-1.
df = pd.read_csv("sentiment.csv", encoding="latin-1")

# Each heading and its table are emitted by one print call; sep="\n" yields
# the same output as printing the heading and the table separately.
print("Shape of the dataset:", df.shape)
print("\nDescription of the dataset:", df.describe(), sep="\n")
print("\nHead of the dataset:", df.head(), sep="\n")
print("\nTail of the dataset:", df.tail(), sep="\n")

# Rows missing either the tweet text or its label cannot be used for
# training, so they are dropped and the index is renumbered from zero.
required_columns = [
    'tweet_text',
    'is_there_an_emotion_directed_at_a_brand_or_product',
]
df = df.dropna(subset=required_columns).reset_index(drop=True)
print("\nCleaned dataset:", df.head(), sep="\n")

# Report whichever columns still contain nulls after the cleanup.
missing_counts = df.isnull().sum()
print("Columns with null values:", missing_counts[missing_counts > 0], sep="\n")

# Features (raw tweet text) and target (sentiment label).
X, y = df['tweet_text'], df['is_there_an_emotion_directed_at_a_brand_or_product']
class TextPreprocessor(TransformerMixin):
    """Stateless text-cleaning step usable inside a scikit-learn ``Pipeline``.

    Every document is lower-cased, stripped of the ASCII punctuation listed
    in ``string.punctuation`` and filtered of NLTK's English stop words.
    """

    # Translation table that deletes every ``string.punctuation`` character
    # in a single pass over the text.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    # English stop words, loaded lazily on first use and shared by all
    # instances so the NLTK corpus is not re-read for every single document.
    _stop_words = None

    @classmethod
    def _get_stop_words(cls):
        """Return the cached English stop-word set, loading it on first use."""
        if TextPreprocessor._stop_words is None:
            TextPreprocessor._stop_words = frozenset(stopwords.words('english'))
        return TextPreprocessor._stop_words

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self``; no state is learned from ``X``."""
        return self

    def transform(self, X, **transform_params):
        """Clean every document in ``X``.

        Args:
            X: Iterable of strings.

        Returns:
            A list holding one cleaned string per input document, in order.
        """
        return [self._clean_text(text) for text in X]

    def _clean_text(self, text):
        """Lower-case ``text``, drop punctuation, then drop stop words.

        Punctuation is removed before the stop-word filter runs, so a token
        such as "it's" is compared against the stop-word list as "its".
        """
        stop_words = self._get_stop_words()
        text = text.lower().translate(self._PUNCTUATION_TABLE)
        return ' '.join(word for word in text.split() if word not in stop_words)
def build_model(model):
    """Wrap ``model`` in the text-classification pipeline.

    The steps run in this order: ``TextPreprocessor`` cleaning, TF-IDF
    features over unigrams and bigrams, then ``model`` as the classifier.

    Args:
        model: Estimator placed in the final ``'classifier'`` step.

    Returns:
        A newly constructed ``Pipeline``; nothing is fitted here.
    """
    steps = [
        ('preprocessor', TextPreprocessor()),
        ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
        ('classifier', model),
    ]
    return Pipeline(steps=steps)
# --- Model comparison --------------------------------------------------------
# Candidate classifiers, each evaluated inside the same preprocessing/TF-IDF
# pipeline. The random forest is seeded (with the same seed the train/test
# split below uses) so the comparison gives the same numbers on every run.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42),
}
for name, model in models.items():
    pipeline = build_model(model)
    # 5-fold cross-validated accuracy on the full dataset; report the mean.
    scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
    print(f"{name} Accuracy: {scores.mean()}")
# --- Final model: train, evaluate, plot --------------------------------------
# Hold out 20% of the data for evaluation; the split is seeded so it is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
final_model = build_model(LogisticRegression(max_iter=1000))
final_model.fit(X_train, y_train)
y_pred = final_model.predict(X_test)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# confusion_matrix arranges its rows/columns in sorted label order, whereas
# Series.unique() returns labels in order of first appearance. Build the
# sorted label list once and use it for BOTH the matrix and the tick labels
# so the axes can never disagree with the cells they annotate.
class_labels = sorted(set(y_test) | set(y_pred))
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()
# --- Word clouds per sentiment -----------------------------------------------
# Maps the short name shown in each plot title to the label value stored in
# the dataset's sentiment column.
sentiment_mapping = {
    'No emotion': 'No emotion toward brand or product',
    'Positive emotion': 'Positive emotion',
    'Negative emotion': 'Negative emotion'
}
for sentiment_name, sentiment_label in sentiment_mapping.items():
    # Select the tweets carrying this label and merge them into one string.
    has_label = df['is_there_an_emotion_directed_at_a_brand_or_product'] == sentiment_label
    subset = df.loc[has_label]
    text = " ".join(subset['tweet_text'])

    # Build the cloud from the merged text, ignoring English stop words.
    wordcloud = WordCloud(stopwords=stop_words, background_color='white')
    wordcloud.generate(text)

    plt.figure(figsize=(10, 7))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f'Word Cloud for {sentiment_name}')
    plt.axis('off')
    plt.show()
def predict_sentiment(tweet):
    """Predict the sentiment label for a single raw tweet.

    ``final_model`` is a pipeline whose first step already cleans the text,
    so the raw tweet is handed to it directly. Cleaning it here as well
    would run the preprocessing twice and only gives the same answer as long
    as the cleaning stays idempotent.

    Args:
        tweet: Raw tweet text.

    Returns:
        The label predicted by ``final_model`` for ``tweet``.
    """
    # Debugging aid only: show what the cleaning step produces for this tweet.
    cleaned_tweet = TextPreprocessor().transform([tweet])
    print("Cleaned Tweet:", cleaned_tweet)  # Debugging step
    return final_model.predict([tweet])[0]
def get_user_input_and_predict():
    """Prompt for a tweet on stdin and print its predicted sentiment."""
    user_tweet = input("Enter a tweet: ")
    print("Predicted Sentiment:", predict_sentiment(user_tweet))
# Start the interactive prompt only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    get_user_input_and_predict()
