In [None]:
pip install nltk spacy scikit-learn
python -m spacy download en_core_web_sm


In [None]:
import pandas as pd
import numpy as np
import nltk
import spacy
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Load dataset (CSV file expected to contain a 'tweet_text' column)
data = pd.read_csv('tweets.csv')

# Fail fast with a clear message: every later cell reads data['tweet_text'],
# so a missing column would otherwise surface as a bare KeyError further down.
if 'tweet_text' not in data.columns:
    raise KeyError("tweets.csv must contain a 'tweet_text' column")


In [None]:
# Fetch the NLTK resources this notebook uses: the stop-word list, WordNet
# (for the lemmatizer) and the VADER lexicon.
for resource in ('stopwords', 'wordnet', 'vader_lexicon'):
    nltk.download(resource)

# Shared text-processing objects used by preprocess().
# The spaCy pipeline is loaded without the parser and NER components.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess(text):
    """Normalize one tweet into a space-separated string of lemmas.

    The text is lower-cased and tokenized with the spaCy pipeline `nlp`;
    tokens that are stop words or not purely alphabetic are dropped, and
    the remaining tokens are lemmatized with the WordNet `lemmatizer`.

    Parameters
    ----------
    text : str
        Raw tweet text. Non-string values (e.g. missing values read from
        the CSV) are treated as empty text.

    Returns
    -------
    str
        The kept lemmas joined by single spaces; '' if nothing is kept.
    """
    # Missing cells are not strings and have no .lower(); map them to an
    # empty document instead of crashing the whole .apply().
    if not isinstance(text, str):
        return ''

    doc = nlp(text.lower())
    tokens = [
        lemmatizer.lemmatize(token.text)
        for token in doc
        # `is_alpha` is a boolean attribute on spaCy tokens, not a method;
        # calling it raises "TypeError: 'bool' object is not callable".
        if token.is_alpha and token.text not in stop_words
    ]
    return ' '.join(tokens)


data['processed_text'] = data['tweet_text'].apply(preprocess)



In [None]:
# Score each raw tweet with VADER and keep the 'compound' value.
sia = SentimentIntensityAnalyzer()
data['sentiment'] = data['tweet_text'].apply(lambda x: sia.polarity_scores(x)['compound'])


def _label_sentiment(score, threshold=0.05):
    """Map a VADER compound score to 'positive', 'negative' or 'neutral'.

    Uses the thresholds recommended by the VADER authors: scores at or above
    `threshold` are positive, scores at or below `-threshold` are negative,
    and everything in between is neutral. Previously every score that was
    not strictly greater than zero (including an exactly neutral 0.0) was
    labelled 'negative'.
    """
    if score >= threshold:
        return 'positive'
    if score <= -threshold:
        return 'negative'
    return 'neutral'


data['sentiment_label'] = data['sentiment'].apply(_label_sentiment)

 

In [None]:
# Number of k-means clusters to fit.
n_clusters = 5

# Build the TF-IDF matrix from the preprocessed tweets.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['processed_text'])

# Fit k-means with a fixed random_state and get one cluster id per tweet.
kmeans = KMeans(random_state=42, n_clusters=n_clusters)
labels = kmeans.fit_predict(X)

# Score the clustering, then store the assignments next to the tweets.
silhouette_avg = silhouette_score(X, labels)
data['cluster'] = labels
print("Silhouette score:", silhouette_avg)

