In [45]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder

# Fetch the NLTK resources the preprocessing step relies on: the stop-word
# list, WordNet (used by the lemmatizer) and the 'punkt' tokenizer.
for nltk_resource in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# Read the review dataset into a DataFrame
data = pd.read_csv('/test.csv')


# Names of the columns holding the review text and the sentiment label;
# edit these if the CSV uses different headers.
review_column, sentiment_column = 'text', 'label'

# Preprocessing helpers, built once rather than per call/per token.
# stopwords.words('english') returns a list; calling it inside the token
# filter rebuilt that list and scanned it linearly for every single token.
# A frozenset gives the same membership results with constant-time lookups.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
_LEMMATIZER = WordNetLemmatizer()


def preprocess_text(text):
    """Normalize a raw review into a string of lemmatized, non-stop-word tokens.

    Steps: replace every non-word character with a space, collapse runs of
    whitespace, lowercase, tokenize with ``nltk.word_tokenize``, drop English
    stop words, lemmatize the remaining tokens and join them with single
    spaces.

    Args:
        text: The raw review. Missing values (``None``/NaN, as pandas yields
            for empty CSV cells) are treated as empty text; any other
            non-string value is converted with ``str()``.

    Returns:
        The cleaned review as a single space-separated string; ``''`` when
        nothing survives the filtering.
    """
    if not isinstance(text, str):
        # re.sub() raises TypeError on non-strings such as float NaN.
        text = '' if pd.isna(text) else str(text)

    text = re.sub(r'\W', ' ', text)  # Remove special characters
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = text.lower()  # Convert to lowercase
    tokens = nltk.word_tokenize(text)  # Tokenization
    kept = [
        _LEMMATIZER.lemmatize(token)
        for token in tokens
        if token not in _ENGLISH_STOPWORDS
    ]
    return ' '.join(kept)

# Apply preprocessing to every review in the configured text column
data['cleaned_review'] = data[review_column].apply(preprocess_text)

# Vectorize the cleaned text using TF-IDF (vocabulary capped at 1000 terms)
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(data['cleaned_review']).toarray()

# Encode the sentiment labels as integers. The configured column name is used
# (not a hard-coded 'label') so changing sentiment_column above takes effect.
# NOTE: y is not part of the clustering features below; it is only kept
# available for later inspection/comparison against the clusters.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data[sentiment_column])

# Clustering features: the TF-IDF vectors only. Column names are converted
# to strings so the frame has uniformly typed feature names.
features = pd.DataFrame(X)
features.columns = [str(i) for i in features.columns]

# Apply K-means clustering
kmeans = KMeans(n_clusters=3, random_state=42)  # Assuming 3 clusters
kmeans.fit(features)

# Assign cluster labels to each review
data['cluster'] = kmeans.labels_

# Save the clustered data
data.to_csv('clustered_imdb_reviews.csv', index=False)

# Optional: Print the resulting clusters (using the configured column names)
print(data[[review_column, sentiment_column, 'cluster']].head())


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                                text     label  cluster
0  lovingly photographed in the manner of a golde...  positive        1
1              consistently clever and suspenseful .  positive        1
2  it's like a " big chill " reunion of the baade...  positive        1
3  the story gives ample opportunity for large-sc...  positive        1
4                  red dragon " never cuts corners .  positive        1
