In [None]:
# Mount Google Drive at /content/drive so files stored under MyDrive can be
# read by the cells below. google.colab is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!unzip /content/drive/MyDrive/archive.zip -d /content/drive/MyDrive

Archive:  /content/drive/MyDrive/archive.zip
  inflating: /content/drive/MyDrive/IMDB Dataset.csv  


In [1]:
import os
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import pickle

# --- 1. Load and Prepare the Dataset ---
print("--- Loading and Preparing Dataset ---")
DATASET_PATH = '/content/drive/MyDrive/IMDB Dataset.csv'
try:
    # dropna() returns a new frame (no in-place mutation), so re-running the
    # cell always starts again from the file contents.
    df = pd.read_csv(DATASET_PATH).dropna()
except FileNotFoundError as err:
    # Raise instead of calling exit(): inside a notebook exit() shuts the kernel
    # down, whereas an exception just stops this cell and keeps the traceback.
    # The message now names the path that was actually read.
    raise FileNotFoundError(
        f"Error: '{DATASET_PATH}' not found. Please download it first."
    ) from err

# Download necessary NLTK data
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)

# --- 2. Define the Preprocessing Function ---
# Negations carry sentiment, so they are kept even though NLTK lists them as stop words.
_NEGATION_WORDS = frozenset({
    'not', 'no', 'nor', "isn't", "aren't", "wasn't", "weren't", "don't", "doesn't", "didn't",
})

# Contractions whose stem is not a word once "n't" is cut off.
_IRREGULAR_NEGATIONS = (
    (r"\bcan't\b", "can not"),
    (r"\bwon't\b", "will not"),
    (r"\bshan't\b", "shall not"),
    (r"\bain't\b", "is not"),
)

# Filled on first use so the stop-word list is read and the lemmatizer is
# created once, not once per review.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, building it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english')) - _NEGATION_WORDS
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalise a raw review into a space-separated string of lemmatized tokens.

    Steps: replace ``<br>`` tags with spaces, expand negated contractions
    ("isn't" -> "is not"), replace every non-letter with a space, lower-case,
    drop English stop words (negations are kept) and lemmatize what remains.

    Args:
        text: The raw review. Any non-string value (e.g. NaN) is treated as empty.

    Returns:
        The cleaned review as a single string; ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""

    stop_words, lemmatizer = _get_text_resources()

    text = re.sub(r'<br\s*/?>', ' ', text)

    # Expand contractions BEFORE punctuation is stripped. Otherwise "isn't"
    # becomes the tokens "isn" and "t", both of which are stop words, and the
    # negation silently disappears despite being listed in _NEGATION_WORDS.
    text = text.replace("\u2019", "'")  # typographic apostrophe
    for pattern, replacement in _IRREGULAR_NEGATIONS:
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
    text = re.sub(r"n't\b", " not", text, flags=re.IGNORECASE)

    text = re.sub(r"[^a-zA-Z]", " ", text).lower()
    tokens = text.split()
    clean_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(clean_tokens)

print("Preprocessing text... (This may take a few minutes)")
# Clean every review element-wise and keep the result alongside the raw text.
df['clean_review'] = df['review'].map(preprocess_text)
print("Preprocessing complete.")

# --- 3. Train the Sentiment Model ---
print("\n--- Training Baseline Sentiment Model ---")
X, y = df['clean_review'], df['sentiment']

# Hold out 20% of the reviews, keeping the label proportions equal in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# The vocabulary and IDF weights are learned from the training split only;
# the test split is merely projected onto them.
vectorizer = TfidfVectorizer(max_features=15000, ngram_range=(1, 2))
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

sentiment_model = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)

print("\n--- Baseline Model Evaluation Report ---")
y_pred = sentiment_model.predict(X_test_vec)
print(classification_report(y_test, y_pred))

# --- 4. Save the model for later use ---
# exist_ok=True makes directory creation idempotent and removes the
# check-then-create gap of a separate os.path.exists() test.
os.makedirs('models', exist_ok=True)
# The classifier and its vectorizer are pickled side by side: the model can
# only score text transformed by this exact fitted vectorizer.
with open('models/sentiment_model.pkl', 'wb') as f:
    pickle.dump(sentiment_model, f)
with open('models/tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)
print("\n✅ Baseline model and vectorizer saved to 'models/' folder.")

--- Loading and Preparing Dataset ---
Preprocessing text... (This may take a few minutes)
Preprocessing complete.

--- Training Baseline Sentiment Model ---

--- Baseline Model Evaluation Report ---
              precision    recall  f1-score   support

    negative       0.91      0.89      0.90      5000
    positive       0.89      0.91      0.90      5000

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000


✅ Baseline model and vectorizer saved to 'models/' folder.


In [2]:
# Aspect name -> keywords whose presence in a review signals that aspect.
# Reviews are lemmatized before matching, so a plural keyword such as "effects"
# would never be found among the tokens; singular forms are therefore listed as
# well. The original plural spellings are kept for backward compatibility.
ASPECT_KEYWORDS = {
    "acting": ["actor", "actress", "acting", "performance", "cast", "character"],
    "plot": ["plot", "story", "script", "narrative", "storyline", "ending"],
    "visuals": ["visual", "visuals", "effect", "effects", "cgi", "cinematography", "scenery"],
    "directing": ["directing", "director", "filmmaker", "style"]
}

In [3]:
def _keyword_position(review_tokens, keyword):
    """Return the index of the keyword's first occurrence in ``review_tokens``, or None."""
    # The review tokens come out of preprocess_text, so the keyword is tried in
    # that same normalised form first and then exactly as written.
    for candidate in (preprocess_text(keyword), keyword):
        try:
            return review_tokens.index(candidate)
        except ValueError:
            continue
    return None


def get_aspect_sentiments(review, model, vectorizer, window=10, aspect_keywords=None):
    """
    Analyzes a review to find sentiment for predefined aspects.

    For each aspect, the first of its keywords that occurs in the preprocessed
    review is located, a window of tokens around that first occurrence is cut
    out, and the sentiment model is asked to classify just that snippet.

    Args:
        review: Raw review text.
        model: Fitted classifier exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``; must be the one
            the model was trained with.
        window: Number of tokens kept on each side of the keyword (default 10).
        aspect_keywords: Mapping of aspect name -> list of keywords. Defaults
            to the module-level ``ASPECT_KEYWORDS``.

    Returns:
        dict mapping each detected aspect to the label predicted for its
        context window. Aspects with no keyword in the review are omitted.
    """
    keywords_by_aspect = ASPECT_KEYWORDS if aspect_keywords is None else aspect_keywords
    review_tokens = preprocess_text(review).split()
    aspect_sentiments = {}

    for aspect, keywords in keywords_by_aspect.items():
        for keyword in keywords:
            keyword_index = _keyword_position(review_tokens, keyword)
            if keyword_index is None:
                continue

            # Create a "window" of text around the keyword (slicing clamps the upper end)
            start = max(0, keyword_index - window)
            context_window = " ".join(review_tokens[start:keyword_index + window + 1])

            try:
                # Use our pre-trained model to predict sentiment on this snippet
                vectorized_window = vectorizer.transform([context_window])
                prediction = model.predict(vectorized_window)[0]
            except Exception as e:
                # Best effort: report the failure and fall through to the next keyword.
                print(f"Error processing aspect '{aspect}': {e}")
                continue

            aspect_sentiments[aspect] = prediction
            break  # one verdict per aspect: the first keyword that can be scored wins

    return aspect_sentiments

# --- Example usage ---
# Runs the aspect extractor on one hand-written review, reusing the
# sentiment_model and vectorizer fitted earlier in the notebook.
# NOTE(review): this review is short compared with the context window that
# get_aspect_sentiments cuts around each keyword, so every aspect is probably
# scored on almost the whole sentence — which would explain a "positive"
# verdict for the plot despite the negative wording. TODO confirm.
example_review = "The acting by the main actress was incredible and she gave a great performance, but the plot was a bit slow and the storyline was not very engaging."

print(f"\n--- ABSA Results for Example Review ---\n'{example_review}'")
aspects = get_aspect_sentiments(example_review, sentiment_model, vectorizer)
print(aspects)


--- ABSA Results for Example Review ---
'The acting by the main actress was incredible and she gave a great performance, but the plot was a bit slow and the storyline was not very engaging.'
{'acting': 'positive', 'plot': 'positive'}


In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pickle
import os

# --- Download NLTK data if needed ---
# Stop-word list first, then WordNet (used by the lemmatizer); both quietly.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name, quiet=True)

# --- Define the Preprocessing Function (Crucial to have it here) ---
# Negations carry sentiment, so they are kept even though NLTK lists them as stop words.
_NEGATION_WORDS = frozenset({
    'not', 'no', 'nor', "isn't", "aren't", "wasn't", "weren't", "don't", "doesn't", "didn't",
})

# Contractions whose stem is not a word once "n't" is cut off.
_IRREGULAR_NEGATIONS = (
    (r"\bcan't\b", "can not"),
    (r"\bwon't\b", "will not"),
    (r"\bshan't\b", "shall not"),
    (r"\bain't\b", "is not"),
)

# Filled on first use so the stop-word list is read and the lemmatizer is
# created once, not once per review.
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, building it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english')) - _NEGATION_WORDS
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalise a raw review into a space-separated string of lemmatized tokens.

    Steps: replace ``<br>`` tags with spaces, expand negated contractions
    ("isn't" -> "is not"), replace every non-letter with a space, lower-case,
    drop English stop words (negations are kept) and lemmatize what remains.

    Args:
        text: The raw review. Any non-string value (e.g. NaN) is treated as empty.

    Returns:
        The cleaned review as a single string; ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""

    stop_words, lemmatizer = _get_text_resources()

    text = re.sub(r'<br\s*/?>', ' ', text)

    # Expand contractions BEFORE punctuation is stripped. Otherwise "isn't"
    # becomes the tokens "isn" and "t", both of which are stop words, and the
    # negation silently disappears despite being listed in _NEGATION_WORDS.
    text = text.replace("\u2019", "'")  # typographic apostrophe
    for pattern, replacement in _IRREGULAR_NEGATIONS:
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
    text = re.sub(r"n't\b", " not", text, flags=re.IGNORECASE)

    text = re.sub(r"[^a-zA-Z]", " ", text).lower()
    tokens = text.split()
    clean_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(clean_tokens)

# --- 1. Load your DataFrame and Preprocess it ---
DATASET_PATH = '/content/drive/MyDrive/IMDB Dataset.csv'
try:
    # We load the original CSV again to ensure we have the correct data
    df = pd.read_csv(DATASET_PATH)
except FileNotFoundError as err:
    # Only the expected failure is handled, and it is re-raised rather than
    # followed by exit(): inside a notebook exit() shuts the kernel down, and
    # the previous catch-all hid the real traceback behind a generic message.
    raise FileNotFoundError(
        f"Dataset not found at '{DATASET_PATH}'. Mount Drive and extract the archive first."
    ) from err
print("--- Dataset loaded ---")

# Run the preprocessing step to create the 'clean_review' column
print("Preprocessing text...")
df['clean_review'] = df['review'].apply(preprocess_text)
print("Preprocessing complete.")

# --- 2. Your Existing LDA Training Code ---
print("\n--- Starting Topic Modeling with LDA ---")

# Document-term counts restricted to a 1000-term vocabulary; terms present in
# more than 95% of the reviews or in fewer than 2 of them are discarded.
count_vectorizer = CountVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=1000,
    stop_words='english',
)
clean_reviews = df['clean_review'].dropna()
dtm = count_vectorizer.fit_transform(clean_reviews)

# Ten topics, seeded so repeated runs give the same model.
lda = LatentDirichletAllocation(n_components=10, random_state=42).fit(dtm)
print("LDA model training complete.")

def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in a fitted topic model.

    Args:
        model: Object exposing ``components_``, iterated one row per topic.
        feature_names: Sequence mapping a column index to its term.
        no_top_words: How many of the heaviest terms to print per topic.
    """
    for topic_number, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        heaviest_first = weights.argsort()[::-1][:no_top_words]
        top_terms = [feature_names[position] for position in heaviest_first]
        print(f"Topic {topic_number}:")
        print(" ".join(top_terms))

print("\n--- Top Words for Each Discovered Topic ---")
# Column index -> term lookup taken from the fitted CountVectorizer.
vocabulary = count_vectorizer.get_feature_names_out()
display_topics(lda, vocabulary, no_top_words=10)


# --- 3. Save the Files ---
print("\n--- Saving the LDA model and CountVectorizer ---")

# exist_ok=True makes directory creation idempotent and removes the
# check-then-create gap of a separate os.path.exists() test.
os.makedirs('models', exist_ok=True)

# The topic model and its vectorizer are pickled together: new text must be
# turned into counts by this exact fitted vectorizer before lda can score it.
with open('models/lda_model.pkl', 'wb') as f:
    pickle.dump(lda, f)

with open('models/count_vectorizer.pkl', 'wb') as f:
    pickle.dump(count_vectorizer, f)

print("\n✅ Success! 'lda_model.pkl' and 'count_vectorizer.pkl' are saved in the 'models' folder.")

--- Dataset loaded ---
Preprocessing text...
Preprocessing complete.

--- Starting Topic Modeling with LDA ---
LDA model training complete.

--- Top Words for Each Discovered Topic ---
Topic 0:
film character life story love people like make way real
Topic 1:
film role performance actor best play cast version great star
Topic 2:
kid like music song little time movie disney voice animal
Topic 3:
series episode tv like zombie time season original fi sci
Topic 4:
man movie woman father wife like life girl end scene
Topic 5:
movie great good like time funny really comedy watch think
Topic 6:
film scene horror time like director make shot plot character
Topic 7:
film year time child war family life world american old
Topic 8:
character good story plot action really movie game like little
Topic 9:
movie bad like good really make acting time thing people

--- Saving the LDA model and CountVectorizer ---

✅ Success! 'lda_model.pkl' and 'count_vectorizer.pkl' are saved in the 'models' folder.
