<a href="https://colab.research.google.com/github/tranrobin/SentimentAnalysis/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install underthesea

import pandas as pd
import numpy as np
import torch
import re
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from joblib import dump
from transformers import AutoModel, AutoTokenizer
import underthesea



In [None]:
# Define a function to load the PhoBERT model and tokenizer
def load_phobert():
    """Fetch the pretrained PhoBERT base encoder together with its tokenizer.

    Both objects are loaded from the ``vinai/phobert-base`` checkpoint; the
    tokenizer is requested with ``use_fast=False``.

    Returns:
        tuple: ``(phobert, tokenizer)`` - the model first, the tokenizer second.
    """
    checkpoint = "vinai/phobert-base"
    tokenizer = AutoTokenizer.from_pretrained(checkpoint, use_fast=False)
    phobert = AutoModel.from_pretrained(checkpoint)
    return phobert, tokenizer

In [None]:
# Define a function to preprocess text
def preprocess_text(text, stopwords):
    """Normalise one raw comment into a space-joined string of kept tokens.

    Steps: strip trailing ``.``/``,``/``?`` runs, replace the remaining
    punctuation with spaces, trim and lower-case, tokenize with
    ``underthesea.word_tokenize`` and drop every token found in ``stopwords``.

    Args:
        text: The raw comment. ``None`` and NaN (how pandas represents a
            missing cell) yield an empty string; any other non-string value
            is converted with ``str()``.
        stopwords: Collection of tokens to remove. Any iterable works; a
            ``set``/``frozenset`` is used as-is, anything else is converted
            to a set once per call for O(1) membership tests.

    Returns:
        str: The remaining tokens joined by single spaces.
    """
    # Missing values arrive as None or float NaN (NaN is the only value that
    # is not equal to itself); re.sub would raise TypeError on them.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if not isinstance(text, str):
        text = str(text)

    # The original pattern was "[.,?]+$-": a literal "-" required *after* the
    # end-of-string anchor, so it could never match. The intended pattern
    # strips a trailing run of sentence punctuation.
    text = re.sub(r"[.,?]+$", "", text)
    text = re.sub(r"[,.;:“”\"'!?\-]", " ", text)
    text = text.strip().lower()

    if isinstance(stopwords, (set, frozenset)):
        stopword_set = stopwords
    else:
        stopword_set = set(stopwords)

    # NOTE(review): multi-syllable tokens are re-joined with plain spaces, so
    # the word boundaries found by the tokenizer are not visible downstream.
    # PhoBERT is documented as expecting word-segmented input - confirm
    # whether tokens should be underscore-joined before encoding.
    tokens = underthesea.word_tokenize(text)
    return " ".join(w for w in tokens if w not in stopword_set)

In [None]:
# Load stopwords
def load_stopwords(stopword_file):
    """Read a stop-word list from a text file, one entry per line.

    Surrounding whitespace is stripped from every line and blank lines are
    skipped, so the result never contains the empty string.

    Args:
        stopword_file: Path of the UTF-8 encoded stop-word file.

    Returns:
        list[str]: The stop words in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # "utf-8-sig" decodes plain UTF-8 identically but also swallows a leading
    # byte-order mark, which would otherwise be glued onto the first entry.
    with open(stopword_file, encoding='utf-8-sig') as f:
        stripped_lines = (line.strip() for line in f)
        return [word for word in stripped_lines if word]

In [None]:
# Load and preprocess data
def load_and_preprocess_data(data_file, stopwords):
    """Load the labelled comments CSV and return cleaned text plus labels.

    The CSV must provide a ``comment`` column (raw text) and a ``classify``
    column (numeric label). Rows whose ``comment`` is missing are dropped;
    rows whose ``classify`` is missing get the label ``2.0``.

    Args:
        data_file: Path (or anything ``pandas.read_csv`` accepts) of the CSV.
        stopwords: Iterable of tokens to remove during preprocessing.

    Returns:
        tuple: ``(text, label)`` - two pandas Series sharing the same index;
        ``text`` holds the preprocessed comments, ``label`` the float labels.

    Raises:
        KeyError: If the ``comment`` or ``classify`` column is absent.
    """
    df = pd.read_csv(data_file)

    # An empty CSV cell is read as NaN (a float); feeding that to the regex
    # based preprocessing raises TypeError, and a row without text carries no
    # signal anyway, so such rows are removed before preprocessing.
    df = df.dropna(subset=['comment'])

    # Build the lookup set once instead of scanning a list for every token.
    stopword_set = set(stopwords)

    text = df['comment'].astype(str).apply(lambda x: preprocess_text(x, stopword_set))
    # NOTE(review): 2.0 is the fallback class for unlabelled rows - confirm
    # which sentiment it stands for in this dataset.
    label = df['classify'].fillna(2.0).astype(float)
    return text, label

In [None]:
# Extract BERT features
def extract_bert_features(text, max_len, phobert, tokenizer, batch_size=32):
    """Encode texts with PhoBERT and return the first-token hidden state of each.

    Every text is tokenized, padded/truncated to ``max_len`` tokens and run
    through ``phobert`` without gradient tracking. The feature vector of a
    text is the last-layer hidden state at sequence position 0.

    Args:
        text: Iterable of already preprocessed strings (list, Series, ...).
        max_len: Fixed sequence length used for padding and truncation.
        phobert: The encoder model; it is switched to eval mode by this call.
        tokenizer: The matching tokenizer; it is called on a list of strings
            and must return ``input_ids`` and ``attention_mask`` tensors.
        batch_size: Number of texts sent through the model per forward pass.
            Keeps memory bounded instead of encoding the whole corpus at once.

    Returns:
        numpy.ndarray: Array of shape ``(len(text), hidden_size)``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    texts = list(text)
    if not texts:
        # Nothing to encode: return an empty matrix with the right width
        # (0 columns if the model does not expose its hidden size).
        hidden_size = getattr(getattr(phobert, "config", None), "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    # Inference only: make sure dropout is disabled.
    phobert.eval()
    # Hugging Face models expose the device they live on; inputs must match.
    device = getattr(phobert, "device", None)

    feature_chunks = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            # Let the tokenizer build the attention mask. The previous
            # hand-rolled mask used `token_id != 0`, but PhoBERT pads with
            # id 1 and id 0 is the leading <s> token - so padding was
            # attended to while the very position read out below was masked.
            encoded = tokenizer(
                batch,
                padding='max_length',
                max_length=max_len,
                truncation=True,
                return_tensors='pt',
            )
            input_ids = encoded['input_ids']
            attention_mask = encoded['attention_mask']
            if device is not None:
                input_ids = input_ids.to(device)
                attention_mask = attention_mask.to(device)

            outputs = phobert(input_ids=input_ids, attention_mask=attention_mask)
            # outputs[0] is the last hidden state; keep position 0 of each row.
            feature_chunks.append(outputs[0][:, 0, :].cpu().numpy())

    return np.concatenate(feature_chunks, axis=0)

In [None]:
# Main function
def main(stopword_file="stopword.txt", data_file="comment.csv",
         max_sequence_length=100, model_file='saved_model.pkl',
         test_size=0.1, random_state=45):
    """Run the whole pipeline: load data, embed with PhoBERT, train, save.

    Called without arguments it behaves exactly like the original hard-coded
    script; every setting can now be overridden by keyword.

    Args:
        stopword_file: Path of the stop-word list (one entry per line).
        data_file: Path of the CSV with ``comment`` and ``classify`` columns.
        max_sequence_length: Token length each comment is padded/truncated to.
        model_file: Destination path of the serialized classifier.
        test_size: Fraction of the data held out for the accuracy estimate.
        random_state: Seed for the train/test split, for reproducibility.

    Returns:
        None. Progress and the held-out accuracy are printed; the fitted
        classifier is written to ``model_file``.
    """
    print("Loading stopwords...")
    sw = load_stopwords(stopword_file)
    print("Stopwords loaded.")

    print("Loading PhoBERT model...")
    phobert, tokenizer = load_phobert()
    print("PhoBERT model loaded.")

    print("Loading and preprocessing data...")
    text, label = load_and_preprocess_data(data_file, sw)
    print("Data loaded and preprocessed.")

    print("Extracting BERT features...")
    features = extract_bert_features(text, max_sequence_length, phobert, tokenizer)
    print("BERT features extracted.")

    # Hold out part of the data so the reported accuracy is measured on
    # examples the classifier has not seen.
    X_train, X_test, y_train, y_test = train_test_split(
        features, label, test_size=test_size, random_state=random_state
    )

    print("Training Gaussian Naive Bayes model...")
    clf = GaussianNB()
    clf.fit(X_train, y_train)

    accuracy = clf.score(X_test, y_test)
    print(f'Model training completed. Accuracy: {accuracy * 100:.2f}%')

    # Persist the trained classifier (joblib format) for later inference.
    dump(clf, model_file)
    print(f"Model saved as '{model_file}'.")

In [None]:
# Entry point: run the full training pipeline only when this code is executed
# directly (script or notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()