In [4]:
import pandas as pd
import re

# Topic gates stay plain substring checks so that variants such as "vaccines"
# or "antivax" still open the corresponding branch.
_VACCINE_TOPIC_TERMS = ('vaccine', 'anti vax', 'vax', 'vaccination')
_MIKE_TOPIC_TERMS = ('dr mike', 'doctor mike', 'mike')


def _cue_pattern(terms):
    """Compile a regex that finds any of *terms* at the start of a word.

    Anchoring on the leading word boundary keeps inflections ("works",
    "voted", "feeling") while rejecting hits buried inside unrelated words
    ("format" in "misinformation", "lie" in "believe", "safe" in "unsafe").
    """
    alternatives = '|'.join(re.escape(term) for term in terms)
    return re.compile(r'\b(?:' + alternatives + ')')


_PRO_VACCINE_CUES = _cue_pattern(
    ['work', 'awesome', 'eradicated', 'safe', 'prevent'])
_ANTI_VACCINE_CUES = _cue_pattern(
    ['dangerous', 'not safe', 'unsafe', 'cause illness', 'death', 'autism'])
_MIKE_POSITIVE_CUES = _cue_pattern(
    ['patient', 'respectful', 'kind', 'calm', 'eloquent'])
_MIKE_NEGATIVE_CUES = _cue_pattern(
    ['shill', 'joke', 'propaganda', 'indoctrinated', 'lie'])
_FORMAT_CUES = _cue_pattern(['format', 'vote', 'flag', 'debate', 'system'])
_MISINFORMATION_CUES = _cue_pattern([
    'hiv doesnt cause aids', 'autism', 'conspiracy', '600000 deaths',
    'big pharma funded', 'vaccines cause illness', 'flat earth', 'no placebo'])
_EMOTION_CUES = _cue_pattern([
    'feel', 'sad', 'angry', 'pissed', 'frustrated', 'sorry', 'crazy', 'insane',
    'painful', 'my friend', 'my mum', 'my daughter', 'i lost'])


def label_comment(row, column_name='cleaned_comment'):
    """Label one comment along five keyword-based aspects.

    The comment text is lower-cased and scanned for fixed keyword lists.
    Five keys are written into ``row`` (empty string when nothing applies):

    * ``vaccine_view``: 'pro-vaccine', 'anti-vaccine' or 'skeptic'; only set
      when the comment mentions vaccines at all.
    * ``about_doctor_mike``: 'positive', 'negative' or 'neutral'; only set
      when the comment mentions Mike.
    * ``format_criticism``, ``misinformation``,
      ``emotion_personal_experience``: 'yes' when a cue is present.

    Args:
        row: Mapping-like record (e.g. a pandas Series or a dict) holding the
            comment text. It is modified in place.
        column_name: Key of ``row`` that holds the comment text.

    Returns:
        The same ``row`` object with the five label keys added.

    Raises:
        KeyError: If ``column_name`` is not present in ``row``.
    """
    comment = str(row[column_name]).lower()
    row['vaccine_view'] = ''
    row['about_doctor_mike'] = ''
    row['format_criticism'] = ''
    row['misinformation'] = ''
    row['emotion_personal_experience'] = ''

    # Vaccine view
    if any(term in comment for term in _VACCINE_TOPIC_TERMS):
        # Mask the negated phrase so "not safe" cannot satisfy the pro cue
        # "safe" before the anti-vaccine cues are even looked at.
        affirmative_text = comment.replace('not safe', ' ')
        if _PRO_VACCINE_CUES.search(affirmative_text):
            row['vaccine_view'] = 'pro-vaccine'
        elif _ANTI_VACCINE_CUES.search(comment):
            row['vaccine_view'] = 'anti-vaccine'
        else:
            row['vaccine_view'] = 'skeptic'

    # About Doctor Mike
    if any(term in comment for term in _MIKE_TOPIC_TERMS):
        if _MIKE_POSITIVE_CUES.search(comment):
            row['about_doctor_mike'] = 'positive'
        elif _MIKE_NEGATIVE_CUES.search(comment):
            row['about_doctor_mike'] = 'negative'
        else:
            row['about_doctor_mike'] = 'neutral'

    # Format criticism
    if _FORMAT_CUES.search(comment):
        row['format_criticism'] = 'yes'

    # Misinformation
    if _MISINFORMATION_CUES.search(comment):
        row['misinformation'] = 'yes'

    # Emotion / personal experience
    if _EMOTION_CUES.search(comment):
        row['emotion_personal_experience'] = 'yes'

    return row


def classify_aspects(df, column_name='cleaned_comment'):
    """Apply :func:`label_comment` to every row of ``df``.

    Args:
        df: DataFrame with one comment per row.
        column_name: Column that holds the comment text; forwarded to
            ``label_comment`` so a column other than 'cleaned_comment' can
            be labelled.

    Returns:
        The DataFrame produced by the row-wise ``apply``: the input columns
        plus the five label columns.
    """
    return df.apply(label_comment, axis=1, column_name=column_name)

# Load the cleaned comments.
df = pd.read_csv('../data/raw/cleaned_comments.csv')

# Attach the aspect labels to every comment.
df = classify_aspects(df)

# Persist the labelled comments.
df.to_csv('../data/processed/labeled_comments.csv', index=False)

# Each aspect column paired with the heading printed above its distribution.
aspect_reports = [
    ('vaccine_view', "\nPhân bố Vaccine View:"),
    ('about_doctor_mike', "\nPhân bố About Doctor Mike:"),
    ('format_criticism', "\nPhân bố Format Criticism:"),
    ('misinformation', "\nPhân bố Misinformation:"),
    ('emotion_personal_experience', "\nPhân bố Emotion/Personal Experience:"),
]

# Preview the first ten labelled rows.
preview_columns = ['cleaned_comment'] + [column for column, _ in aspect_reports]
print(df[preview_columns].head(10))

# Label distribution for each aspect.
for column, heading in aspect_reports:
    print(heading)
    print(df[column].value_counts())

                                     cleaned_comment vaccine_view  \
0  a lot of this seems to come down to who contro...                
1                    the voting out system is flawed                
2  compare doctor mike and sam sewer to jordan pe...                
3  after that self appointed rhead scientist the ...                
4  facts can change over time the most absurd sta...                
5  if the doctor cant even respond what kind of d...                
6                     devon caused me a brain tumour                
7  this proves how dangerous the internet is when...                
8                         a bunch of conspiracy nuts                
9  once someone gets voted out the surrounded per...                

  about_doctor_mike format_criticism misinformation  \
0                                                     
1                                yes                  
2           neutral                                   
3                   