# YouTube Comment Sentiment Analysis

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from concurrent.futures import ThreadPoolExecutor, as_completed
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from tqdm import tqdm
import warnings
import scipy
from scipy.stats import t
from sklearn.utils import resample
from datetime import datetime
import sys
import os

# Silence every warning for the rest of the notebook. This also hides
# deprecation warnings, so comment it out when debugging.
warnings.filterwarnings(action="ignore")

# Upper bound on the number of comments sampled per artist on each side of the
# cancellation date (read by balance_dataset).
MAX_COMMENTS = 10000

# NOTE(review): absolute, machine-specific paths -- the notebook will not run
# on another machine until these point at the local checkout.
sys.path.insert(0, "/Users/tonymeissner/source/CancelCultureImpact/src/")
sys.path.insert(0, "/Users/tonymeissner/source/CancelCultureImpact/src/analysis/")
# Star imports from the project modules. Presumably `config` supplies
# RAW_DATA_PATH, PROCESSED_DATA_PATH and CELEBRITIES, and `analyzer_functions`
# supplies detect_and_translate, remove_emojis, remove_stopwords and get_lemma,
# since those names are used below without being defined here -- TODO confirm.
from config import *
from analyzer_functions import *

# Register tqdm's pandas integration (progress_apply and friends).
tqdm.pandas()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/tonymeissner/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/tonymeissner/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/tonymeissner/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tonymeissner/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/tonymeissner/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# sentiment analysis

In [2]:
def sort_comment_by_date(df):
    """Parse ``updateDt`` as UTC timestamps and sort the comments newest-first.

    The frame is modified in place (column conversion and sort) and is also
    returned, so the call can be used on the right-hand side of an assignment.

    Parameters
    ----------
    df : pandas.DataFrame
        Comment data with an ``updateDt`` column holding date strings or
        datetimes (timezone-naive or timezone-aware).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``updateDt`` converted to timezone-aware
        UTC datetimes, sorted by ``updateDt`` descending, with a fresh
        0..n-1 index.
    """
    # utc=True localizes naive values to UTC and converts already-aware values.
    # The previous ``.dt.tz_localize('UTC')`` raised on tz-aware data, which
    # broke re-running the cell and loading CSVs saved with a "+00:00" offset.
    df["updateDt"] = pd.to_datetime(df["updateDt"], utc=True)
    df.sort_values(by="updateDt", ascending=False, inplace=True, ignore_index=True)
    return df

In [3]:
# Load the raw YouTube comment export of every artist (one CSV per artist).
kanye_comment, manson_comment, kelly_comment, seungri_comment = (
    pd.read_csv(os.path.join(RAW_DATA_PATH, raw_filename))
    for raw_filename in (
        "kanye_west_youtube_comments.csv",
        "marilyn_manson_youtube_comments.csv",
        "r_kelly_youtube_comments.csv",
        "seungri_youtube_comments.csv",
    )
)

In [None]:
# Normalize the timestamps and order each artist's comments newest-first.
kanye_comment, manson_comment, kelly_comment, seungri_comment = map(
    sort_comment_by_date,
    (kanye_comment, manson_comment, kelly_comment, seungri_comment),
)

In [5]:
# Inspect column dtypes and non-null counts of the Kanye West comments.
kanye_comment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 184424 entries, 0 to 184423
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype         
---  ------    --------------   -----         
 0   text      184386 non-null  object        
 1   updateDt  184424 non-null  datetime64[ns]
 2   video_id  184424 non-null  object        
dtypes: datetime64[ns](1), object(2)
memory usage: 4.2+ MB


In [4]:
# Randomly sample the same number of comments before and after the
# cancellation date (capped at MAX_COMMENTS per side by default).
def balance_dataset(df, artist_name, max_comments=None, random_state=42):
    """Balance an artist's comments around their cancellation date.

    The comments are split at the artist's ``cancellation_date`` from
    ``CELEBRITIES`` (interpreted as UTC), and both halves are downsampled
    without replacement to the same size.

    Parameters
    ----------
    df : pandas.DataFrame
        Comments with an ``updateDt`` column. The column is compared against a
        UTC-aware timestamp, so it is expected to be timezone-aware (as
        produced by ``sort_comment_by_date``).
    artist_name : str
        Value of the ``name`` column in ``CELEBRITIES`` to look up.
    max_comments : int, optional
        Maximum number of comments kept on each side of the split. Defaults to
        the notebook-level ``MAX_COMMENTS``.
    random_state : int, optional
        Seed passed to ``resample`` for both halves (default 42).

    Returns
    -------
    pandas.DataFrame
        The "before" sample followed by the "after" sample. Rows keep their
        original index labels (the index is not reset).

    Raises
    ------
    IndexError
        If ``artist_name`` is not present in ``CELEBRITIES``.
    """
    if max_comments is None:
        max_comments = MAX_COMMENTS

    # Specify the date to split the data
    matches = CELEBRITIES[CELEBRITIES["name"] == artist_name]["cancellation_date"]
    if len(matches) == 0:
        # Same exception type as the bare ``.iloc[0]`` lookup, but with a
        # message that names the missing artist.
        raise IndexError(f"No cancellation date found for artist {artist_name!r}")
    split_date = pd.to_datetime(matches.iloc[0], utc=True)

    print(f"{artist_name}'s canceled date: {split_date}")

    # Split the data into before and after the specified date
    df_before = df[df["updateDt"] < split_date]
    df_after = df[df["updateDt"] >= split_date]

    print(
        f"{artist_name}'s Before canceled date data: {len(df_before)}, After canceled date data: {len(df_after)}"
    )

    # Both sides get the size of the smaller side, capped at max_comments
    n_samples = min(len(df_before), len(df_after), max_comments)

    # Downsample both sides to n_samples rows
    df_before_downsampled = resample(
        df_before, replace=False, n_samples=n_samples, random_state=random_state
    )
    df_after_downsampled = resample(
        df_after, replace=False, n_samples=n_samples, random_state=random_state
    )

    print(
        f"{artist_name}'s Before canceled date data: {len(df_before_downsampled)}, After canceled date data: {len(df_after_downsampled)}\n"
    )

    # Combine the downsampled data
    df_balanced = pd.concat([df_before_downsampled, df_after_downsampled])

    return df_balanced

In [5]:
# Balance every artist's comments around their cancellation date.
kanye_comment, manson_comment, kelly_comment, seungri_comment = (
    balance_dataset(artist_comments, artist)
    for artist_comments, artist in (
        (kanye_comment, "kanye_west"),
        (manson_comment, "marilyn_manson"),
        (kelly_comment, "r_kelly"),
        (seungri_comment, "seungri"),
    )
)

kanye_west's canceled date: 2022-10-25 00:00:00+00:00
kanye_west's Before canceled date data: 60703, After canceled date data: 123721
kanye_west's Before canceled date data: 10000, After canceled date data: 10000

marilyn_manson's canceled date: 2021-02-21 00:00:00+00:00
marilyn_manson's Before canceled date data: 12583, After canceled date data: 42841
marilyn_manson's Before canceled date data: 10000, After canceled date data: 10000

r_kelly's canceled date: 2021-09-27 00:00:00+00:00
r_kelly's Before canceled date data: 27978, After canceled date data: 79486
r_kelly's Before canceled date data: 10000, After canceled date data: 10000

seungri's canceled date: 2019-01-31 00:00:00+00:00
seungri's Before canceled date data: 3181, After canceled date data: 33883
seungri's Before canceled date data: 3181, After canceled date data: 3181



## celebrities

## preprocessing

### translate

In [6]:
def detect_languages_parallel(titles):
    """
    Detect the language of (and translate) multiple texts in parallel.

    Every text is passed to ``detect_and_translate`` on a pool of 10 worker
    threads. The results are returned in the same order as ``titles`` so they
    can be assigned straight back to the column the texts came from.

    Parameters
    ----------
    titles : iterable
        Texts to process.

    Returns
    -------
    list
        One entry per input text, in input order: the return value of
        ``detect_and_translate`` for that text, or the string ``"error"`` if
        the call raised.
    """
    titles = list(titles)
    detected_languages = [None] * len(titles)
    with ThreadPoolExecutor(max_workers=10) as executor:
        future_to_position = {
            executor.submit(detect_and_translate, title): position
            for position, title in enumerate(titles)
        }
        # as_completed yields futures in completion order, not submission
        # order, so each result is written to the slot of the text it belongs
        # to instead of being appended.
        for future in tqdm(as_completed(future_to_position), total=len(future_to_position)):
            position = future_to_position[future]
            try:
                detected_languages[position] = future.result()
            except Exception as e:
                print(f"Error processing future: {e}")
                detected_languages[position] = "error"
    return detected_languages

In [7]:
# Apply detect_language function in parallel and save results
kanye_comment["response"] = detect_languages_parallel(kanye_comment["text"].tolist())
manson_comment["response"] = detect_languages_parallel(manson_comment["text"].tolist())
kelly_comment["response"] = detect_languages_parallel(kelly_comment["text"].tolist())
seungri_comment["response"] = detect_languages_parallel(seungri_comment["text"].tolist())

# Extract source language and translated text from response.
# detect_languages_parallel stores the plain string "error" for requests that
# failed; indexing into that string would raise a TypeError, so such rows get
# None for both derived columns instead.
for df in [kanye_comment, manson_comment, kelly_comment, seungri_comment]:
    df["source"] = df["response"].apply(
        lambda x: None if isinstance(x, str) else x["detectedLanguage"]["language"]
    )
    df["translated"] = df["response"].apply(
        lambda x: None if isinstance(x, str) else x["translatedText"]
    )

    # Drop response column
    df.drop("response", axis=1, inplace=True)

 31%|███▏      | 6295/20000 [10:25<23:41,  9.64it/s]  

## Preprocess function

In [21]:
def _apply_in_parallel(func, texts, error_label, max_workers=10):
    """Apply ``func`` to every text on a thread pool; results keep input order.

    A text whose call raises is reported on stdout and yields ``None``.
    """
    texts = list(texts)
    results = [None] * len(texts)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {executor.submit(func, text): pos for pos, text in enumerate(texts)}
        for future in tqdm(as_completed(futures), total=len(futures)):
            pos = futures[future]
            try:
                results[pos] = future.result()
            except Exception as e:
                print(f"Error {error_label} for index {pos}: {e}")
                results[pos] = None
    return results


def preprocess_article_and_sentiment(df):
    """
    Clean the translated comment text in three parallelised steps.

    Adds (or overwrites) three columns on ``df`` and returns the same object:

    - ``remove_emoji``: ``remove_emojis`` applied to ``translated``
    - ``remove_stopword``: ``remove_stopwords`` applied to ``remove_emoji``
    - ``lemmatization``: ``get_lemma`` applied to ``remove_stopword``

    A row for which a step raises gets ``None`` in that step's column. No
    sentiment scoring happens here despite the function's name.

    Results are collected by position and assigned as whole columns, so they
    stay aligned with their rows whatever index ``df`` carries. Writing with
    ``df.at[position, ...]`` put values on the wrong rows (and appended new
    ones) for frames whose index is not 0..n-1, such as the output of
    ``balance_dataset``.
    """
    # Remove emojis in parallel
    df["remove_emoji"] = _apply_in_parallel(
        remove_emojis, df["translated"], "removing emojis"
    )

    # Remove stopwords in parallel
    df["remove_stopword"] = _apply_in_parallel(
        remove_stopwords, df["remove_emoji"], "removing stopwords"
    )

    # Lemmatization in parallel
    df["lemmatization"] = _apply_in_parallel(
        get_lemma, df["remove_stopword"], "lemmatizing text"
    )

    return df

In [None]:
# Run the cleaning pipeline on every artist's (translated) comments.
(
    processed_comment_kanye,
    processed_comment_manson,
    processed_comment_kelly,
    processed_comment_seungri,
) = map(
    preprocess_article_and_sentiment,
    (kanye_comment, manson_comment, kelly_comment, seungri_comment),
)

In [None]:
# Persist the preprocessed comments, one CSV per artist (row index omitted).
for _processed_frame, _processed_filename in (
    (processed_comment_kanye, "kanye_west_youtube_comments_processed.csv"),
    (processed_comment_manson, "marilyn_manson_youtube_comments_processed.csv"),
    (processed_comment_kelly, "r_kelly_youtube_comments_processed.csv"),
    (processed_comment_seungri, "seungri_youtube_comments_processed.csv"),
):
    _processed_frame.to_csv(
        os.path.join(PROCESSED_DATA_PATH, _processed_filename), index=False
    )

In [None]:
# Preview the first rows of the preprocessed Kanye West comments.
processed_comment_kanye.head()

Unnamed: 0,text,updateDt,video_id,source,translated,remove_emoji,remove_stopword,lemmatization
0,Omg,2021-09-17,Xlvk8K0Wbpo,en,Omg,Omg,Omg,Omg
1,Yeezy Boost < Barefoot with Jesus,2022-02-09,ulq14QrjBn0,en,Yeezy Boost < Barefoot with Jesus,Yeezy Boost < Barefoot with Jesus,Yeezy Boost < Barefoot Jesus,Yeezy Boost < Barefoot Jesus
2,aint nobody finna do shshshsh,2020-07-01,ik4USIChrkY,no,aint nobody finna do shshshsh,aint nobody finna do shshshsh,aint nobody finna shshshsh,be not nobody finna shshshsh
3,Here's the votes to make this the national ant...,2020-09-05,ik4USIChrkY,en,Here's the votes to make this the national ant...,Here's the votes to make this the national ant...,Here 's votes make national anthem elect presi...,here be vote make national anthem elect presid...
4,Falling in looovvveeeeeee 😌,2022-09-01,Qp2pTDpLjLc,en,Falling in looovvveeeeeee 😌,Falling in looovvveeeeeee,Falling looovvveeeeeee,fall looovvveeeeeee


## Test

In [None]:
# Reload the preprocessed comments from PROCESSED_DATA_PATH, the directory
# the "*_processed.csv" files are written to by the save cell above.
# (They were previously read from RAW_DATA_PATH, which a fresh run never
# populates with these files.)
processed_kanye_comment = pd.read_csv(os.path.join(PROCESSED_DATA_PATH, "kanye_west_youtube_comments_processed.csv"))
processed_manson_comment = pd.read_csv(os.path.join(PROCESSED_DATA_PATH, "marilyn_manson_youtube_comments_processed.csv"))
processed_kelly_comment = pd.read_csv(os.path.join(PROCESSED_DATA_PATH, "r_kelly_youtube_comments_processed.csv"))
processed_seungri_comment = pd.read_csv(os.path.join(PROCESSED_DATA_PATH, "seungri_youtube_comments_processed.csv"))

In [None]:
# Re-parse the timestamps of the reloaded frames and order them newest-first.
(
    processed_kanye_comment,
    processed_manson_comment,
    processed_kelly_comment,
    processed_seungri_comment,
) = map(
    sort_comment_by_date,
    (
        processed_kanye_comment,
        processed_manson_comment,
        processed_kelly_comment,
        processed_seungri_comment,
    ),
)

When the sample size is very large, the t-distribution approaches the standard normal distribution, so the two-sided critical value at a significance level of 0.05 is approximately 1.96.

In [None]:
def _cancellation_date_of(artist_name):
    """Return the artist's ``cancellation_date`` entry from ``CELEBRITIES``."""
    return CELEBRITIES[CELEBRITIES["name"] == artist_name]["cancellation_date"].iloc[0]


def _split_at_cancellation(comments, cancellation_date):
    """Return ``(before, after)`` frames split at the cancellation date.

    ``before`` holds rows with ``updateDt`` strictly earlier than the date,
    ``after`` holds the rows on or after it.
    """
    # Same UTC normalization as balance_dataset, so the scalar cutoff is
    # comparable with the tz-aware ``updateDt`` column that
    # sort_comment_by_date produces.
    cutoff = pd.to_datetime(cancellation_date, utc=True)
    return (
        comments[comments["updateDt"] < cutoff],
        comments[comments["updateDt"] >= cutoff],
    )


# The Kanye West "after" filter used to compare against a one-row Series from
# CELEBRITIES instead of the scalar date; all four artists now go through the
# same helper.
kanye_cancellation_date = _cancellation_date_of("kanye_west")
(
    kanye_comment_before_canceled,
    kanye_comment_after_canceled,
) = _split_at_cancellation(processed_kanye_comment, kanye_cancellation_date)
print(
    "kanye count before: {0}, count after: {1}".format(
        kanye_comment_before_canceled["text"].count(),
        kanye_comment_after_canceled["text"].count(),
    )
)

manson_cancellation_date = _cancellation_date_of("marilyn_manson")
(
    manson_comment_before_canceled,
    manson_comment_after_canceled,
) = _split_at_cancellation(processed_manson_comment, manson_cancellation_date)
print(
    "manson count before: {0}, count after: {1}".format(
        manson_comment_before_canceled["text"].count(),
        manson_comment_after_canceled["text"].count(),
    )
)

kelly_cancellation_date = _cancellation_date_of("r_kelly")
(
    kelly_comment_before_canceled,
    kelly_comment_after_canceled,
) = _split_at_cancellation(processed_kelly_comment, kelly_cancellation_date)
print(
    "kelly count before: {0}, count after: {1}".format(
        kelly_comment_before_canceled["text"].count(),
        kelly_comment_after_canceled["text"].count(),
    )
)

seungri_cancellation_date = _cancellation_date_of("seungri")
(
    seungri_comment_before_canceled,
    seungri_comment_after_canceled,
) = _split_at_cancellation(processed_seungri_comment, seungri_cancellation_date)
print(
    "seungri count before: {0}, count after: {1}".format(
        seungri_comment_before_canceled["text"].count(),
        seungri_comment_after_canceled["text"].count(),
    )
)

kanye count before: 9006, count after: 9142
manson count before: 8855, count after: 8924
kelly count before: 9181, count after: 9155
seungri count before: 2786, count after: 2719


#### kanye west

In [None]:
def sentiment_analysis_and_ttest(df_before, df_after, analyzer=None):
    """Compare comment sentiment before and after the cancellation date.

    Scores every ``lemmatization`` text with the analyzer's
    ``polarity_scores`` and keeps the ``compound`` value. Prints the mean
    compound score of both groups, then the statistic and p-value of
    ``scipy.stats.ttest_ind`` on the two score samples (scipy's defaults).

    Parameters
    ----------
    df_before, df_after : pandas.DataFrame
        Comment frames with a ``lemmatization`` column. Missing values (for
        example empty strings that became NaN after a CSV round trip) are
        skipped; the remaining values are scored as strings.
    analyzer : object, optional
        Anything with a ``polarity_scores(text)`` method returning a mapping
        with a ``"compound"`` key. Defaults to a new
        ``SentimentIntensityAnalyzer``.

    Returns
    -------
    None
        The results are only printed.
    """
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()

    def compound_scores(frame):
        # Build the score list in one pass; growing a DataFrame with
        # pd.concat per row is quadratic in the number of comments.
        texts = frame["lemmatization"].dropna().astype(str)
        return pd.Series(
            [analyzer.polarity_scores(text)["compound"] for text in texts],
            dtype=float,
        )

    before_scores = compound_scores(df_before)
    after_scores = compound_scores(df_after)

    print("before canceled mean compound score: {:.3f}".format(before_scores.mean()))
    print("after canceled mean compound score: {:.3f}".format(after_scores.mean()))
    # Named t_stat / p_value so the module-level ``t`` imported from
    # scipy.stats is not shadowed.
    t_stat, p_value = scipy.stats.ttest_ind(before_scores, after_scores)
    print("t score: {0:.3f}, p-value: {1:.3f}".format(t_stat, p_value))
    return

In [None]:
# Kanye West: sentiment before vs. after the cancellation date.
print("kanye west sentiment anaylsis and ttest")
sentiment_analysis_and_ttest(
    df_before=kanye_comment_before_canceled,
    df_after=kanye_comment_after_canceled,
)

kanye west sentiment anaylsis and ttest
before canceled mean compound score: 0.167
after canceled mean compound score: 0.112
t score: 9.292, p-value: 0.000


In [None]:
# Marilyn Manson: sentiment before vs. after the cancellation date.
print("marilyn manson sentiment anaylsis and ttest")
sentiment_analysis_and_ttest(
    df_before=manson_comment_before_canceled,
    df_after=manson_comment_after_canceled,
)

marilyn manson sentiment anaylsis and ttest
before canceled mean compound score: 0.245
after canceled mean compound score: 0.185
t score: 8.737, p-value: 0.000


In [None]:
# R. Kelly: sentiment before vs. after the cancellation date.
print("r.kelly sentiment anaylsis and ttest")
sentiment_analysis_and_ttest(
    df_before=kelly_comment_before_canceled,
    df_after=kelly_comment_after_canceled,
)

r.kelly sentiment anaylsis and ttest
before canceled mean compound score: 0.096
after canceled mean compound score: 0.132
t score: -5.561, p-value: 0.000


In [None]:
# Seungri: sentiment before vs. after the cancellation date.
print("seungri sentiment anaylsis and ttest")
sentiment_analysis_and_ttest(
    df_before=seungri_comment_before_canceled,
    df_after=seungri_comment_after_canceled,
)

seungri sentiment anaylsis and ttest
before canceled mean compound score: 0.301
after canceled mean compound score: 0.097
t score: 17.551, p-value: 0.000
