# module and packages

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from concurrent.futures import ThreadPoolExecutor, as_completed
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from tqdm import tqdm
import warnings
import scipy
from datetime import datetime
from scipy.stats import t, ttest_rel
from sklearn.utils import resample
import os
import sys
import json

sys.path.insert(0, "/Users/tonymeissner/source/CancelCultureImpact/src/")
sys.path.insert(0, "/Users/tonymeissner/source/CancelCultureImpact/src/analysis/")
from config import *
from analyzer_functions import *

warnings.filterwarnings(action="ignore")

MAX_ARTICLES = 10000

senti_analyzer = SentimentIntensityAnalyzer()

tqdm.pandas()

# Data preparation

## reading

In [25]:
kanye_article = pd.read_csv(
    os.path.join(RAW_DATA_PATH, "kanye_west_articles_gnews.csv")
)
manson_article = pd.read_csv(
    os.path.join(RAW_DATA_PATH, "marilyn_manson_articles_gnews.csv")
)
kelly_article = pd.read_csv(os.path.join(RAW_DATA_PATH, "r_kelly_articles_gnews.csv"))
seungri_article = pd.read_csv(
    os.path.join(RAW_DATA_PATH, "seungri_articles_bigkinds.csv")
)

## Big Kinds

In [26]:
# Since Seungri's data is from Korean articles, it matches the format with other artist data.
# drop useless columns
seungri_article.drop(
    [
        "뉴스 식별자",
        "언론사",
        "기고자",
        "통합 분류1",
        "통합 분류2",
        "통합 분류3",
        "사건/사고 분류1",
        "사건/사고 분류2",
        "사건/사고 분류3",
        "인물",
        "위치",
        "기관",
        "키워드",
        "특성추출(가중치순 상위 50개)",
        "본문",
        "URL",
        "분석제외 여부",
    ],
    axis=1,
    inplace=True,
)
# rename columns
seungri_article.columns = ["updateDt", "title"]

# change updateDt data type to Datetime
seungri_article["updateDt"] = seungri_article["updateDt"].astype(str)
seungri_article["updateDt"] = seungri_article["updateDt"].apply(
    lambda x: x[:4] + "-" + x[4:6] + "-" + x[6:]
)
seungri_article["updateDt"] = pd.to_datetime(seungri_article["updateDt"])
seungri_article["updateDt"] = seungri_article["updateDt"].dt.tz_localize('UTC')
seungri_article.sort_values("updateDt", inplace=True)

## Gnews

In [27]:
# columns: title, content, published_on, link, source

# drop useless columns
kanye_article.drop(["content", "link", "source"], axis=1, inplace=True)
manson_article.drop(["content", "link", "source"], axis=1, inplace=True)
kelly_article.drop(["content", "link", "source"], axis=1, inplace=True)

# rename columns, published_on -> updateDt
kanye_article.columns = ["title", "updateDt"]
manson_article.columns = ["title", "updateDt"]
kelly_article.columns = ["title", "updateDt"]

# change updateDt data type to Datetime
kanye_article["updateDt"] = pd.to_datetime(kanye_article["updateDt"]).dt.tz_convert('UTC')
manson_article["updateDt"] = pd.to_datetime(manson_article["updateDt"]).dt.tz_convert('UTC')
kelly_article["updateDt"] = pd.to_datetime(kelly_article["updateDt"]).dt.tz_convert('UTC')

# sort by updateDt
kanye_article.sort_values(by="updateDt", inplace=True)
manson_article.sort_values(by="updateDt", inplace=True)
kelly_article.sort_values(by="updateDt", inplace=True)

## show data

In [28]:
print("Kanye West:")
print(kanye_article.info())
print(kanye_article.head(3))
print()

print("Marilyn Manson:")
print(manson_article.info())
print(manson_article.head(3))
print()

print("R. Kelly:")
print(kelly_article.info())
print(kelly_article.head(3))
print()

print("Seungri:")
print(seungri_article.info())
print(seungri_article.head(3))
print()

Kanye West:
<class 'pandas.core.frame.DataFrame'>
Index: 18423 entries, 0 to 18418
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype              
---  ------    --------------  -----              
 0   title     18423 non-null  object             
 1   updateDt  18423 non-null  datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1), object(1)
memory usage: 431.8+ KB
None
                                               title                  updateDt
0  Comment créer un bon mot de passe sécurisé et ... 2018-04-19 13:15:25+00:00
1     Kanye West verstört mit Aussagen zur Sklaverei 2018-05-07 14:59:32+00:00
2  Kanye West receives flack for too-small Yeezy ... 2018-08-29 13:00:16+00:00

Marilyn Manson:
<class 'pandas.core.frame.DataFrame'>
Index: 2256 entries, 0 to 2255
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype              
---  ------    --------------  -----              
 0   title     2256 non-null   object             
 1   updateDt  2256 non

In [29]:
def count_dataset(df, artist_name: str):
    # Specify the date to split the data
    split_date = CELEBRITIES[CELEBRITIES["name"] == artist_name]["cancellation_date"].iloc[0]
    split_date = pd.to_datetime(split_date, utc=True)
    
    print(f"{artist_name}'s canceled date: {split_date}")

    # count the number of dataset before canceled and after
    before_canceled = df[df["updateDt"] < split_date]["title"].count()
    after_canceled = df[df["updateDt"] >= split_date]["title"].count()

    print(
        "{0} before canceled article count is {1}".format(
            artist_name, before_canceled
        )
    )
    print(
        "{0} after canceled article count is {1}\n".format(
            artist_name, after_canceled
        )
    )

    return

In [30]:
count_dataset(kanye_article, "kanye_west")
count_dataset(manson_article, "marilyn_manson")
count_dataset(kelly_article, "r_kelly")
count_dataset(seungri_article, "seungri")

kanye_west's canceled date: 2022-10-25 00:00:00+00:00
kanye_west before canceled article count is 13168
kanye_west after canceled article count is 5255

marilyn_manson's canceled date: 2021-02-21 00:00:00+00:00
marilyn_manson before canceled article count is 883
marilyn_manson after canceled article count is 1373

r_kelly's canceled date: 2021-09-27 00:00:00+00:00
r_kelly before canceled article count is 1281
r_kelly after canceled article count is 2100

seungri's canceled date: 2019-01-31 00:00:00+00:00
seungri before canceled article count is 1037
seungri after canceled article count is 1206



# Data preprocessing

## detection

In [40]:
def detect_languages_parallel(titles):
    """
    Detect languages of multiple titles in parallel
    """
    detected_languages = []
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(detect_and_translate, title) for title in titles]
        for future in tqdm(as_completed(futures), total=len(futures)):
            try:
                detected_languages.append(future.result())
            except Exception as e:
                print(f"Error processing future: {e}")
                detected_languages.append("error")
    return detected_languages

In [41]:
# Apply detect_language function in parallel and save results
kanye_article["response"] = detect_languages_parallel(kanye_article["title"].tolist())
manson_article["response"] = detect_languages_parallel(manson_article["title"].tolist())
kelly_article["response"] = detect_languages_parallel(kelly_article["title"].tolist())
seungri_article["response"] = detect_languages_parallel(seungri_article["title"].tolist())

# Extract source language and translated text from response
for df in [kanye_article, manson_article, kelly_article, seungri_article]:
    df["source"] = df["response"].apply(lambda x: x["detectedLanguage"]["language"])
    df["translated"] = df["response"].apply(lambda x: x["translatedText"])

# Drop response column
kanye_article.drop("response", axis=1, inplace=True)
manson_article.drop("response", axis=1, inplace=True)
kelly_article.drop("response", axis=1, inplace=True)
seungri_article.drop("response", axis=1, inplace=True)

100%|██████████| 18423/18423 [17:54<00:00, 17.15it/s]
100%|██████████| 2256/2256 [03:33<00:00, 10.56it/s]
100%|██████████| 3381/3381 [04:51<00:00, 11.60it/s]
100%|██████████| 2243/2243 [06:18<00:00,  5.93it/s]


In [55]:
kanye_article.head()

Unnamed: 0,title,updateDt,source,translated
0,Comment créer un bon mot de passe sécurisé et ...,2018-04-19 13:15:25+00:00,en,"Taylor Swift, Kylie Jenner top Forbes list of ..."
1,Kanye West verstört mit Aussagen zur Sklaverei,2018-05-07 14:59:32+00:00,de,Kanye West with statements on slavery
2,Kanye West receives flack for too-small Yeezy ...,2018-08-29 13:00:16+00:00,en,Kanye West receives flack for too-small Yeezy ...
3,Kim Kardashian (re)lança a moda dos calções de...,2018-10-24 17:34:28+00:00,en,Comment créer un bon mot de passe sécurisé et ...
4,"""Donda"": Είναι, τελικά, τόσο κακό το νέο άλμπο...",2018-12-24 21:58:00+00:00,en,Kim Kardashian (re)lança a moda dos calções de...


## preprocessing

The full text of the article is so long that the correct compund score does not seem to be measured.  
Also, fasttext language detection does not work properly.  
The title of an article contains only the most important part of the article's content, so it would be better to analyze it based on the title of the article.

In [57]:
def preprocess_article_and_sentiment(df):
    """
    Preprocess the article's title and sentiment analysis it
    """
    # Remove stopwords in parallel
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = {executor.submit(remove_stopwords, text): idx for idx, text in enumerate(df["translated"])}
        for future in tqdm(as_completed(futures), total=len(futures)):
            idx = futures[future]
            try:
                df.at[idx, "remove_stopword"] = future.result()
            except Exception as e:
                print(f"Error removing stopwords for index {idx}: {e}")
                df.at[idx, "remove_stopword"] = None

    # Lemmatization in parallel
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = {executor.submit(get_lemma, text): idx for idx, text in enumerate(df["remove_stopword"])}
        for future in tqdm(as_completed(futures), total=len(futures)):
            idx = futures[future]
            try:
                df.at[idx, "lemmatization"] = future.result()
            except Exception as e:
                print(f"Error lemmatizing text for index {idx}: {e}")
                df.at[idx, "lemmatization"] = None

    return df

In [80]:
processed_article_kanye = preprocess_article_and_sentiment(kanye_article)
processed_article_manson = preprocess_article_and_sentiment(manson_article)
processed_article_kelly = preprocess_article_and_sentiment(kelly_article)
processed_article_seungri = preprocess_article_and_sentiment(seungri_article)

100%|██████████| 18423/18423 [00:00<00:00, 142354.90it/s]
  9%|▉         | 1668/18423 [00:09<01:32, 180.67it/s]


: 

: 

In [None]:
def balance_dataset(df, artist_name):
    # Specify the date to split the data
    split_date = CELEBRITIES[CELEBRITIES["name"] == artist_name][
        "cancellation_date"
    ].iloc[0]
    split_date = pd.to_datetime(split_date, utc=True)

    split_date = datetime.strptime(split_date, "%Y-%m-%d")
    print(f"{artist_name}'s canceled date: {split_date}")

    # Split the data into before and after the specified date
    df_before = df[df["updateDt"] < split_date]
    df_after = df[df["updateDt"] >= split_date]

    print(
        f"{artist_name}'s Before canceled date data: {len(df_before)}, After canceled date data: {len(df_after)}"
    )

    # Determine the number of samples needed to balance the dataset
    n_samples = min(len(df_before), len(df_after))

    # set max comments
    if n_samples > MAX_ARTICLES:
        n_samples = MAX_ARTICLES

    # Downsample the larger dataset to match the smaller dataset
    df_before_downsampled = resample(
        df_before, replace=False, n_samples=n_samples, random_state=42
    )
    df_after_downsampled = resample(
        df_after, replace=False, n_samples=n_samples, random_state=42
    )

    print(
        f"{artist_name}'s Before canceled date data: {len(df_before_downsampled)}, After canceled date data: {len(df_after_downsampled)}\n"
    )

    # Combine the downsampled data
    df_balanced = pd.concat([df_before_downsampled, df_after_downsampled])

    return df_balanced

In [7]:
processed_article_kanye = balance_dataset(processed_article_kanye, 'kanye_west')
processed_article_manson = balance_dataset(processed_article_manson, 'marilyn_manson')
processed_article_kelly = balance_dataset(processed_article_kelly, 'r_kelly')
processed_article_seungri = balance_dataset(processed_article_seungri, 'seungri')

kanye_west's canceled date: 2022-10-25 00:00:00+00:00
kanye_west's Before canceled date data: 13168, After canceled date data: 5255
kanye_west's Before canceled date data: 5255, After canceled date data: 5255

marilyn_manson's canceled date: 2021-02-21 00:00:00+00:00
marilyn_manson's Before canceled date data: 883, After canceled date data: 1373
marilyn_manson's Before canceled date data: 883, After canceled date data: 883

r_kelly's canceled date: 2021-09-27 00:00:00+00:00
r_kelly's Before canceled date data: 1281, After canceled date data: 2100
r_kelly's Before canceled date data: 1281, After canceled date data: 1281

seungri's canceled date: 2019-01-31 00:00:00+00:00
seungri's Before canceled date data: 1037, After canceled date data: 1206
seungri's Before canceled date data: 1037, After canceled date data: 1037



In [59]:
processed_article_kanye.to_csv(
    os.path.join(PROCESSED_DATA_PATH, "kanye_west_articles_processed.csv"), index=False
)
processed_article_manson.to_csv(
    os.path.join(PROCESSED_DATA_PATH, "marilyn_manson_articles_processed.csv"), index=False
)
processed_article_kelly.to_csv(
    os.path.join(PROCESSED_DATA_PATH, "r_kelly_articles_processed.csv"), index=False
)
processed_article_seungri.to_csv(
    os.path.join(PROCESSED_DATA_PATH, "seungri_articles_processed.csv"), index=False
)

# Data analyzing

## read data

In [3]:
processed_article_kanye = pd.read_csv(
    os.path.join(PROCESSED_DATA_PATH, "kanye_west_articles_processed.csv")
)
processed_article_manson = pd.read_csv(
    os.path.join(PROCESSED_DATA_PATH, "marilyn_manson_articles_processed.csv")
)
processed_article_kelly = pd.read_csv(
    os.path.join(PROCESSED_DATA_PATH, "r_kelly_articles_processed.csv")
)
processed_article_seungri = pd.read_csv(
    os.path.join(PROCESSED_DATA_PATH, "seungri_articles_processed.csv")
)

In [4]:
processed_article_kanye["updateDt"] = pd.to_datetime(processed_article_kanye["updateDt"]).dt.tz_convert('UTC')
processed_article_manson["updateDt"] = pd.to_datetime(processed_article_manson["updateDt"]).dt.tz_convert('UTC')
processed_article_kelly["updateDt"] = pd.to_datetime(processed_article_kelly["updateDt"]).dt.tz_convert('UTC')
processed_article_seungri["updateDt"] = pd.to_datetime(processed_article_seungri["updateDt"]).dt.tz_convert('UTC')

## split data by cancel date

In [8]:
kanye_cancellation_date = CELEBRITIES[CELEBRITIES["name"] == "kanye_west"][
    "cancellation_date"
].iloc[0]
kanye_article_before_canceled = processed_article_kanye[
    processed_article_kanye["updateDt"] < kanye_cancellation_date
]
kanye_article_after_canceled = processed_article_kanye[
    processed_article_kanye["updateDt"] >= kanye_cancellation_date
]
print(
    "kanye count before: {0}, count after: {1}".format(
        len(kanye_article_before_canceled),
        len(kanye_article_after_canceled),
    )
)

manson_cancellation_date = CELEBRITIES[CELEBRITIES["name"] == "marilyn_manson"][
    "cancellation_date"
].iloc[0]
manson_article_before_canceled = processed_article_manson[
    processed_article_manson["updateDt"] < manson_cancellation_date
]
manson_article_after_canceled = processed_article_manson[
    processed_article_manson["updateDt"] >= manson_cancellation_date
]
print(
    "manson count before: {0}, count after: {1}".format(
        len(manson_article_before_canceled),
        len(manson_article_after_canceled),
    )
)

kelly_cancellation_date = CELEBRITIES[CELEBRITIES["name"] == "r_kelly"][
    "cancellation_date"
].iloc[0]
kelly_article_before_canceled = processed_article_kelly[
    processed_article_kelly["updateDt"] < kelly_cancellation_date
]
kelly_article_after_canceled = processed_article_kelly[
    processed_article_kelly["updateDt"] >= kelly_cancellation_date
]
print(
    "kelly count before: {0}, count after: {1}".format(
        len(kelly_article_before_canceled),
        len(kelly_article_after_canceled),
    )
)

seungri_cancellation_date = CELEBRITIES[CELEBRITIES["name"] == "seungri"][
    "cancellation_date"
].iloc[0]
seungri_article_before_canceled = processed_article_seungri[
    processed_article_seungri["updateDt"] < seungri_cancellation_date
]
seungri_article_after_canceled = processed_article_seungri[
    processed_article_seungri["updateDt"] >= seungri_cancellation_date
]
print(
    "seungri count before: {0}, count after: {1}".format(
        len(seungri_article_before_canceled),
        len(seungri_article_after_canceled),
    )
)

kanye count before: 5255, count after: 5255
manson count before: 883, count after: 883
kelly count before: 1281, count after: 1281
seungri count before: 1037, count after: 1037


## sentiment analysis and t-test

In [9]:
def calculate_threshold(sample_size, alpha):
    df = sample_size - 1
    threshold = t.ppf(1 - alpha/2, df)
    return threshold

In [17]:
def sentiment_analysis_and_ttest(df_before, df_after):
    senti_analyzer = SentimentIntensityAnalyzer()
    df = pd.DataFrame()
    df2 = pd.DataFrame()

    for i, text in enumerate(df_before["lemmatization"]):
        text = str(text)
        senti_scores = senti_analyzer.polarity_scores(text)
        df = pd.concat([df, pd.DataFrame(senti_scores, index=[i])], axis=0)

    for i, text in enumerate(df_after["lemmatization"]):
        text = str(text)
        senti_scores = senti_analyzer.polarity_scores(text)
        df2 = pd.concat([df2, pd.DataFrame(senti_scores, index=[i])], axis=0)

    print(
        "\n\tbefore canceled mean compound score: {:.3f}".format(df["compound"].mean())
    )
    print("\tafter canceled mean compound score: {:.3f}".format(df2["compound"].mean()))

    threshold = calculate_threshold(len(df) * 2, 0.05)
    print(f"\tthreshold: {threshold}")

    t_stat_ratio, p_value_ratio = scipy.stats.ttest_rel(df["compound"], df2["compound"])
    print("\tt score: {0:.3f}, p-value: {1:.3f}".format(t_stat_ratio, p_value_ratio))
    if t_stat_ratio > threshold and p_value_ratio < 0.05:
        print("\n### reject null hypothesis ###")
    else:
        print("\n### fail to reject null hypothesis ###")
    return

In [11]:
print('### Kanye West paired sample t-test ###')
sentiment_analysis_and_ttest(kanye_article_before_canceled, kanye_article_after_canceled)

### Kanye West paired sample t-test ###

	before canceled mean compound score: 0.016
	after canceled mean compound score: -0.053
	threshold: 1.960189747203735
	t score: 10.449, p-value: 0.000

### reject null hypothesis ###


In [12]:
print('### Marilyn Manson paired sample t-test ###')
sentiment_analysis_and_ttest(manson_article_before_canceled, manson_article_after_canceled)

### Marilyn Manson paired sample t-test ###

	before canceled mean compound score: -0.380
	after canceled mean compound score: -0.318
	threshold: 1.9613089540586846
	t score: -3.202, p-value: 0.001

### fail to reject null hypothesis ###


In [13]:
print('### R.kelly paired sample t-test ###')
sentiment_analysis_and_ttest(kelly_article_before_canceled, kelly_article_after_canceled)

### R.kelly paired sample t-test ###

	before canceled mean compound score: -0.205
	after canceled mean compound score: -0.277
	threshold: 1.9608907216459075
	t score: 4.828, p-value: 0.000

### reject null hypothesis ###


In [18]:
print('### Seungri paired sample t-test ###')
sentiment_analysis_and_ttest(seungri_article_before_canceled, seungri_article_after_canceled)

### Seungri paired sample t-test ###

	before canceled mean compound score: 0.398
	after canceled mean compound score: 0.244
	threshold: 1.961109007877182
	t score: 9.083, p-value: 0.000

### reject null hypothesis ###
