In [1]:
# Import the sentiment analysis libraries
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
import os

import google.generativeai as genai
import warnings
# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# Read the Gemini API key from the environment; os.getenv yields None when
# GOOGLE_API_KEY is not set.
GOOGLE_API_KEY=os.getenv('GOOGLE_API_KEY')

# Hand the key to the google.generativeai client.
genai.configure(api_key=GOOGLE_API_KEY)

from google.api_core import exceptions
from google.generativeai.types import HarmCategory, HarmBlockThreshold

In [None]:
def analyze_sentiment_gemini(text):
    """Score the sentiment of a video comment with the Gemini model.

    The model is prompted to answer with a single float between -1
    (negative) and 1 (positive). The first number found in its reply is
    parsed and clamped to that range.

    Parameters
    ----------
    text : str
        The comment to analyze; it is appended verbatim to the prompt.

    Returns
    -------
    float or int
        The parsed score in [-1, 1]. ``0`` (neutral) is returned when the
        request fails, the reply has no readable text, or the reply contains
        no number.
    """
    import re  # local import so this cell does not depend on another cell

    model = genai.GenerativeModel('gemini-pro')
    message = "Analyze the following video comment and determine the sentiment score of a given text. Return answer in a single float ranging anywhere from -1 to 1, where -1 is negative sentiment, 0 is neutral sentiment, and 1 is positive sentiment: " + text
    try:
        response = model.generate_content(message, safety_settings={
            HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
            HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE
            }
        )
        # Reading the reply text stays inside the try: the accessor itself
        # can raise when the response carries no usable content.
        reply = response.text
    except Exception:
        # Deliberate best effort: one failed request must not abort scoring
        # a whole file, so the comment is treated as neutral.
        return 0

    # Normalise a typographic minus sign, then take the first number in the
    # reply; the model may wrap the score in extra words or markup.
    reply = (reply or "").replace('\u2212', '-')
    found = re.search(r'[-+]?(?:\d+(?:\.\d*)?|\.\d+)', reply)
    if found is None:
        return 0

    # Clamp so an out-of-range answer cannot leave the documented scale.
    return max(-1.0, min(1.0, float(found.group())))


In [None]:
def analyze_sentiment_vader(text):
    """Return VADER's compound sentiment score for ``text``.

    The ``SentimentIntensityAnalyzer`` is built on the first call and then
    reused. This function is applied once per comment, and rebuilding the
    analyzer each time repeats the same setup work (it loads the VADER
    lexicon) for no benefit.

    Parameters
    ----------
    text : str
        The comment to score.

    Returns
    -------
    The ``'compound'`` entry of the dictionary returned by
    ``polarity_scores`` (which also holds pos, neg and neu scores).
    """
    # Cache the analyzer on the function object so no extra module-level
    # name is needed.
    analyzer = getattr(analyze_sentiment_vader, '_analyzer', None)
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()
        analyze_sentiment_vader._analyzer = analyzer
    return analyzer.polarity_scores(text)['compound']

In [None]:
def analyze_sentiment_textblob(text):
    """Return the polarity that TextBlob assigns to ``text``.

    Wraps the text in a ``TextBlob`` and reads ``sentiment.polarity`` from it.
    """
    return TextBlob(text).sentiment.polarity

In [None]:
def analyze_sentiment(text):
    """Score ``text`` with all three analyzers.

    The analyzers are called in the order Gemini, VADER, TextBlob, and their
    scores are returned as the tuple ``(gemini, vader, textblob)``.
    """
    return (
        analyze_sentiment_gemini(text),
        analyze_sentiment_vader(text),
        analyze_sentiment_textblob(text),
    )

In [None]:
# Directory holding the preprocessed comment files.
data_directory = '../preprocessed_data/'
# Directory that receives the scored CSVs and histogram images.
output_directory = '../sentiment_data/sentiment_score/'
# Collect the cleaned CSV files (names ending in '-cleaned.csv') from the data directory
files = os.listdir(data_directory)
csv_files = [f for f in files if f.endswith('-cleaned.csv')]
# Show the selected file names as the cell output
csv_files

In [None]:
# Score every cleaned comment file with the three analyzers, save the scores
# next to a histogram comparing their distributions.

# Create the output directory once, up front (no-op when it already exists).
os.makedirs(output_directory, exist_ok=True)

for file in csv_files:
    with open(data_directory + file) as f:
        df = pd.read_csv(f)

    # One score column per analyzer, computed comment by comment.
    df['sentiment_gemini'] = df['comment'].apply(analyze_sentiment_gemini)
    df['sentiment_vader'] = df['comment'].apply(analyze_sentiment_vader)
    df['sentiment_textblob'] = df['comment'].apply(analyze_sentiment_textblob)

    # Strip only the extension. Splitting on the first '.' would cut the name
    # short whenever the video title itself contains a dot.
    file_stem = os.path.splitext(file)[0]

    # save the dataframe to a csv file
    df.to_csv(output_directory + file_stem + '_sentiment_score.csv', index=False)
    video_title = file_stem.removesuffix('-cleaned')

    # Draw the three score distributions on one explicit figure.
    fig, ax = plt.subplots()
    for column, label, color in (
        ('sentiment_gemini', 'gemini', 'green'),
        ('sentiment_vader', 'vader', 'blue'),
        ('sentiment_textblob', 'textblob', 'red'),
    ):
        # Missing or non-numeric scores cannot be binned, so leave them out
        # of the plot (the saved CSV keeps them untouched).
        scores = pd.to_numeric(df[column], errors='coerce').dropna()
        ax.hist(scores, bins=20, alpha=0.5, label=label, color=color)
    ax.legend(loc='upper right')
    ax.set_title('Histogram of Sentiment Scores\n' + video_title)
    ax.set_xlabel('Sentiment Score')
    ax.set_ylabel('Frequency')
    fig.savefig(output_directory + file_stem + '_sentiment.png')
    plt.show()
    # Release the figure so a long run does not accumulate open figures.
    plt.close(fig)

In [None]:
# Select the scored files; the next step keeps only rows scoring outside [-0.2, 0.2]
sentiment_data_path = '../sentiment_data/sentiment_score/'
files = os.listdir(sentiment_data_path)
# Match only the '*_sentiment_score.csv' files. The '*_filtered.csv' outputs
# are written into this same directory, so a plain '.csv' match would pick
# them up on a re-run and filter them a second time.
csv_files = [f for f in files if f.endswith('_sentiment_score.csv')]
csv_files

In [None]:
# Rows whose score lies inside [-NEUTRAL_BAND, NEUTRAL_BAND] are treated as
# neutral and dropped.
NEUTRAL_BAND = 0.2

for file in csv_files:
    with open(sentiment_data_path + file) as f:
        df = pd.read_csv(f)

    # Strip only the extension. Splitting on the first '.' would cut the name
    # short whenever the video title itself contains a dot.
    file_stem = os.path.splitext(file)[0]

    # Same filter / sort / save steps for each analyzer's score column.
    for analyzer in ('vader', 'textblob', 'gemini'):
        score_column = 'sentiment_' + analyzer
        non_neutral = df[(df[score_column] < -NEUTRAL_BAND) | (df[score_column] > NEUTRAL_BAND)]
        # Most positive first; each file is written once, already sorted.
        non_neutral = non_neutral.sort_values(by=score_column, ascending=False)
        non_neutral.to_csv(sentiment_data_path + file_stem + '_' + analyzer + '_filtered.csv', index=False)

In [None]:
# cluster the data
from sentence_transformers import SentenceTransformer
import json
import pandas as pd
from sklearn.mixture import GaussianMixture as GMM
import os

# Sets TOKENIZERS_PARALLELISM to 'false' in the process environment
# (presumably read by the tokenizer library behind the embedding model — confirm).
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
# NOTE(review): the filtering cell writes its '*_filtered.csv' files under
# '../sentiment_data/sentiment_score/', but this cell lists '../sentiment_data/'.
# Confirm the filtered files are expected in this directory.
sentiment_data_path = '../sentiment_data/'
files = os.listdir(sentiment_data_path)
# Keep only the files whose names end in 'filtered.csv'
csv_files = [f for f in files if f.endswith('filtered.csv')]

In [None]:
# Cluster the comments of every filtered file with a Gaussian Mixture Model
# fitted on sentence embeddings.
N_CLUSTERS = 3

# Load the pre-trained embedding model once; it does not depend on the file.
model = SentenceTransformer('all-MiniLM-L6-v2')

for file in csv_files:
    with open(sentiment_data_path + file) as f:
        df = pd.read_csv(f)

    # The mixture cannot be fitted with fewer samples than components.
    if len(df) < N_CLUSTERS:
        print(f"Skipping {file}: {len(df)} comments, need at least {N_CLUSTERS}")
        continue

    # Encode all comments in one batched call: one row per comment, one
    # column per embedding dimension, aligned with df's index.
    embeddings = pd.DataFrame(model.encode(df['comment'].tolist()), index=df.index)

    # Fixed random_state so re-running the cell reproduces the same clusters.
    gmm = GMM(n_components=N_CLUSTERS, random_state=0).fit(embeddings)
    # predict the cluster
    labels = gmm.predict(embeddings)
    # add the cluster to the dataframe
    df['cluster'] = labels
    # Strip only the extension so a dot inside the video title is preserved.
    df.to_csv(sentiment_data_path + os.path.splitext(file)[0] + '_clustered.csv', index=False)

In [None]:
# Rewrite every '*_clustered.csv' file so that it keeps only the score column
# of its own analyzer and is ordered by cluster, then by descending score.
score_columns = ['sentiment_vader', 'sentiment_textblob', 'sentiment_gemini']

files = os.listdir(sentiment_data_path)
clustered_file = [name for name in files if name.endswith('_clustered.csv')]

for file in clustered_file:
    with open(sentiment_data_path + file) as f:
        df = pd.read_csv(f)

    # The analyzer is inferred from the file name; anything that is neither
    # vader nor textblob is treated as gemini.
    # NOTE(review): this is a substring match on the whole file name, so a
    # video title containing 'vader' or 'textblob' would be misrouted.
    if 'vader' in file:
        kept_column = 'sentiment_vader'
    elif 'textblob' in file:
        kept_column = 'sentiment_textblob'
    else:
        kept_column = 'sentiment_gemini'

    # Drop the other analyzers' columns, but only those actually present.
    other_columns = [col for col in score_columns if col != kept_column and col in df.keys()]
    df = df.drop(columns=other_columns)

    df = df.sort_values(by=['cluster', kept_column], ascending=[True, False])
    df.to_csv(sentiment_data_path + file, index=False)

In [None]:
# cluster the data
from sentence_transformers import SentenceTransformer
import json
import pandas as pd
from sklearn.mixture import GaussianMixture as GMM
import os

# Cluster the strongly positive and strongly negative Gemini-scored comments
# of one video separately, and save each group with its cluster labels.
fox_trial_stem = "../sentiment_data/Fox News-Trump ordered to pay $364M, found liable in civil fraud trial-cleaned_sentiment_gemini_filtered"

# Only the read needs the file handle; everything else runs after it is closed.
with open(fox_trial_stem + ".csv") as f:
    df = pd.read_csv(f)

# .copy() makes these independent frames, so adding the 'cluster' column
# below is not an assignment on a slice of df.
# get the positive comments
positive_comments = df[df['sentiment_gemini'] > 0.7].copy()
# get the negative comments
negative_comments = df[df['sentiment_gemini'] < -0.7].copy()

# load the pre-trained model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode each group in one batched call: one row per comment, one column per
# embedding dimension, aligned with the group's index.
positive_embeddings = pd.DataFrame(
    model.encode(positive_comments['comment'].tolist()),
    index=positive_comments.index,
)
negative_embeddings = pd.DataFrame(
    model.encode(negative_comments['comment'].tolist()),
    index=negative_comments.index,
)

# Fit one Gaussian Mixture Model per group; fixed random_state so re-running
# the cell reproduces the same clusters.
gmm_positive = GMM(n_components=4, random_state=0).fit(positive_embeddings)
gmm_negative = GMM(n_components=10, random_state=0).fit(negative_embeddings)
# predict the cluster
positive_labels = gmm_positive.predict(positive_embeddings)
negative_labels = gmm_negative.predict(negative_embeddings)

# save the cluster to the dataframe
positive_comments['cluster'] = positive_labels
negative_comments['cluster'] = negative_labels
# save the dataframe to a csv file
positive_comments.to_csv(fox_trial_stem + "_positive_clustered.csv", index=False)
negative_comments.to_csv(fox_trial_stem + "_negative_clustered.csv", index=False)


In [None]:
# Sort the positive and negative clustered files by cluster, then by
# descending Gemini score, rewriting each file in place.
fox_trial_stem = "../sentiment_data/Fox News-Trump ordered to pay $364M, found liable in civil fraud trial-cleaned_sentiment_gemini_filtered"

for polarity in ("positive", "negative"):
    clustered_path = fox_trial_stem + "_" + polarity + "_clustered.csv"
    # Finish reading and close the handle before overwriting the same path.
    with open(clustered_path) as f:
        df = pd.read_csv(f)
    df = df.sort_values(by=['cluster', 'sentiment_gemini'], ascending=[True, False])
    df.to_csv(clustered_path, index=False)

In [None]:
# Use TF-IDF Measurement to decide the best cluster number
from sklearn.feature_extraction.text import TfidfVectorizer


In [None]:
# List the sentiment data directory and keep only the files whose names end
# in 'gemini_filtered_positive_clustered.csv'.
files = os.listdir(sentiment_data_path)
clustered_file = [f for f in files if f.endswith('gemini_filtered_positive_clustered.csv')]