In [None]:
# Data processing
import pandas as pd
import numpy as np
# Text preprocessing
import nltk
# One-time setup: uncomment these lines to download the NLTK resources
# (the stopword list is read in the preprocessing cell; the other two are
# presumably needed by the WordNet lemmatizer -- confirm on a fresh machine).
# nltk.download('stopwords')
# nltk.download('omw-1.4')
# nltk.download('wordnet')
# Shared lemmatizer instance, used by the preprocessing cell.
wn = nltk.WordNetLemmatizer()
# Topic model
from bertopic import BERTopic
# Dimension reduction
from umap import UMAP
import bertopic

## READ CSV

In [None]:
from pathlib import Path

# Build the path with pathlib so the separator is right on every OS; the
# previous raw string "Data\..." only resolved to a file inside Data/ on Windows.
df = pd.read_csv(Path("Data") / "US_youtube_trending_data.csv")


In [None]:
# Rows without a title cannot be tokenised (a missing value has no .split()),
# so drop them first, then keep one row per distinct title.
df = df.dropna(subset=['title']).drop_duplicates(subset=['title'], keep='first')

# `stopwords` stays the original list; the set makes each per-word lookup O(1).
stopwords = nltk.corpus.stopwords.words('english')
stopword_set = set(stopwords)

# Remove stopwords (case-insensitive comparison).
df['review_without_stopwords'] = df['title'].apply(
    lambda title: ' '.join(word for word in title.split() if word.lower() not in stopword_set)
)

# Lemmatize the remaining words.
# NOTE(review): lemmatization runs before punctuation is stripped and on the
# original casing, so tokens such as "Dogs!" are probably left unchanged --
# confirm whether that order is intended before changing it (it alters model input).
df['review_lemmatized'] = df['review_without_stopwords'].apply(
    lambda text: ' '.join(wn.lemmatize(word) for word in text.split() if word not in stopword_set)
)

# Keep only ASCII letters, digits and whitespace, then upper-case the result.
pattern = r'[^a-zA-Z0-9\s]'
df['review_lemmatized'] = (
    df['review_lemmatized']
    .str.replace(pattern, '', regex=True)
    .str.upper()
)


df.head()

## BERTOPIC MODEL 

In [None]:
# Dimension reduction (fixed random_state so repeated runs give the same layout).
umap_model = UMAP(n_neighbors=50,
                  n_components=5,
                  min_dist=0.65,
                  metric='cosine',
                  random_state=100)

# Initiate BERTopic
topic_model = BERTopic(umap_model=umap_model, language="english", calculate_probabilities=False)

# Run BERTopic model.
# Pass a plain list of strings (the documented input type): after
# drop_duplicates the Series index has gaps, and a list removes any risk of
# label-based lookups inside the library hitting a missing index.
topics, probabilities = topic_model.fit_transform(df['review_lemmatized'].tolist())

## TOP TOPICS (TOPIC IDS 0–30)

In [None]:
df_final = df.copy()

# One topic id per fitted document, in the same row order as df.
topic_prediction = topic_model.topics_[:]
df_final['topic'] = topic_prediction

# Attach each row's representative documents by looking them up through the
# row's topic id. Assigning test['Representative_Docs'] directly aligned on the
# row index instead, which put the docs of unrelated topics on the first few
# rows and NaN everywhere else.
test = topic_model.get_topic_info()
representative_docs_by_topic = test.set_index('Topic')['Representative_Docs']
df_final['Docs'] = df_final['topic'].map(representative_docs_by_topic)

In [None]:
# Keep only rows whose topic id lies in 0..30 (both ends included), which also
# excludes negative ids such as -1.
topic_in_range = df_final['topic'].between(0, 30)
filtered_df = df_final[topic_in_range]


## TIME SERIES ANALYSIS

In [None]:
# Topic representations per time bin. Documents and timestamps are passed as
# plain lists (the documented input types) so nothing inside the library depends
# on the DataFrame's gapped row index.
topics_over_time = topic_model.topics_over_time(df_final['review_lemmatized'].tolist(),
                                                df_final['trending_date'].tolist(),
                                                global_tuning=True,
                                                evolution_tuning=True,
                                                nr_bins=30)

# .copy() gives an independent frame, so the column assignment below does not
# write into a slice of topics_over_time (avoids SettingWithCopyWarning).
time_series_data = topics_over_time[topics_over_time['Topic'].between(0, 30)].copy()

# Drop the timezone, then keep only the calendar date of each bin.
time_series_data['Timestamp'] = time_series_data['Timestamp'].dt.tz_localize(None).dt.date




In [None]:
# topic_model.visualize_topics_over_time(topics_over_time)

## INTERTOPIC DISTANCE MAP
### MODIFIED FROM ORIGINAL BERTOPIC GITHUB TO OUTPUT DATAFRAME

In [None]:
import numpy as np
import pandas as pd
from umap import UMAP
from typing import List, Union
from sklearn.preprocessing import MinMaxScaler

import plotly.express as px
import plotly.graph_objects as go


def visualize_topics_data(topic_model,
                     topics: List[int] = None,
                     top_n_topics: int = None,
                     custom_labels: Union[bool, str] = False,
                     title: str = "<b>Intertopic Distance Map</b>",
                     width: int = 650,
                     height: int = 650) -> pd.DataFrame:
    """ Compute the data behind BERTopic's intertopic distance map

    Adapted from BERTopic's `visualize_topics`: instead of drawing a plotly
    figure, the 2D coordinates, label and size of every selected topic are
    returned as a DataFrame so they can be exported or plotted elsewhere.

    Arguments:
        topic_model: A fitted BERTopic instance.
        topics: A selection of topics to include. The order does not matter;
                rows are always returned sorted by topic id.
        top_n_topics: Only select the top n most frequent topics
                      (ignored when `topics` is given).
        custom_labels: If bool, whether to use custom topic labels that were defined using
                       `topic_model.set_topic_labels`.
                       If `str`, it uses labels from other aspects, e.g., "Aspect1".
        title: Unused; kept so the signature matches `visualize_topics`.
        width: Unused; kept so the signature matches `visualize_topics`.
        height: Unused; kept so the signature matches `visualize_topics`.

    Returns:
        A DataFrame with one row per selected topic and the columns
        `x`, `y` (2D UMAP coordinates), `Topic` (topic id), `Words` (label)
        and `Size` (value of `topic_model.topic_sizes_` for the topic).

    Examples:

    ```python
    distance_df = visualize_topics_data(topic_model, topics=[0, 1, 2])
    distance_df.to_csv("distance.csv")
    ```
    """
    # Select topics based on top_n and topics args (outlier topic -1 is excluded
    # from the frequency-based selections).
    freq_df = topic_model.get_topic_freq()
    freq_df = freq_df.loc[freq_df.Topic != -1, :]
    if topics is not None:
        topic_list = sorted(topics)
    elif top_n_topics is not None:
        topic_list = sorted(freq_df.Topic.to_list()[:top_n_topics])
    else:
        topic_list = sorted(freq_df.Topic.to_list())

    # Extract topic words and their frequencies
    frequencies = [topic_model.topic_sizes_[topic] for topic in topic_list]
    if isinstance(custom_labels, str):
        words = [[[str(topic), None]] + topic_model.topic_aspects_[custom_labels][topic] for topic in topic_list]
        words = ["_".join([label[0] for label in labels[:4]]) for labels in words]
        words = [label if len(label) < 30 else label[:27] + "..." for label in words]
    elif custom_labels and topic_model.custom_labels_ is not None:
        words = [topic_model.custom_labels_[topic + topic_model._outliers] for topic in topic_list]
    else:
        words = [" | ".join([word[0] for word in topic_model.get_topic(topic)[:5]]) for topic in topic_list]

    # Row positions of the selected topics inside the model's topic matrices.
    # These must follow `topic_list` (sorted), the same order used for the
    # labels and sizes above; following the caller's unsorted `topics` would
    # attach coordinates to the wrong topic.
    all_topics = sorted(list(topic_model.get_topics().keys()))
    indices = np.array([all_topics.index(topic) for topic in topic_list])

    # Embed the topic representations into 2D
    if topic_model.topic_embeddings_ is not None:
        embeddings = topic_model.topic_embeddings_[indices]
        embeddings = UMAP(n_neighbors=2, n_components=2, metric='cosine', random_state=42).fit_transform(embeddings)
    else:
        # No topic embeddings available: fall back to the scaled c-TF-IDF matrix.
        embeddings = topic_model.c_tf_idf_.toarray()[indices]
        embeddings = MinMaxScaler().fit_transform(embeddings)
        embeddings = UMAP(n_neighbors=2, n_components=2, metric='hellinger', random_state=42).fit_transform(embeddings)

    # Assemble the table that the original function would have plotted
    df = pd.DataFrame({"x": embeddings[:, 0], "y": embeddings[:, 1],
                       "Topic": topic_list, "Words": words, "Size": frequencies})
    return df

In [None]:
# ## Distance DF
# topic_model.visualize_topics(topics=list(range(0,31)))


In [None]:
# Intertopic-distance table for topic ids 0 through 30.
selected_topic_ids = list(range(31))
topic_distance_data = visualize_topics_data(topic_model, topics=selected_topic_ids)


# BIGRAM IMPLEMENTATION

In [None]:
import nltk 
from nltk.collocations import * 


def Bigram_Data(filtered_df):
    """Return the 20 most frequent bigrams of `filtered_df['review_lemmatized']`.

    Every document is split on whitespace and handed to NLTK's
    `BigramCollocationFinder.from_documents`; bigrams counted fewer than
    three times are discarded.

    Arguments:
        filtered_df: DataFrame with a string column `review_lemmatized`.

    Returns:
        DataFrame with the columns `Term` (the two words joined by a space)
        and `Frequency`, ordered by descending frequency, at most 20 rows.
        When no bigram reaches the threshold the frame is empty but still has
        both columns.
    """
    list_documents = [document.split() for document in filtered_df['review_lemmatized']]
    bigram_finder = BigramCollocationFinder.from_documents(list_documents)
    bigram_finder.apply_freq_filter(3)

    # Most frequent first; ties keep the finder's own order (stable sort).
    ngram = sorted(bigram_finder.ngram_fd.items(), key=lambda item: item[-1], reverse=True)
    frequency = [(" ".join(words), count) for words, count in ngram]

    # Name the columns at construction time: with zero surviving bigrams the
    # frame would otherwise have no columns and any later use of 'Frequency'
    # would raise KeyError.
    df_bigrams = pd.DataFrame(frequency, columns=["Term", "Frequency"])

    return df_bigrams.head(20)


## Individual Words

In [None]:
import nltk
from nltk import FreqDist

def Word_Frequency(filtered_df):
    """Return the 20 most frequent single words of `filtered_df['review_lemmatized']`.

    Arguments:
        filtered_df: DataFrame with a string column `review_lemmatized`;
                     each value is split on whitespace.

    Returns:
        DataFrame with the columns `Term` and `Frequency`, ordered by
        descending frequency, at most 20 rows (empty when there are no words).
    """
    all_words = [word for document in filtered_df['review_lemmatized'] for word in document.split()]
    word_freq = FreqDist(all_words)
    df_word_frequencies = pd.DataFrame(list(word_freq.items()), columns=['Term', 'Frequency'])

    # A single descending sort is enough (the original sorted twice).
    return df_word_frequencies.sort_values(by='Frequency', ascending=False).head(20)


## CREATE DATAFRAME THAT CONTAINS BIGRAM & INDIVIDUAL WORD FREQUENCY

In [None]:
def Freqeuncy_all_topics(df):
    """Collect the top bigrams and top single words of every topic in one table.

    Arguments:
        df: DataFrame with a `topic` column and the `review_lemmatized`
            column read by `Bigram_Data` and `Word_Frequency`.

    Returns:
        DataFrame with the columns `Term`, `Frequency`, `Topic` and `Bigram`.
        `Bigram` is the string '1' for rows produced by `Bigram_Data` and '0'
        for rows produced by `Word_Frequency`. For each topic (ascending id)
        the bigram rows come first, then the word rows.
    """
    columns = ["Term", "Frequency", "Topic", "Bigram"]

    # Iterate over the topic ids that are actually present. Looping over
    # range(number_of_topics) only visits the right ids when they happen to be
    # exactly 0..n-1, and otherwise processes empty or wrong topics.
    topic_ids = sorted(df.topic.dropna().unique())

    frames = []
    for topic in topic_ids:
        topic_data = df[df.topic == topic].reset_index()
        word_df = Word_Frequency(topic_data).assign(Topic=topic, Bigram='0')
        bigram_df = Bigram_Data(topic_data).assign(Topic=topic, Bigram='1')
        # Skip empty pieces so the final concat only sees real rows.
        frames.extend(piece for piece in (bigram_df, word_df) if not piece.empty)

    if not frames:
        return pd.DataFrame(columns=columns)

    # One concat at the end instead of growing the frame inside the loop.
    return pd.concat(frames)[columns]


In [None]:

# Top words and bigrams per topic, computed on the topic-filtered rows.
Topic_frequency_df = Freqeuncy_all_topics(df=filtered_df)


## MEAN VIEW COUNT PER TOPIC

In [None]:


# Average view count of each topic, ordered from the lowest to the highest mean.
mean_by_category = (
    filtered_df
    .groupby('topic')['view_count']
    .mean()
    .reset_index()
    .sort_values(by='view_count')
)


## ANOVA & POST HOC TEST

In [None]:
import pandas as pd
from scipy import stats

### ANOVA PREP
# One sample of view counts per topic, in order of first appearance.
topic_labels = filtered_df['topic'].unique()
data_frames = [
    filtered_df.loc[filtered_df['topic'] == label, 'view_count']
    for label in topic_labels
]

# Perform one-way ANOVA
f_statistic, p_value = stats.f_oneway(*data_frames)

alpha = 0.05  # significance level
anova_verdict = (
    "There is significant evidence that at least one topic has a different view count."
    if p_value < alpha
    else "There is no significant difference in view counts between topics."
)
print(anova_verdict)


## POST HOC TEST: PAIRWISE TUKEY

from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multicomp import MultiComparison

# Pairwise comparison of view counts between every pair of topics.
mc = MultiComparison(filtered_df['view_count'], filtered_df['topic'])
result = mc.tukeyhsd()
print(result)



# FINAL DATA OUTPUTS

In [None]:
from pathlib import Path

# Build every output path with pathlib so the separator is right on each OS
# (the raw strings "Final_Data\..." only pointed inside the folder on Windows),
# and create the folder up front so a fresh checkout does not fail on write.
output_dir = Path("Final_Data")
output_dir.mkdir(parents=True, exist_ok=True)

# Topic-filtered rows (index column is written, as before).
filtered_df.to_excel(output_dir / "Data.xlsx")

# Topics over time, in both formats.
time_series_data.to_excel(output_dir / "Time_Series.xlsx", index=False)
time_series_data.to_csv(output_dir / "Time_Series.csv", index=False)

# Intertopic distance table (index column is written, as before).
topic_distance_data.to_excel(output_dir / "Distance_Data.xlsx")
topic_distance_data.to_csv(output_dir / "Distance_Data.csv")

# Word / bigram frequencies per topic.
Topic_frequency_df.to_csv(output_dir / "Frequency_df.csv", index=False)

# Mean view count per topic.
mean_by_category.to_csv(output_dir / "topic_view.csv", index=False)