In [None]:

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import datetime



# NEUROTIC DESCRIPTIVES
#
# Summary statistics and a box plot for the "neurotic" tweets: engagement
# counts (retweets, replies, likes, quotes) plus the offensiveness columns.

dfneurotic = pd.read_csv('2neurotic_offensive_analysis.csv', low_memory=False)

# Restrict the frame to the columns that are summarised below.
dfneurotic_descr_tweet = dfneurotic[[
    'retweetCount', 'replyCount', 'likeCount', 'quoteCount',
    'predominant_label', 'offensive', 'not_offensive',
]]

# Keep only count, mean, standard deviation, minimum and maximum from describe().
descriptive_stats_neurotic = (
    dfneurotic_descr_tweet
    .describe()
    .loc[['count', 'mean', 'std', 'min', 'max']]
)

print("Descriptive Statistics Table for Neurotic:")
print(descriptive_stats_neurotic)

# Box plot of the four engagement counts on an explicit 12x8 inch figure.
fig, ax = plt.subplots(figsize=(12, 8))
dfneurotic_descr_tweet.boxplot(
    column=['retweetCount', 'replyCount', 'likeCount', 'quoteCount'],
    ax=ax,
)
ax.set_xlabel('Interaction Type')
ax.set_ylabel('Count')
ax.set_title('Box Plot of Neurotic Metadata')

# Write the figure to disk as a tightly cropped PDF.
fig.savefig('Neurotic_Metadata_Box_Plot.pdf', format='pdf', bbox_inches='tight')

# NEUROTICISM DESCRIPTIVES
#
# Summary statistics and a box plot for the "neuroticism" tweets: engagement
# counts (retweets, replies, likes, quotes) plus the offensiveness columns.

dfneuroticism = pd.read_csv('2neuroticism_offensive_analysis.csv', low_memory=False)

# Restrict the frame to the columns that are summarised below.
dfneuroticism_descr_tweet = dfneuroticism[[
    'retweetCount', 'replyCount', 'likeCount', 'quoteCount',
    'predominant_label', 'offensive', 'not_offensive',
]]

# Keep only count, mean, standard deviation, minimum and maximum from describe().
descriptive_stats_neuroticism = (
    dfneuroticism_descr_tweet
    .describe()
    .loc[['count', 'mean', 'std', 'min', 'max']]
)

print("Descriptive Statistics Table for Neuroticism:")
print(descriptive_stats_neuroticism)

# Box plot of the four engagement counts on an explicit 12x8 inch figure.
fig, ax = plt.subplots(figsize=(12, 8))
dfneuroticism_descr_tweet.boxplot(
    column=['retweetCount', 'replyCount', 'likeCount', 'quoteCount'],
    ax=ax,
)
ax.set_xlabel('Interaction Type')
ax.set_ylabel('Count')
ax.set_title('Box Plot of Neuroticism Metadata')

# Write the figure to disk as a tightly cropped PDF.
fig.savefig('Neuroticism_Metadata_Box_Plot.pdf', format='pdf', bbox_inches='tight')

# TWEET FREQUENCY GRAPH
#
# Tweets per 30-day window for both keywords, drawn on twin y-axes because the
# two series are shown on different count ranges (0-10000 vs 0-1500).

# Parse the date columns on load so that 'date' can drive the time grouping.
tweet_df1 = pd.read_csv('dat_neurotic_short.csv', parse_dates=['date', 'acc_created'])
tweet_df2 = pd.read_csv('dat_neuroticism_short.csv', parse_dates=['date', 'acc_created'])

# Number of tweets in each 30-day bin:
#   tweet_df_pa -> "neurotic" tweets, tweet_df_pr -> "neuroticism" tweets.
tweet_df_pa = (
    tweet_df1
    .groupby(pd.Grouper(key='date', freq='30d', convention='start'))
    .size()
)
tweet_df_pr = (
    tweet_df2
    .groupby(pd.Grouper(key='date', freq='30d', convention='start'))
    .size()
)

fig, ax = plt.subplots()

# Left axis (crimson): neurotic tweets.
ax.plot(tweet_df_pa, label='Neurotic Tweets', color='crimson')
ax.spines['left'].set_color('crimson')
ax.tick_params(axis='y', colors='crimson')
ax.set_ylim(0, 10000)

# Right axis (steelblue): neuroticism tweets.
ax_twin = ax.twinx()
ax_twin.plot(tweet_df_pr, label='Neuroticism Tweets', color='steelblue')
ax_twin.spines['right'].set_edgecolor('steelblue')
ax_twin.tick_params(axis='y', colors='steelblue')
ax_twin.set_ylim(0, 1500)

# Show 2015-01-13 through 2021-08-31; the twin axes share this x-axis.
ax_twin.set_xlim(datetime.date(2015, 1, 13), datetime.date(2021, 8, 31))

# Axis labels and title.
ax.set_xlabel('Time in Years')
ax.set_ylabel('Tweets per 30 days', color='black')
ax_twin.set_ylabel('Neuroticism Tweets Count', color='steelblue')
ax.set_title("Tweet Frequency Over Time")

# One legend per axis, placed in opposite corners.
ax_twin.legend(loc='upper right')
ax.legend(loc='upper left')

fig.savefig("TweetFrequency_red_blue.pdf", format="pdf", bbox_inches="tight")

In [None]:
# Tweet frequency per 30 days for both keywords, using the datasets with added
# columns and an x-axis that runs to 2023-03-25.

# Load datasets and parse 'date' and 'acc_created' columns as dates
tweet_df1 = pd.read_csv('dat_neurotic_added_columns.csv', parse_dates=['date', 'acc_created'])
tweet_df2 = pd.read_csv('dat_neuroticism_added_columns.csv', parse_dates=['date', 'acc_created'])

# Group tweets by 30-day intervals and count them
tweet_df_pa = tweet_df1.groupby(pd.Grouper(key='date', freq='30d', convention='start')).size()
tweet_df_pr = tweet_df2.groupby(pd.Grouper(key='date', freq='30d', convention='start')).size()

# Setup figure and axes for plotting
fig, ax = plt.subplots()

# Primary (left) y-axis: Neurotic tweets
ax.plot(tweet_df_pa, label='Neurotic Tweets', color='C1')
ax.tick_params(axis='y', colors='C1')
# Set the limit on the primary axis explicitly. The previous plt.ylim() call ran
# after twinx() and therefore acted on the secondary axis instead.
ax.set_ylim(0, 15000)

# Secondary (right) y-axis: Neuroticism tweets
ax_twin = ax.twinx()
ax_twin.plot(tweet_df_pr, label='Neuroticism Tweets', color='C0')
ax_twin.set_ylabel('Neuroticism Tweets Count', color='C0')
ax_twin.tick_params(axis='y', colors='C0')
# Anchor the secondary axis at zero as well; its upper bound stays autoscaled.
ax_twin.set_ylim(bottom=0)

# x-axis range (shared by both axes)
ax.set_xlim(datetime.date(2015, 1, 13), datetime.date(2023, 3, 25))

# Set common labels
ax.set_xlabel('Time in Years')
ax.set_ylabel('Tweets per 30 days', color='black')

# Add legends for both datasets
ax.legend(loc='upper left')
ax_twin.legend(loc='upper right')

# Save the plot to a PDF file
fig.savefig("TweetFrequency2.pdf", format="pdf", bbox_inches="tight")


In [None]:
#OFFENSIVE LANGUAGE DETECTION IN NEUROTIC TWEETS (when conducted as slurm)

from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax
import pandas as pd
import time 
from ast import literal_eval
import tweetnlp

# Wall-clock start of the run (not read anywhere else in this cell).
program_start = time.time()


def preprocess(text):
    """Mask user mentions and links in a tweet before classification.

    The text is split on single spaces. Every token that starts with '@' and
    is longer than one character becomes '@user'; every token that starts with
    'http' becomes 'http'. All other tokens are kept unchanged.

    Parameters
    ----------
    text : str or None or float
        Raw tweet text. Missing values (None or NaN, which pandas yields for
        empty CSV fields) are treated as an empty tweet instead of raising.
        Any other non-string value is converted with str().

    Returns
    -------
    str
        The tokens re-joined with single spaces.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    masked_tokens = []
    for token in str(text).split(" "):
        if token.startswith('@') and len(token) > 1:
            token = '@user'
        elif token.startswith('http'):
            token = 'http'
        masked_tokens.append(token)
    return " ".join(masked_tokens)

# Score every "neurotic" tweet with the offensive-language classifier and write
# the scores next to the original columns in '2neurotic_offensive_analysis.csv'.

import torch  # backend of the 'pt' tensors below; needed here for no_grad()

# Paths to local model, tokenizer, and config.
# NOTE: absolute, machine-specific paths; adjust when running elsewhere.
local_model_path = '/home/s2463873/myenv/twitter-roberta-base-offensive'
local_tokenizer_path = '/home/s2463873/myenv/twitter-roberta-base-offensive'
local_config_path = '/home/s2463873/myenv/twitter-roberta-base-offensive'

# Load model, tokenizer, and config from local files
tokenizer = AutoTokenizer.from_pretrained(local_tokenizer_path)
config = AutoConfig.from_pretrained(local_config_path)
model = AutoModelForSequenceClassification.from_pretrained(local_model_path)
model.eval()  # make inference mode explicit

# Load dataframe from file
df = pd.read_csv('dat_neurotic_short.csv')
result = {}

# Class names in the order of the model's output logits (index 0, index 1).
labels = ['not offensive', 'offensive']

# Inference only: no_grad() avoids building an autograd graph for every tweet.
with torch.no_grad():
    for i, row in df.iterrows():
        text = preprocess(row['rawContent'])
        myid = row['id']

        # Tokenize the text, truncated to at most 512 tokens
        encoded_input = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)

        # Get model output
        output = model(**encoded_input)

        # Turn the logits of the single input into class probabilities
        scores = softmax(output.logits[0].detach().numpy())
        formatted_probabilities = [round(float(score), 4) for score in scores]

        # The label with the highest probability is the predominant one
        label_indices = np.argsort(scores)[::-1]
        predominant_label = labels[label_indices[0]]

        # Store results keyed by tweet id
        result[myid] = {
            'predominant_label': predominant_label,
            'not_offensive': formatted_probabilities[0],
            'offensive': formatted_probabilities[1],
        }

# Convert results to a DataFrame with the tweet id as a regular column
results_df = pd.DataFrame.from_dict(result, orient='index').reset_index().rename(columns={'index': 'id'})
# Merge with original DataFrame
merged_df = df.merge(results_df, on='id', how='left')
# Save merged DataFrame
merged_df.to_csv('2neurotic_offensive_analysis.csv', index=False)


In [None]:
#OFFENSIVE LANGUAGE DETECTION IN NEUROTICISM TWEETS (when conducted as slurm)

from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig
import numpy as np
from scipy.special import softmax
import pandas as pd
import time 
from ast import literal_eval
import tweetnlp

# Wall-clock start of the run (not read anywhere else in this cell).
program_start = time.time()


def preprocess(text):
    """Mask user mentions and links in a tweet before classification.

    The text is split on single spaces. Every token that starts with '@' and
    is longer than one character becomes '@user'; every token that starts with
    'http' becomes 'http'. All other tokens are kept unchanged.

    Parameters
    ----------
    text : str or None or float
        Raw tweet text. Missing values (None or NaN, which pandas yields for
        empty CSV fields) are treated as an empty tweet instead of raising.
        Any other non-string value is converted with str().

    Returns
    -------
    str
        The tokens re-joined with single spaces.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    masked_tokens = []
    for token in str(text).split(" "):
        if token.startswith('@') and len(token) > 1:
            token = '@user'
        elif token.startswith('http'):
            token = 'http'
        masked_tokens.append(token)
    return " ".join(masked_tokens)

# Score every "neuroticism" tweet with the offensive-language classifier and write
# the scores next to the original columns in '2neuroticism_offensive_analysis.csv'.

import torch  # backend of the 'pt' tensors below; needed here for no_grad()

# Paths to local model, tokenizer, and config.
# NOTE: absolute, machine-specific paths; adjust when running elsewhere.
local_model_path = '/home/s2463873/myenv/twitter-roberta-base-offensive'
local_tokenizer_path = '/home/s2463873/myenv/twitter-roberta-base-offensive'
local_config_path = '/home/s2463873/myenv/twitter-roberta-base-offensive'

# Load model, tokenizer, and config from local files
tokenizer = AutoTokenizer.from_pretrained(local_tokenizer_path)
config = AutoConfig.from_pretrained(local_config_path)
model = AutoModelForSequenceClassification.from_pretrained(local_model_path)
model.eval()  # make inference mode explicit

# Load dataframe from file
df = pd.read_csv('dat_neuroticism_short.csv')
result = {}

# Class names in the order of the model's output logits (index 0, index 1).
labels = ['not offensive', 'offensive']

# Inference only: no_grad() avoids building an autograd graph for every tweet.
with torch.no_grad():
    for i, row in df.iterrows():
        text = preprocess(row['rawContent'])
        myid = row['id']

        # Tokenize the text, truncated to at most 512 tokens
        encoded_input = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)

        # Get model output
        output = model(**encoded_input)

        # Turn the logits of the single input into class probabilities
        scores = softmax(output.logits[0].detach().numpy())
        formatted_probabilities = [round(float(score), 4) for score in scores]

        # The label with the highest probability is the predominant one
        label_indices = np.argsort(scores)[::-1]
        predominant_label = labels[label_indices[0]]

        # Store results keyed by tweet id
        result[myid] = {
            'predominant_label': predominant_label,
            'not_offensive': formatted_probabilities[0],
            'offensive': formatted_probabilities[1],
        }

# Convert results to a DataFrame with the tweet id as a regular column
results_df = pd.DataFrame.from_dict(result, orient='index').reset_index().rename(columns={'index': 'id'})
# Merge with original DataFrame
merged_df = df.merge(results_df, on='id', how='left')
# Save merged DataFrame
merged_df.to_csv('2neuroticism_offensive_analysis.csv', index=False)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from datetime import date


# Mean offensiveness over time for both keywords
#
# Plots the mean of the 'offensive' column per 30-day window for each keyword,
# both on a 0-1 scale, and saves the figure as a PDF.

# Read both scored datasets with 'date' and 'acc_created' parsed as datetimes.
tweet_df_neurotic = pd.read_csv(
    '2neurotic_offensive_analysis.csv',
    parse_dates=['date', 'acc_created'],
)
tweet_df_neuroticism = pd.read_csv(
    '2neuroticism_offensive_analysis.csv',
    parse_dates=['date', 'acc_created'],
)

# Mean 'offensive' value per 30-day window.
tweet_df_neurotic_mean = (
    tweet_df_neurotic
    .groupby(pd.Grouper(key='date', freq='30d'))
    .agg({'offensive': 'mean'})
)
# For neuroticism, gaps in the 30-day series are additionally interpolated.
tweet_df_neuroticism_mean = (
    tweet_df_neuroticism
    .groupby(pd.Grouper(key='date', freq='30d'))
    .agg({'offensive': 'mean'})
    .asfreq('30d')
    .interpolate()
)

fig, ax = plt.subplots(figsize=(10, 6))

# Neurotic series on the left axis (red).
ax.plot(
    tweet_df_neurotic_mean.index,
    tweet_df_neurotic_mean['offensive'],
    label='Offensive Tweets (Neurotic)',
    color='red',
    linewidth=2,
)

# Neuroticism series on a twin right axis (blue).
ax2 = ax.twinx()
ax2.plot(
    tweet_df_neuroticism_mean.index,
    tweet_df_neuroticism_mean['offensive'],
    label='Offensive Tweets (Neuroticism)',
    color='blue',
    linewidth=2,
)

# Labels, grid and identical 0-1 limits on both y-axes.
ax.set_xlabel('Time in Years')
ax.set_ylabel('Mean Offensiveness Score')
ax.grid(True)
ax.set_ylim(0, 1)
ax2.set_ylim(0, 1)
ax2.set_xlim(pd.Timestamp('2015-01-13'), pd.Timestamp('2021-08-31'))

# One legend per axis.
ax.legend(loc='upper left')
ax2.legend(loc='upper right')

# One tick per year, labelled with the four-digit year.
ax.xaxis.set_major_locator(mdates.YearLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y'))

ax2.set_title("Mean Offensiveness over Time", fontsize=14)
fig.tight_layout()
fig.savefig("2Mean_offensiveness_over_time.pdf", format="pdf", bbox_inches="tight")


# Neurotic offensiveness volume over time
#
# Sums the 'offensive' and 'not_offensive' columns per 30-day window. In the
# detection cells of this notebook those columns hold per-tweet class
# probabilities, so each sum is probability mass per window rather than a
# count of tweets carrying that label.
neurotic_by_window = tweet_df_neurotic.groupby(
    pd.Grouper(key='date', freq='30d', convention='start')
)
tweet_df_neurotic_of = neurotic_by_window['offensive'].sum()
tweet_df_neurotic_not = neurotic_by_window['not_offensive'].sum()

fig, ax = plt.subplots()

# Left axis (darkred): offensive volume, with 1000 units of headroom.
ax.plot(tweet_df_neurotic_of.index, tweet_df_neurotic_of, label='Neurotic Offensive', color='darkred')
ax.set_ylim(0, tweet_df_neurotic_of.max() + 1000)
ax.tick_params(axis='y', colors='darkred')
ax.spines['left'].set_color('darkred')

# Right axis (salmon): not-offensive volume, with 1000 units of headroom.
ax_twin = ax.twinx()
ax_twin.plot(tweet_df_neurotic_not.index, tweet_df_neurotic_not, label='Neurotic Not Offensive', color='salmon')
ax_twin.set_ylim(0, tweet_df_neurotic_not.max() + 1000)
ax_twin.tick_params(axis='y', colors='salmon')
ax_twin.spines['right'].set_color('salmon')

# Visible date range.
ax.set_xlim(date(2015, 1, 13), date(2021, 8, 31))

# Axis labels in the colour of the series they describe.
ax.set_xlabel('Time in Years')
ax.set_ylabel('Number of Neurotic Offensive Tweets per 30 days', color='darkred')
ax_twin.set_ylabel('Number of Neurotic Not Offensive Tweets per 30 days', color='salmon')

ax.set_title("Neurotic Tweet Offensiveness Volume Over Time")

# Thin dashed gridlines.
ax.grid(True, which='both', linestyle='--', linewidth=0.5)

# One legend per axis.
ax.legend(loc='upper left')
ax_twin.legend(loc='upper right')

fig.savefig("2Neurotic_offensive_volume_over_time.pdf", format="pdf", bbox_inches="tight")


# Neuroticism offensiveness volume over time
#
# Sums the 'offensive' and 'not_offensive' columns per 30-day window. In the
# detection cells of this notebook those columns hold per-tweet class
# probabilities, so each sum is probability mass per window rather than a
# count of tweets carrying that label.
neuroticism_by_window = tweet_df_neuroticism.groupby(
    pd.Grouper(key='date', freq='30d', convention='start')
)
tweet_df_neuroticism_of = neuroticism_by_window['offensive'].sum()
tweet_df_neuroticism_not = neuroticism_by_window['not_offensive'].sum()

fig, ax = plt.subplots()

# Left axis (navy): offensive volume, with 100 units of headroom.
ax.plot(
    tweet_df_neuroticism_of.index,
    tweet_df_neuroticism_of,
    label='Neuroticism Offensive',
    color='navy',
    linewidth=2,
)
ax.set_ylim(0, tweet_df_neuroticism_of.max() + 100)
ax.set_ylabel('Number of Offensive Tweets', color='navy')
ax.tick_params(axis='y', colors='navy')
ax.spines['left'].set_color('navy')

# Right axis (skyblue): not-offensive volume, with 100 units of headroom.
ax_twin = ax.twinx()
ax_twin.plot(
    tweet_df_neuroticism_not.index,
    tweet_df_neuroticism_not,
    label='Neuroticism Not Offensive',
    color='skyblue',
    linewidth=2,
)
ax_twin.set_ylim(0, tweet_df_neuroticism_not.max() + 100)
ax_twin.set_ylabel('Number of Not Offensive Tweets', color='skyblue')
ax_twin.tick_params(axis='y', colors='skyblue')
ax_twin.spines['right'].set_color('skyblue')

# Visible date range.
ax.set_xlim(date(2015, 1, 13), date(2021, 8, 31))

ax.set_xlabel('Time in Years')
ax.set_title("Neuroticism Tweet Offensiveness Volume Over Time")

# Thin dashed gridlines for readability.
ax.grid(True, which='both', linestyle='--', linewidth=0.5)

# One legend per axis.
ax.legend(loc='upper left')
ax_twin.legend(loc='upper right')

fig.savefig("Neuroticism_Offensiveness_volume_over_time.pdf", format="pdf", bbox_inches="tight")

In [None]:
#TOPIC MODELLING NEUROTIC TWEETS 

import pandas as pd
from sentence_transformers import SentenceTransformer
import pickle
from datetime import datetime
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP

# Fit a BERTopic model on the "neurotic" tweets and persist the embeddings,
# the fitted model and the topics-over-time table.

# Tweets and their timestamps, in file order.
df = pd.read_csv('dat_neurotic_short.csv')
tweets = df['rawContent'].tolist()
timestamps = df['date'].tolist()

# Sentence embeddings for every tweet.
sentence_model = SentenceTransformer("all-mpnet-base-v2")
embeddings = sentence_model.encode(tweets, show_progress_bar=True)

# Persist the embeddings under a name that carries the current date and time.
embeddings_stamp = datetime.now().strftime('%Y-%m-%d %H_%M_%S')
embeddings_filename = f"neurotic_Npp_embeddings_all-mpnet-base-v2 {embeddings_stamp}.pkl"
with open(embeddings_filename, 'wb') as out_handle:
    pickle.dump(embeddings, out_handle)

# Read the embeddings back from the file that was just written.
with open(embeddings_filename, 'rb') as in_handle:
    embeddings = pickle.load(in_handle)

# English stop words are excluded from the topic representations.
vectorizer_model = CountVectorizer(stop_words="english")

# Dimensionality reduction settings.
umap_model = UMAP(
    n_neighbors=10,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
)

# Topic model configuration, then fit on the precomputed embeddings.
topic_model = BERTopic(
    verbose=True,
    nr_topics="auto",
    min_topic_size=300,
    vectorizer_model=vectorizer_model,
    umap_model=umap_model,
)
topics, probs = topic_model.fit_transform(tweets, embeddings)

# Persist the fitted model; the name records the main settings.
topic_model_filename = "TOPICS_neurotic_Npp_all-mpnet-base-v2_auto_mts=300_nneigh=10"
topic_model.save(topic_model_filename)

# Topic frequencies over time, split into 40 bins.
topics_over_time = topic_model.topics_over_time(tweets, timestamps, nr_bins=40)

# Persist the topics-over-time table, again with a date-and-time suffix.
tot_stamp = datetime.now().strftime('%Y-%m-%d %H_%M_%S')
tot_filename = f"TOT_neurotic_Npp_all-mpnet-base-v2_auto_mts=300_nneigh=10 {tot_stamp}.pkl"
with open(tot_filename, 'wb') as out_handle:
    pickle.dump(topics_over_time, out_handle)


In [None]:
#TOPIC MODELLING NEUROTICISM TWEETS


import pandas as pd
from sentence_transformers import SentenceTransformer
import pickle
from datetime import datetime
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from umap import UMAP

# Fit a BERTopic model on the "neuroticism" tweets and persist the embeddings,
# the fitted model and the topics-over-time table.

#Load dataframe from file
df = pd.read_csv('dat_neuroticism_short.csv')
tweets = df.rawContent.to_list()
timestamps = df.date.to_list()

#Generate embeddings
sentence_model = SentenceTransformer("all-mpnet-base-v2")
embeddings = sentence_model.encode(tweets, show_progress_bar=True)

#Save the embeddings with a timestamp
embeddings_filename = f"neuroticism_Npp_embeddings_all-mpnet-base-v2 {datetime.now().strftime('%Y-%m-%d %H_%M_%S')}.pkl"

with open(embeddings_filename, 'wb') as file:
    pickle.dump(embeddings, file)

#Load embeddings from the saved file
with open(embeddings_filename, 'rb') as file:
    embeddings = pickle.load(file)

#English stop words are excluded from the topic representations
vectorizer_model = CountVectorizer(stop_words="english")

#Setting UMAP variables
umap_model = UMAP(n_neighbors=6, n_components=5, min_dist=0.0, metric='cosine')

#Defining the BERTopic parameters
topic_model = BERTopic(verbose=True, nr_topics="auto", min_topic_size=21, vectorizer_model=vectorizer_model, umap_model=umap_model)
topics, probs = topic_model.fit_transform(tweets, embeddings)

#Save the topic model
topic_model_filename = "TOPICS_neuroticism_Npp_all-mpnet-base-v2_auto_mts=21_nneigh=6"
topic_model.save(topic_model_filename)

#Create topics over time
topics_over_time = topic_model.topics_over_time(tweets, timestamps, nr_bins=40)

#Save the topics over time.
#The file is named after the neuroticism dataset; it previously carried the
#"TOT_neurotic_" prefix, which mislabelled it as output of the neurotic run.
tot_filename = f"TOT_neuroticism_Npp_all-mpnet-base-v2_auto_mts=21_nneigh=6 {datetime.now().strftime('%Y-%m-%d %H_%M_%S')}.pkl"

with open(tot_filename, 'wb') as file:
    pickle.dump(topics_over_time, file)


In [None]:
import os
from openai import OpenAI
import pandas as pd
from bertopic import BERTopic

# Generate a short label for every neuroticism topic with the OpenAI chat API
# and write the labels to a text file.

# Read the API key from the environment instead of embedding it in the notebook.
# A missing OPENAI_API_KEY raises KeyError before any request is made.
client = OpenAI(
    api_key=os.environ["OPENAI_API_KEY"]
)

# Load the pre-trained BERTopic model
topic_model = BERTopic.load("TOPICS_neuroticism_Npp_all-mpnet-base-v2_auto_mts=21_nneigh=6")

# Load your dataset
df = pd.read_csv('dat_neuroticism_short.csv')
tweets = df.rawContent.to_list()

# Get topics and their keywords
topics = topic_model.get_topics()

# Dictionary to store generated topic labels
generated_topic_labels = {}

# Iterating over each topic to generate labels
for topic_num, keywords in topics.items():
    if topic_num == -1:  # Skip the outlier topic
        continue

    # Up to 100 tweets assigned to this topic serve as example documents.
    documents = [doc for doc, topic in zip(tweets, topic_model.topics_) if topic == topic_num][:100]  # Adjust as needed

    # The prompt text is kept verbatim so that previously generated labels stay comparable.
    prompt = f"""
    I have a topic that contains the following documents: 
    {documents}
    The topic is also described by the following keywords: {keywords}

    Based on the information above, extract a representative short and exclusvie topic label in the following format:
    topic: <topic label>
    The topic label should be as short as possible while being as descriptive as possible but no more than 4 words in length, no topics should have the same label The topic label should be as short as possible while being as descriptive as possible but no more than 4 words in length, no topics should have the same label and they should not mention the keywords Neuroticism or Neurotic..
    """

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}]
    )

    # Only the text of the first returned choice is used.
    topic_label = response.choices[0].message.content.strip()
    generated_topic_labels[topic_num] = topic_label

# Saving the generated topic labels, one "Topic <n>: <label>" line per topic
with open("2nd_21_6_topic_labels_neuroticism.txt", "w", encoding="utf-8") as file:
    for topic_num, label in generated_topic_labels.items():
        file.write(f"Topic {topic_num}: {label}\n")

print("Topic labels saved.")


In [None]:
import os
import pandas as pd
from bertopic import BERTopic
import pickle

# Attach hand-curated labels to the neuroticism topic model and save it under a
# new name.

# Load your BERTopic model
topic_model = BERTopic.load("TOPICS_neuroticism_Npp_all-mpnet-base-v2_auto_mts=21_nneigh=6")

# Load your dataset
df = pd.read_csv('dat_neuroticism_short.csv')
timestamps = df.date.to_list()
tweets = df.rawContent.to_list()

# Define the dictionary of custom labels (topic id -> label; -1 is the outlier topic)
custom_labels = {
    -1: "Outliers",
    0: "Personality Traits Analysis",
    1: "Unique Man Descriptions",
    2: "Cat Owner Dynamics",
    3: "Spiritual Belief and Culture",
    4: "Jewish Identity Traits",
    5: "Fermented Foods Study",
    6: "Racial Diversity Analysis",
    7: "Psychedelic Mind Enhancement",
    8: "Youthful Parental Influence",
    9: "Emotional Music Listening Patterns",
    10: "Trump's Psychopathic Behavior",
    11: "Voter Targeting Analysis",
    12: "Virgo Personality Traits",
    13: "Woody Allen Films",
    14: "Sexual bondage security perception",
    15: "Obsessive Cleaning Habit",
    16: "Urban Stress Culture",
    17: "Masking Behavior Study",
    18: "Emotional Music Analysis",
    19: "MBTI Validity Critique",
    20: "Caffeine Addiction Trends",
    21: "Perfectionism and early death",
    22: "Sleep Patterns & Habits",
    23: "Erotic Societal Studies",
    24: "Trumpian Attributes",
    25: "TV Shows Analysis",
    26: "Google Engineer Controversy Manifesto",
    27: "Cat Parasite Harmful Effects",
    28: "Web Forum Viral Trend",
    29: "Scott's Unique Loyalty",
    30: "Genetic Cannabis Addiction Risk",
    31: "Fearful Voting Patterns",
    32: "Writing Prison Release Joyful",
    33: "EU Dangers Misinfo Exit",
    34: "Fear of Falling",
    35: "Fan Anxiety Analysis",
    36: "Character Contrasts & Traits",
    37: "Industrial Progress Struggles",
    38: "Eye Movement Personality Prediction",
    39: "Writing Prison Bars Release",
    40: "College Drinking Link",
    41: "Humorous Personality Traits",
    42: "Denial Reality Distortions",
    43: "Quirky Behavior Appreciation",
    44: "Newton's Overthinking Syndrome",
    45: "Emotional Face Study",
    46: "Intense Emotional Behavior",
    47: "Trump Personality Traits",
    48: "Brexit Influence Tactics",
    49: "Early Birth Aversion",
    50: "Targeted Ad Marketing",
    51: "Vegan Food Debate",
    52: "Vaccine Preferences",
    53: "Woke Hypocrisy & Ideology",
    54: "Med Student Concerns",
    55: "Gun Owner Behavior Study",
    56: "Phone Ignoring Behavior",
    57: "Character Portrayals",
    58: "Personality Trait Change",
    59: "Emotional Turmoil in Relationships",
    60: "Media Influence on Journalism",
    61: "Testosterone Influence Cerebellum Link",
    62: "ADHD Genetic Links",
    63: "Self-compassion overlap discovery",
    64: "Social Media Cult Behavior",
    65: "Festive Winter Holidays",
    66: "Celebrity Influence Analysis",
    67: "Bondage Security Relationship Practice",
    68: "College AI Grading Technology",
    69: "Smokers Personality Changes",
    70: "Mountainous Personality Traits",
    71: "Synthpunk Album Review"
}


# Register the labels through BERTopic's public API. This fills the model's
# custom labels, which the visualize_* functions are asked for via
# custom_labels=True; assigning topic_labels_ alone does not set them.
topic_model.set_topic_labels(custom_labels)

# Also keep the direct assignment: later code in this notebook looks the labels
# up through topic_model.topic_labels_.
topic_model.topic_labels_ = custom_labels

# Save the model again with the custom labels
topic_model.save("TOPICS_neuroticism_Npp_all-mpnet-base-v2_auto_mts_21_nneigh_6_CUSTOM_LABELS")

In [None]:
import pandas as pd
from bertopic import BERTopic

# Compute topics over time for the custom-labelled neuroticism model and pickle
# the result. Everything the cell needs is loaded here, so it no longer depends
# on variables or imports left behind by an earlier cell.
import pickle

# Load the model
topic_model = BERTopic.load("TOPICS_neuroticism_Npp_all-mpnet-base-v2_auto_mts_21_nneigh_6_CUSTOM_LABELS")

# Load the tweets and timestamps the neuroticism model was fitted on
df = pd.read_csv('dat_neuroticism_short.csv')
timestamps = df.date.to_list()
tweets = df.rawContent.to_list()

# Topic frequencies over time, split into 40 bins
topics_over_time = topic_model.topics_over_time(tweets, timestamps, nr_bins=40)

# Pickle the topics over time (fixed file name, no timestamp suffix)
filename = "TOT_neuroticism_Npp_all-mpnet-base-v2_auto_mts=21_nneigh=6-CUSTOM_LABELS.pkl"
with open(filename, 'wb') as file:
    pickle.dump(topics_over_time, file)

In [None]:
import pandas as pd
from bertopic import BERTopic

# Export one row per topic (label, size, share, keywords, representative
# tweets) of the custom-labelled neuroticism model to a CSV file.

topic_model = BERTopic.load("TOPICS_neuroticism_Npp_all-mpnet-base-v2_auto_mts_21_nneigh_6_CUSTOM_LABELS")

# Per-topic overview table provided by the model.
topic_info = topic_model.get_topic_info()

# Look up each topic's label; topics without an entry get "No Label".
label_lookup = topic_model.topic_labels_
topic_info['Label'] = [
    label_lookup.get(topic_id, "No Label")
    for topic_id in topic_info['Topic']
]

# Share of each topic in the total document count, in percent.
total_documents = topic_info['Count'].sum()
topic_info['Percentage'] = (topic_info['Count'] / total_documents) * 100

# Comma-separated keyword string for every topic.
topic_keywords = {}
for topic_id in topic_model.get_topics():
    topic_words = [word for word, _ in topic_model.get_topic(topic_id)]
    topic_keywords[topic_id] = ', '.join(topic_words)
topic_info['Keywords'] = topic_info['Topic'].map(topic_keywords)

# Representative documents per topic, as returned by the model.
representative_docs = topic_model.get_representative_docs()
topic_info['Representative Tweet'] = topic_info['Topic'].map(representative_docs)

# The outlier topic (-1) is kept: the filter that would drop it is disabled.

# Columns written to the CSV, in this order.
columns_to_save = [
    'Topic',
    'Label',
    'Count',
    'Percentage',
    'Keywords',
    'Representative Tweet',
]
final_dataframe = topic_info[columns_to_save]

final_dataframe.to_csv('21_6_neuroticism_topic_details.csv', index=False)

print('topic infos saved')

In [None]:
# Export every neuroticism topic figure as a standalone HTML file.
# (PDF export through pio.write_image is not performed in this cell.)
import plotly.io as pio
pio.kaleido.scope.mathjax = None

topic_model = BERTopic.load("TOPICS_neuroticism_Npp_all-mpnet-base-v2_auto_mts_21_nneigh_6_CUSTOM_LABELS")
df = pd.read_csv('dat_neuroticism_short.csv')
timestamps = df.date.to_list()
tweets = df.rawContent.to_list()

# Embeddings from an earlier run, loaded from a fixed, timestamped file name.
with open('neuroticism_Npp_embeddings_all-mpnet-base-v2 2024-04-16 17_06_15.pkl', 'rb') as file:
    embeddings = pickle.load(file)


def _export_html(figure, html_path):
    """Write a plotly figure to html_path without opening a browser."""
    pio.write_html(figure, file=html_path, auto_open=False)


# Intertopic distance map.
fig1 = topic_model.visualize_topics(custom_labels=True)
_export_html(fig1, '21_6_intertopic-neuroticism-custom_labels.html')

# Topics over time (40 bins).
topics_over_time = topic_model.topics_over_time(tweets, timestamps, nr_bins=40)
fig2 = topic_model.visualize_topics_over_time(topics_over_time, custom_labels=True)
_export_html(fig2, '21_6_tot-neuroticism-custom_labels.html')

# Term rank.
fig3 = topic_model.visualize_term_rank(custom_labels=True)
_export_html(fig3, '21_6_term_rank-neuroticism-custom_labels.html')

# Bar chart of the top 12 topics.
fig4 = topic_model.visualize_barchart(top_n_topics=12, custom_labels=True)
_export_html(fig4, '21_6_barchart-neuroticism-custom_labels.html')

# Hierarchy built from explicitly computed hierarchical topics.
hierarchical_topics = topic_model.hierarchical_topics(tweets)
fig5 = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics, custom_labels=True)
_export_html(fig5, '21_6_hierarchy-neuroticism-custom_labels.html')

# Hierarchy with the model's defaults.
fig6 = topic_model.visualize_hierarchy(custom_labels=True)
_export_html(fig6, '21_6_simple_hierarchy-neuroticism-custom_labels.html')

# Topic similarity heatmap.
fig7 = topic_model.visualize_heatmap(custom_labels=True)
_export_html(fig7, '21_6_heatmap-neuroticism-custom_labels.html')