In [None]:
import pandas as pd
from bertopic import BERTopic
import plotly.express as px
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import re

In [None]:
# Read the author ids stored one per line in bot_fake_ids_2.txt; later cells
# use `bots` to exclude these authors via `author_id.isin(bots)`.
bots = []
with open("../data/for_analysis/bot_fake_ids_2.txt", "r") as f:
    for line in f:
        line = line.strip()
        # Skip blank lines instead of crashing on int("").
        if line:
            bots.append(int(line))

In [None]:
# Load data0.pkl and keep rows that are English, have a non-empty text_rt and
# whose author is not listed in `bots`.
data = pd.read_pickle("../data/for_analysis/data0.pkl")
keep = (data.lang == "en") & (data.text_rt != "") & ~data.author_id.isin(bots)
# .copy() makes `data` an independent frame: later cells add columns to it, which
# on a plain .loc slice triggers SettingWithCopyWarning.
data = data.loc[keep].copy()
data.shape

In [None]:
def remove_mentions_and_links(text):
    """Strip @mentions, #hashtags and http(s) links from ``text``.

    A token is removed when it starts with ``@`` or ``#`` and is longer than one
    character; a lone ``@`` or ``#`` is kept. Tokens are delimited by any
    whitespace (space, tab, newline), so a mention at the start of a new line is
    removed as well. Whitespace itself is left untouched, so removed tokens can
    leave consecutive spaces behind.

    Args:
        text: The tweet text as a ``str``.

    Returns:
        The text with mentions, hashtags and links removed.
    """
    # (?<!\S) anchors the match at the start of a whitespace-delimited token, so an
    # "@" or "#" inside a word (e.g. an e-mail address) is left alone.
    without_tags = re.sub(r'(?<!\S)[@#]\S+', '', text)
    return re.sub(r'http\S+', '', without_tags)

# Store the cleaned text in column "prep" and renumber the rows from 0.
data = data.assign(
    prep=data.text_rt.apply(remove_mentions_and_links)
).reset_index(drop=True)

In [None]:
# Score every cleaned tweet with VADER; each cell of "sentiment" holds the dict
# returned by polarity_scores().
analyzer = SentimentIntensityAnalyzer()
data["sentiment"] = data["prep"].map(analyzer.polarity_scores)

def categorize(scores_dict):
    """Map a sentiment score dict to a label based on its 'compound' value.

    Args:
        scores_dict: Mapping with a numeric ``'compound'`` entry.

    Returns:
        ``"positive"`` for compound >= 0.05, ``"negative"`` for compound <= -0.05,
        ``"neutral"`` for values strictly in between, and ``None`` when none of
        the comparisons hold (e.g. NaN).
    """
    value = scores_dict['compound']
    if value >= 0.05:
        return "positive"
    if value <= -0.05:
        return "negative"
    if -0.05 < value < 0.05:
        return "neutral"
    return None

# Derive the categorical label and the raw compound value from the score dicts,
# then checkpoint the frame to disk.
data["score"] = data["sentiment"].map(categorize)
data['compound'] = data["sentiment"].map(lambda scores: scores['compound'])

data.to_pickle("../data/for_analysis/data1_no_bots.pkl")

# Topic Modelling

In [None]:
# Reload the frame that the sentiment cell wrote to data1_no_bots.pkl
# (presumably so topic modelling can start here without re-running the cells above).
data = pd.read_pickle("../data/for_analysis/data1_no_bots.pkl")

In [None]:
# Runs of ASCII letters and the punctuation marks . ? ! : ; - '
# Raw string avoids invalid escape sequences; compiled once instead of per call.
_WORDS_ONLY_RE = re.compile(r"[a-zA-Z.?!:;\-']+")


def words_only(text):
    """Keep only ASCII letters and basic punctuation from ``text``.

    Every maximal run of characters from ``a-z A-Z . ? ! : ; - '`` is extracted
    and the runs are joined with single spaces; everything else (digits, emoji,
    other symbols, whitespace) acts as a separator and is dropped.

    Args:
        text: Input string.

    Returns:
        The extracted runs joined by ``" "``; ``""`` when nothing matches.
    """
    return " ".join(_WORDS_ONLY_RE.findall(text))

def basic_preprocessing(texts):
    """Normalise a pandas Series of tweet texts for topic modelling.

    Steps, in order: remove the literal fragments ``https:``, ``t.co``,
    ``<email>``, ``<tel>`` and ``<link>``; remove ``anon`` followed by optional
    digits; collapse whitespace runs to one space; remove single quotes;
    lower-case and keep only the characters accepted by ``words_only``.

    Args:
        texts: pandas Series of strings.

    Returns:
        A list of cleaned strings in the same order as ``texts``.
    """
    # regex=False: every pattern here is a literal. Older pandas versions treat the
    # pattern as a regular expression by default, in which case the "." in "t.co"
    # would match any character.
    for literal in ("https:", "t.co", "<email>", "<tel>", "<link>"):
        texts = texts.str.replace(literal, "", regex=False)
    texts = [re.sub(r'anon\d*', "", t) for t in texts]
    # Collapse new lines and other whitespace runs into a single space
    texts = [re.sub(r'\s+', ' ', t) for t in texts]
    # Remove single quotes
    texts = [t.replace("'", "") for t in texts]
    # Remove remaining punctuation, numbers and emoji
    texts = [words_only(t.lower()).strip() for t in texts]

    return texts

In [None]:
# Apply the basic clean-up, then remove the string "chatgpt" from every tweet.
data["prep"] = basic_preprocessing(data["prep"])
data["prep"] = data["prep"].str.replace("chatgpt", "")

In [None]:
# Drop tweets whose cleaned text is empty. .copy() keeps `data` an independent
# frame because later cells add columns to it.
data = data.loc[data.prep != ""].copy()
print("Non-empty tweets in English: ", data.shape[0])

# Tweets whose conversation_id equals their own id are reported as
# "non-conversation" tweets; all others as conversation tweets.
is_conversation_tweet = data.conversation_id != data.id
sparking = data.loc[~is_conversation_tweet]
print("Non-conversation tweets: ", sparking.shape[0])

convs = data.loc[is_conversation_tweet]
print("Conversation tweets: ", convs.shape[0])

# Retweets are recognised by the "RT " prefix of the `text` column.
is_retweet = sparking.text.str.startswith("RT ")

# .copy(): a "topics" column is assigned to data_no_retweets in a later cell,
# which on a plain slice of `sparking` triggers SettingWithCopyWarning.
data_no_retweets = sparking.loc[~is_retweet].copy()
print("Non-conversation non-retweets: ", data_no_retweets.shape[0])

retweets = sparking.loc[is_retweet].copy()
print("Non-conversation retweets: ", retweets.shape[0])

In [None]:
from bertopic.vectorizers import ClassTfidfTransformer
from flair.embeddings import TransformerDocumentEmbeddings


def topic_modelling(text_prep):
    """Fit a BERTopic model on ``text_prep`` and reassign outlier documents.

    The model is fitted, ``reduce_outliers`` is applied to the fitted topic
    assignments, and the topic representations are refreshed with
    ``update_topics`` using the reassigned topics.

    Args:
        text_prep: Iterable of preprocessed document strings.

    Returns:
        A tuple ``(model.topics_, topic_info, model)`` where ``topic_info`` is
        the result of ``model.get_topic_info()`` after the update.
    """
    documents = list(text_prep)
    # No embedding_model is passed, so BERTopic picks its own default.
    bertopic_model = BERTopic(
        ctfidf_model=ClassTfidfTransformer(reduce_frequent_words=True),
        n_gram_range=(1, 2),
        verbose=True,
        language='English',
        low_memory=True,
        min_topic_size=500,
    )
    # fit_transform returns a tuple; only its first element (the topics) is used.
    initial_topics, _ = bertopic_model.fit_transform(documents)
    reassigned_topics = bertopic_model.reduce_outliers(documents, initial_topics)
    bertopic_model.update_topics(documents, topics=reassigned_topics)
    summary = bertopic_model.get_topic_info()
    return bertopic_model.topics_, summary, bertopic_model

In [None]:
# Fit the topic model on the non-conversation, non-retweet tweets and store the
# per-document topic id next to each tweet.
doc_topics, topic_info, model = topic_modelling(data_no_retweets["prep"])
data_no_retweets["topics"] = doc_topics

In [None]:
# Persist the labelled tweets, the topic overview, a sample for manual review
# and the fitted model.
data_no_retweets.to_pickle("../data/for_analysis/topics.pkl")
topic_info.to_excel("../data/for_analysis/topic_info_bert_sparking.xlsx")
# Cap the sample at the number of available rows (sample(10000) raises ValueError
# on a smaller frame) and fix the seed so the exported sample is reproducible.
sample_size = min(10000, len(data_no_retweets))
(
    data_no_retweets.loc[:, ["text_rt", "text", "date", "topics", "sentiment"]]
    .sample(n=sample_size, random_state=42)
    .to_excel("../data/for_analysis/sample_topic_bert_sparking.xlsx")
)
model.save("../data/for_analysis/model_bertopic_sparking_2")

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the model's topic embeddings, with the
# topic labels on both axes, exported to Excel.
topic_labels = list(model.topic_labels_.values())
sim_matrix = cosine_similarity(model.topic_embeddings_)
sim_df = pd.DataFrame(sim_matrix, index=topic_labels, columns=topic_labels)

sim_df.to_excel("../analysis/topics_cosine_sim.xlsx")

### Extrapolation on the non-conversational retweets only

In [None]:
# Reload the topic-labelled non-retweets that were pickled to topics.pkl above.
data_no_retweets = pd.read_pickle("../data/for_analysis/topics.pkl")

In [None]:
# Drop authors listed in `bots`, renumber the rows, report the size, then keep
# one row per distinct text_rt so the merge below cannot duplicate retweets.
not_bot = ~data_no_retweets.author_id.isin(bots)
data_no_retweets = data_no_retweets.loc[not_bot].reset_index(drop=True)
print(data_no_retweets.shape)
data_no_retweets = data_no_retweets.drop_duplicates(subset=["text_rt"])

In [None]:
# Same bot filter for the retweets, with a fresh 0..n-1 index.
retweet_not_bot = ~retweets.author_id.isin(bots)
retweets = retweets.loc[retweet_not_bot].reset_index(drop=True)
retweets.shape

In [None]:
# Left-join the topic of the tweet with identical text_rt onto every retweet;
# retweets without a match get NaN in "topics".
topic_lookup = data_no_retweets.loc[:, ["text_rt", "topics"]]
merged = retweets.merge(topic_lookup, on="text_rt", how="left")
merged.shape

In [None]:
# Number of retweets for which no topic was found by the merge.
merged.topics.isnull().sum()

In [None]:
# Split the merged retweets by whether the merge supplied a topic.
topic_missing = merged.topics.isnull()
retweets_found = merged.loc[~topic_missing].copy()
retweets_not_found = merged.loc[topic_missing].copy()

In [None]:
# Predict a topic with the fitted model for every retweet the merge could not
# label; the second return value of transform() is discarded.
# NOTE(review): the name `topics` is re-bound to the topic-grouping dict in a later cell.
topics,_ = model.transform(retweets_not_found.prep.to_list())

In [None]:
# Fill in the predicted topics and stack both retweet subsets back together.
retweets_not_found = retweets_not_found.assign(topics=topics)
retweets = pd.concat([retweets_found, retweets_not_found], axis=0)
retweets.shape

## Merging topics and datasets

In [None]:
# Reload the full (not de-duplicated) labelled non-retweets, drop authors listed
# in `bots`, and stack the retweets on top of them.
labelled_non_retweets = pd.read_pickle("../data/for_analysis/topics.pkl")
data_no_retweets = labelled_non_retweets.loc[
    ~labelled_non_retweets.author_id.isin(bots)
]
data_all = pd.concat([retweets, data_no_retweets], axis=0)
data_all.shape

In [None]:
# Manual grouping of model topic ids into general themes
# (general topic name -> list of topic ids).
# Each id must appear under exactly one name: the mapping is inverted in the next
# cell and a repeated id would silently keep only the last name.
# NOTE(review): id 13 used to be listed under both "Finance" and "Spam"; after
# inversion it always resolved to "Spam", so it is kept only there. Confirm that
# this is the intended label.
topics = {
    "AI in general": [1, 7, 15, 22, 74, 78, 84],
    "Impact on search engines": [2, 25, 60, 63, 70, 75, 87, 119],
    "Education": [4, 6, 31, 34, 44, 45, 73, 77, 82, 102, 122],
    "Impact on art (poems and lyrics, movies,books)": [94, 80, 9, 10, 19, 37],
    "Openai and its Investors and products (Microsoft, Musk)": [127, 114, 97, 90, 59, 3, 17, 40, 50],
    "Cybersecurity (writing malware)": [12, 30],
    "Programming": [16, 57, 67, 111, 128],
    "Digital content generation(podcasts,youtube scripts,quizzes)": [85, 81, 38, 39, 43, 54],
    "Access and price": [14, 23, 55, 65, 66, 104, 108, 110, 117],
    "Business routine": [18, 79, 89, 103, 115],
    "Social events on ChatGPT, discussion on media": [56, 64],
    "Politics": [5],
    "Recipes": [20],
    "Legal issues": [35, 72],
    "Calculator, math": [32],
    "Job loss": [29],
    "ChatGPT's competitors": [126],
    "LLM technology": [21, 24, 51, 52, 58, 76, 95, 105, 125],
    "Text to audio/voice": [62, 71],
    "Translation": [100],
    "Healthcare": [26, 69, 96],
    "Criticism in terms of ethics": [68, 116],
    "Q&A platforms": [107],
    "Questions examples, prompt engineering": [0, 27, 33, 41, 101, 112],
    "Entertainment": [42, 109, 113, 120, 123],
    "Robots": [53],
    "Christmas": [47],
    "Finance": [11, 36],
    "Climate change": [86],
    "Religion, sermons": [49],
    "Terrifying, insane": [48, 61, 93],
    "Gender": [91],
    "ChatGPT on social media": [8, 106, 118, 121, 124],
    "Real estate": [99],
    "Sport": [28],
    "Quantum computing": [98],
    "Spam": [13],
    "Making money with ChatGPT": [88],
}

In [None]:
# Invert the grouping (topic id -> general topic name); if an id is listed more
# than once, the name that comes last in `topics` wins.
topics_flipped = {
    topic_id: general_name
    for general_name, topic_ids in topics.items()
    for topic_id in topic_ids
}

data_all["topics_general"] = data_all['topics'].replace(topics_flipped)
# Ids not covered by `topics` keep their numeric value after replace();
# relabel all of them as -1.
unmapped = ~data_all["topics_general"].isin(topics.keys())
data_all.loc[unmapped, "topics_general"] = -1
data_all = data_all.reset_index(drop=True)

In [None]:
# Tweet count and share (in percent) per general topic, largest first.
data_all['total'] = 1
volumes = (
    data_all[["topics_general", "total"]]
    .groupby("topics_general")
    .sum()
    .sort_values("total", ascending=False)
    .reset_index()
)
volumes["precent"] = volumes["total"] / volumes["total"].sum() * 100
volumes

In [None]:
# Export the per-topic volumes and checkpoint the topic-labelled tweets.
volumes.to_excel("../analysis/topics_volume_final_no_bots.xlsx")
# Alternative output path, currently disabled:
#data_all.to_pickle("../data/for_analysis/data2.pkl")
data_all.to_pickle("../data/for_analysis/data2_no_bots.pkl")

In [None]:
# Daily sentiment shares, excluding outlier and spam topics.
# `topics_general` exists on `data_all` only (not on `data`), and outliers are
# labelled -1 there; "Outliers" is still excluded in case a frame uses that label.
data_all["total"] = 1
is_excluded = data_all.topics_general.isin([-1, "Outliers", "Spam"])
sent_df = (
    data_all.loc[~is_excluded, ["date", "total", "score"]]
    .groupby(["date", "score"])
    .sum()
    .reset_index()
)
# Share of each sentiment class within its day.
sent_df["perc"] = sent_df['total'] / sent_df.groupby('date')['total'].transform('sum')
sent_df.columns = ["Date", "Sentiment", "Tweets per Day", "Percent"]
sent_df.to_excel("../analysis/ALL_sentiment_graph_no_outliers.xlsx")
fig = px.line(sent_df, x="Date", y="Percent", title='Dynamics of sentiment', color="Sentiment",
              template="plotly_white", color_discrete_sequence=['red', 'grey', "green"],
              width=800, height=600)
fig.show()

## Adding conversation tweets

In [None]:
# Keep only tweets that belong to a conversation (conversation_id differs from
# the tweet's own id) and whose author is not listed in `bots`.
# NOTE(review): this reads data1.pkl, while the sentiment cell above writes
# data1_no_bots.pkl; confirm this is the intended input file.
conv = pd.read_pickle("../data/for_analysis/data1.pkl")
in_conversation = conv.conversation_id != conv.id
from_bot = conv.author_id.isin(bots)
conv = conv.loc[in_conversation & ~from_bot]
conv.shape

In [None]:
from collections import Counter
import numpy as np

# general topic name -> mean number of conversation tweets per conversation
dict_conv_avg = {}

print(conv.shape)
# .copy() so the column assignments below act on an independent frame rather
# than on a slice (avoids SettingWithCopyWarning).
conv = conv.loc[~conv.author_id.isin(bots)].copy()
print(conv.shape)

# Default label -1, stored as object dtype because string topic names are written
# into the same column below (writing strings into an int64 column relies on a
# silent upcast that pandas has deprecated).
conv["topics_general"] = np.full(len(conv), -1, dtype=object)

for topic_name in data_all.topics_general.unique():
    # Conversation ids of all tweets in data_all that carry this general topic.
    topic_conv_ids = list(
        data_all.loc[data_all.topics_general == topic_name, 'conversation_id'].unique()
    )
    in_topic = conv.conversation_id.isin(topic_conv_ids)
    conv.loc[in_topic, "topics_general"] = topic_name
    # Number of conversation tweets per conversation id within this topic.
    tweets_per_conv = Counter(conv.loc[in_topic, "conversation_id"].to_list())
    # np.mean([]) is nan but emits a RuntimeWarning; return nan directly instead.
    dict_conv_avg[topic_name] = (
        np.mean(list(tweets_per_conv.values())) if tweets_per_conv else np.nan
    )

In [None]:
# Attach the average conversation size to each general topic.
volumes = volumes.assign(
    average_conv=volumes["topics_general"].replace(dict_conv_avg)
)
volumes.head()

In [None]:
# Overwrites the volume export written earlier, now including "average_conv".
volumes.to_excel("../analysis/topics_volume_final_no_bots.xlsx")

### Sentiment visualisation

In [None]:
# Reload the topic-labelled tweets that were pickled to data2_no_bots.pkl above.
data_topics = pd.read_pickle("../data/for_analysis/data2_no_bots.pkl")

# merge topics with text in data1_no_bots

In [None]:
# String form of edit_history_tweet_ids for every tweet labelled "Spam".
spam_rows = data_topics.topics_general == "Spam"
spam = [
    str(history_ids)
    for history_ids in data_topics.loc[spam_rows, "edit_history_tweet_ids"].to_list()
]

In [None]:
# Number of tweets labelled "Spam".
len(spam)

In [None]:
# Distinct conversation ids of the tweets in `data` whose edit history matches a
# spam tweet.
is_spam = data.edit_history_tweet_ids.astype(str).isin(spam)
spam_fake_conv_ids = set(data.loc[is_spam, "conversation_id"].to_list())
len(spam_fake_conv_ids)

In [None]:
# Write the spam conversation ids to disk, one id per line.
with open("../data/spam_fake_conv_ids.txt", "w") as f:
    for conv_id in spam_fake_conv_ids:
        f.write(f"{conv_id}\n")

In [None]:
# Drop every tweet that belongs to a spam conversation. .copy() keeps `data` an
# independent frame: the next cell assigns a column to it, which on a plain slice
# triggers SettingWithCopyWarning.
data = data.loc[~data.conversation_id.isin(spam_fake_conv_ids)].copy()
data.shape

In [None]:
# Daily tweet counts per sentiment class and their share within each day.
data["total"] = 1
sent_df = (
    data.loc[:, ["date", "total", "score"]]
    .groupby(["date", "score"])
    .sum()
    .reset_index()
)
day_totals = sent_df.groupby('date')['total'].transform('sum')
sent_df["perc"] = sent_df['total'] / day_totals
sent_df.columns = ["Date", "Sentiment", "Tweets per Day", "Percent"]

sent_df.to_excel("../analysis/ALL_sentiment_graph.xlsx")

fig = px.line(
    sent_df,
    x="Date",
    y="Percent",
    title='Dynamics of sentiment',
    color="Sentiment",
    template="plotly_white",
    color_discrete_sequence=['red', 'grey', "green"],
    width=800,
    height=600,
)
fig.show()

## Subset of Tweets on Education

In [None]:
# Education subset: topic-labelled tweets (spark=1) plus the conversation tweets
# attached to Education conversations (spark=0).
edu = data_all.loc[data_all.topics_general == "Education"]
edu = edu.drop(["topics", "total"], axis=1)

edu["spark"] = 1
# .copy() so that adding "spark" does not write into a slice of `conv`
# (avoids SettingWithCopyWarning).
conv_edu = conv.loc[conv.topics_general == "Education"].copy()
conv_edu["spark"] = 0
print(conv_edu.shape)
edu = pd.concat([edu, conv_edu], axis=0)
edu = edu.reset_index(drop=True)
edu.shape

In [None]:
# Persist the Education subset.
edu.to_pickle("../data/for_analysis/edu_no_bots.pkl")