In [None]:
import pandas as pd
import plotly.express as px
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic import BERTopic
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter

# Load the education dataset and attach a constant `total` column of ones,
# which later cells sum inside group-bys to count rows.
# NOTE(review): unpickling executes arbitrary code -- only load this file from a trusted source.
edu = pd.read_pickle("../data/for_analysis/edu_no_bots.pkl").assign(total=1)

In [None]:
# Display (rows, columns) of the loaded frame.
edu.shape

In [None]:
# Shape of the subset whose conversation_id differs from its own id
# (presumably replies rather than conversation starters -- confirm against the data schema).
edu.loc[edu.conversation_id != edu.id].shape

In [None]:
# Parse `created_at` as UTC timestamps and keep only the calendar date.
edu["date"] = pd.to_datetime(edu["created_at"], utc=True).dt.date

In [None]:

def plot_sentiment(df, title, date_column="date"):
    """Plot the per-date share of each sentiment value as a line chart.

    Args:
        df: DataFrame containing `date_column`, a numeric "total" column and a
            "score" column (the sentiment value).
        title: Title shown on the figure.
        date_column: Name of the column to group by on the x axis.

    Returns:
        None. The figure is rendered with `fig.show()`.
    """
    group_keys = [date_column, "score"]
    daily = (
        df.loc[:, [date_column, "total", "score"]]
        .groupby(group_keys)
        .sum()
        .reset_index()
    )

    # Share of each sentiment within its date: row total / total of that date.
    per_date_total = daily.groupby(date_column)["total"].transform("sum")
    daily["perc"] = daily["total"] / per_date_total

    # Positional rename: (date, score, total, perc) -> display names.
    daily.columns = ["Date", "Sentiment", "Tweets per Day", "Percent"]

    # NOTE(review): the three colours are assigned to sentiment values by plotly in
    # the order it encounters them -- confirm this matches negative/neutral/positive.
    figure = px.line(
        daily,
        x="Date",
        y="Percent",
        title=title,
        color="Sentiment",
        template="plotly_white",
        color_discrete_sequence=["red", "grey", "green"],
        width=800,
        height=600,
    )
    figure.show()

# Sentiment dynamics over the full education dataset.
plot_sentiment(
    edu,
    title="Dynamics of sentiment (Education)",
    date_column="date",
)

In [None]:
def topic_modelling(text_prep, nr_topics=20, umap_model=None):
    """Fit a BERTopic model on the documents and reassign outlier documents.

    Args:
        text_prep: Iterable of preprocessed document strings.
        nr_topics: Number of topics BERTopic should reduce to (default 20,
            the value previously hard-coded here).
        umap_model: Optional dimensionality-reduction model forwarded to
            BERTopic. Pass one with a fixed random state to make runs
            reproducible; `None` keeps BERTopic's default.

    Returns:
        A tuple `(topics, topic_info, model)`: the per-document topic ids
        stored on the model after the update, the frame returned by
        `model.get_topic_info()`, and the fitted model itself.
    """
    docs = list(text_prep)
    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
    model = BERTopic(
        ctfidf_model=ctfidf_model,
        n_gram_range=(1, 2),
        verbose=True,
        language="English",
        nr_topics=nr_topics,
        umap_model=umap_model,
    )

    # fit_transform returns a pair; only the topic assignments are needed here.
    initial_topics, _ = model.fit_transform(docs)
    reassigned_topics = model.reduce_outliers(docs, initial_topics)

    # Pass the c-TF-IDF model again: update_topics rebuilds the topic
    # representations and otherwise falls back to a default ClassTfidfTransformer,
    # silently dropping reduce_frequent_words=True.
    # NOTE(review): BERTopic warns that manually updated topics followed by later
    # topic merging/reduction can be error-prone -- verify downstream merge results.
    model.update_topics(docs, topics=reassigned_topics, ctfidf_model=ctfidf_model)

    topic_info = model.get_topic_info()
    return model.topics_, topic_info, model

In [None]:
# Remove the literal token "chatgpt" from the preprocessed text, then display the column.
edu["prep"] = edu["prep"].str.replace("chatgpt", "", regex=False)
edu["prep"]

In [None]:
# Fit the topic model, store per-document topic ids, and persist the artefacts.
edu["topics_edu"], topic_info, model = topic_modelling(edu["prep"])
topic_info.to_excel("../data/for_analysis/topics_info_edu.xlsx")
model.save("../data/for_analysis/model_edu")

# Export a sample of documents for manual topic review.
# The sample is capped at the frame size (sampling more rows than exist raises)
# and seeded so that re-running the cell yields the same spreadsheet.
REVIEW_SAMPLE_SIZE = 10000
REVIEW_SAMPLE_SEED = 42
review_columns = ["text_rt", "text", "date", "topics_edu", "sentiment"]
(
    edu.loc[:, review_columns]
    .sample(n=min(REVIEW_SAMPLE_SIZE, len(edu)), random_state=REVIEW_SAMPLE_SEED)
    .to_excel("../data/for_analysis/sample_topic_edu.xlsx")
)

In [None]:
# Persist the frame, including the `topics_edu` column added by the modelling cell.
edu.to_pickle("../data/for_analysis/edu_topics.pkl")

In [None]:
# Manual grouping of model topic ids under human-readable labels.
# NOTE(review): these ids refer to one specific fitted model; re-fitting may
# renumber topics, so the grouping must be re-checked after any re-run.
# NOTE(review): "Educationional" looks like a typo but is kept as-is because the
# label is written to the exported spreadsheets.
edu_topics = {
    "Outliers": [0, 5, -1],
    "Other AI Tools for Education": [4],
    "ChatGPT Used by Students to Write Essays and Cheat": [2, 15],
    "ChatGPT in Academia": [6, 7, 8],
    "Ban ChatGPT in Educational Organizations or Not?": [11, 12, 19],
    "ChatGPT Passed/Failed Exams": [1, 16],
    "ChatGPT Will Never Replicate Students' Papers": [3],
    "Lowering the Cost of Education": [10],
    "ChatGPT Should Be Integrated to Educationional Process": [9, 13, 17],
    "How Teachers Could Use ChatGPT": [18],
    "ChatGPT Stimulates Creativity": [14],
}

# Inverse lookup: topic id -> label.
topics_flipped = {
    topic_id: label
    for label, topic_ids in edu_topics.items()
    for topic_id in topic_ids
}

In [None]:
# Translate numeric topic ids into their group labels.
edu["topics_renamed"] = edu["topics_edu"].replace(to_replace=topics_flipped)

In [None]:
# Number of rows per topic label, most frequent first.
edu["topics_renamed"].value_counts()

In [None]:
# Merge the model's topics according to the manual grouping (one id list per label),
# then save the merged model separately from the original one.
topics_to_merge = [*edu_topics.values()]
model.merge_topics(list(edu["prep"]), topics_to_merge)
model.save("../data/for_analysis/model_edu_merged")

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Label the rows/columns of the topic-similarity matrix.
# Labels are derived from the actual (merged topic id, label) pairing per document
# rather than from value_counts() order, which only coincides with the embedding
# row order while topic ids happen to be ranked strictly by frequency.
# Assumes `model.topics_` is row-aligned with `edu` (same documents, same order)
# and that rows of `model.topic_embeddings_` follow ascending topic id --
# TODO confirm against the installed BERTopic version.
topic_names = pd.DataFrame(
    {
        "topic_id": list(model.topics_),
        "label": edu["topics_renamed"].to_numpy(),
    }
).drop_duplicates()

if topic_names["topic_id"].duplicated().any():
    raise ValueError(
        "A merged topic id maps to more than one label; "
        "the merged model and edu['topics_renamed'] are out of sync."
    )

topic_labels = topic_names.sort_values("topic_id")["label"].tolist()

sim_matrix = cosine_similarity(model.topic_embeddings_)
# DataFrame construction raises if the number of labels does not match the matrix size.
sim_df = pd.DataFrame(data=sim_matrix, index=topic_labels, columns=topic_labels)

sim_df.to_excel("../analysis/edu_topics_cosine_sim_v2.xlsx")

In [None]:
# Shape of the subset whose conversation_id equals its own id
# (presumably conversation starters -- confirm against the data schema).
edu.loc[edu.conversation_id==edu.id].shape

In [None]:
# Shape of the complementary subset: rows whose conversation_id differs from their id.
edu.loc[edu.conversation_id!=edu.id].shape

In [None]:
# Drop rows authored by known bot accounts and compute the volume of each topic.
# NOTE(review): `bots` is not defined in any visible cell; on a fresh kernel this
# raises NameError. Define or load it above before relying on Restart & Run All.
# `.copy()` makes the filtered frame independent so the column assignment below
# does not trigger pandas' SettingWithCopy warning.
edu = edu.loc[~edu.author_id.isin(bots)].copy()
edu["total"] = 1

volumes = (
    edu.loc[:, ["topics_renamed", "total"]]
    .groupby("topics_renamed")
    .sum()
    .sort_values("total", ascending=False)
    .reset_index()
)
# Column name "precent" (sic) is kept unchanged because it is written to the spreadsheet.
volumes["precent"] = volumes["total"] / volumes["total"].sum() * 100
volumes.to_excel("../analysis/volume_edu_topics_v2_no_bots.xlsx")
volumes

In [None]:
# Share of each sentiment score within every topic:
# summed `total` per (topic, score), divided by the summed `total` per topic.
sent_edu = (
    edu[["topics_renamed", "score", "total"]]
    .groupby(["topics_renamed", "score"])
    .sum()
)
sent_edu = sent_edu.div(sent_edu.groupby(level=0).sum())
sent_edu.reset_index().to_excel("../analysis/sent_edu_topics_v2_no_bots.xlsx")

In [None]:
# Daily sentiment shares, excluding rows labelled "Outliers"; exported as the
# data behind the sentiment chart.
sent_df = (
    edu.loc[edu["topics_renamed"] != "Outliers", ["date", "total", "score"]]
    .groupby(["date", "score"])
    .sum()
    .reset_index()
)
# Row total divided by the total of its date.
sent_df["perc"] = sent_df["total"] / sent_df.groupby("date")["total"].transform("sum")
# Positional rename: (date, score, total, perc) -> display names.
sent_df.columns = ["Date", "Sentiment", "Tweets per Day", "Percent"]

sent_df.to_excel("../analysis/edu_sentiment_graph.xlsx")

In [None]:
# Same sentiment chart as before, restricted to rows not labelled "Outliers".
plot_sentiment(
    edu.loc[edu["topics_renamed"] != "Outliers"],
    title="Dynamics of sentiment (Education)",
    date_column="date",
)