In [1]:
import pandas as pd
import plotly.express as px
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic import BERTopic
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter

# Load the education dataset (bot-filtered, judging by the file name) and add a
# constant `total` column so that group-by sums can be used as row counts.
edu = pd.read_pickle("../data/for_analysis/edu_no_bots.pkl").assign(total=1)

In [4]:
# Size check: (rows, columns) of the loaded frame.
edu.shape

(136726, 39)

In [15]:
# Rows whose id differs from their conversation_id
# (presumably replies rather than conversation starters — confirm).
edu[edu["conversation_id"] != edu["id"]].shape

(34812, 41)

In [5]:
# Parse the creation timestamp as UTC and keep only the calendar day,
# which serves as the time axis of the daily sentiment aggregations.
created_utc = pd.to_datetime(edu["created_at"], utc=True)
edu["date"] = created_utc.dt.date

In [58]:

def plot_sentiment(df, title, date_column="date"):
    """Draw a line chart of the per-day share of each sentiment value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``date_column``, ``"total"`` and ``"score"`` columns.
        ``"total"`` is summed per (date, score) pair.
    title : str
        Title passed to the plotly figure.
    date_column : str, default "date"
        Name of the column used as the time axis.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    daily = (
        df.loc[:, [date_column, "total", "score"]]
        .groupby([date_column, "score"])
        .sum()
        .reset_index()
    )
    # Share of each sentiment within its own day.
    per_day_total = daily.groupby(date_column)["total"].transform("sum")
    daily["perc"] = daily["total"] / per_day_total
    daily.columns = ["Date", "Sentiment", "Tweets per Day", "Percent"]

    fig = px.line(
        daily,
        x="Date",
        y="Percent",
        title=title,
        color="Sentiment",
        template="plotly_white",
        # Colours are assigned in order of appearance of the sentiment values.
        color_discrete_sequence=['red', 'grey', "green"],
        width=800,
        height=600,
    )
    fig.show()

# Daily sentiment shares over the whole education dataset.
plot_sentiment(edu, "Dynamics of sentiment (Education)", date_column="date")

In [89]:
def topic_modelling(text_prep):
    """Fit a BERTopic model on the given texts and reassign outlier documents.

    Parameters
    ----------
    text_prep : iterable of str
        Preprocessed documents; materialised into a list before fitting.

    Returns
    -------
    tuple
        ``(topics, topic_info, model)``: the per-document topic ids stored on
        the model after outlier reduction (``model.topics_``), the table
        returned by ``model.get_topic_info()``, and the fitted ``BERTopic``
        instance.

    Notes
    -----
    No random seed is set here, so topic ids may differ between runs
    (NOTE(review): any hard-coded topic-id mapping downstream depends on one
    specific run — confirm how that run is pinned).
    """
    docs = list(text_prep)
    ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)
    model = BERTopic(
        ctfidf_model=ctfidf_model,
        n_gram_range=(1, 2),
        verbose=True,
        language='English',
        nr_topics=20,
    )

    # fit_transform returns (topics, probabilities); only the topics are used.
    topics, _ = model.fit_transform(docs)
    new_topics = model.reduce_outliers(docs, topics)

    # update_topics rebuilds the topic representations. The custom c-TF-IDF
    # model is passed again so that `reduce_frequent_words=True` still applies
    # to the refreshed representations instead of a default transformer.
    model.update_topics(docs, topics=new_topics, ctfidf_model=ctfidf_model)

    topic_info = model.get_topic_info()
    return model.topics_, topic_info, model

In [69]:
# Remove the literal token "chatgpt" from the preprocessed texts
# (presumably so the search term does not dominate topic keywords — confirm).
edu["prep"] = edu["prep"].str.replace("chatgpt", "", regex=False)
edu["prep"]

0         abstracts written by  fool scientists. they al...
1         interesting reddinvo on how  helps students wi...
2         teaching ai to lie. i asked : provide a summar...
3         most academics are fake: joe rogan amp; bret w...
4         a new app built by can detect whether your ess...
                                ...                        
138792    this is not evidence of how great  is it is ev...
138793             is only good as putting in filler words.
138794                                      you should ask 
138795    note i skipped one graphing question i couldnn...
138796    if you post more like this could you think abo...
Name: prep, Length: 138797, dtype: object

In [90]:
# Fit the topic model on the cleaned texts and keep the per-document topic ids.
edu["topics_edu"], topic_info, model = topic_modelling(edu["prep"])
topic_info.to_excel("../data/for_analysis/topics_info_edu.xlsx")
model.save("../data/for_analysis/model_edu")

# Export a sample of documents with their topic ids for manual review.
# The seed keeps the exported sample identical across re-runs, and the size is
# capped at the number of rows so the cell also works on smaller frames.
review_columns = ["text_rt", "text", "date", "topics_edu", "sentiment"]
review_sample = edu.loc[:, review_columns].sample(min(10000, len(edu)), random_state=42)
review_sample.to_excel("../data/for_analysis/sample_topic_edu.xlsx")

Batches: 100%|██████████| 4338/4338 [18:56<00:00,  3.82it/s]
2023-03-13 09:57:50,170 - BERTopic - Transformed documents to Embeddings
2023-03-13 10:02:47,026 - BERTopic - Reduced dimensionality
2023-03-13 10:03:27,441 - BERTopic - Clustered reduced embeddings
2023-03-13 10:06:48,481 - BERTopic - Reduced number of topics from 1577 to 21
100%|██████████| 87/87 [00:46<00:00,  1.88it/s]


In [99]:
# Persist the current state of the frame as a pickle for later reuse.
edu.to_pickle("../data/for_analysis/edu_topics.pkl")

In [51]:
# Hand-assigned labels -> the numeric topic ids grouped under each label.
# NOTE(review): "Educationional" looks like a typo of "Educational"; it is left
# untouched because the label is written into exported results.
edu_topics = {
    "Outliers": [0, 5, -1],
    "Other AI Tools for Education": [4],
    "ChatGPT Used by Students to Write Essays and Cheat": [2, 15],
    "ChatGPT in Academia": [6, 7, 8],
    "Ban ChatGPT in Educational Organizations or Not?": [11, 12, 19],
    "ChatGPT Passed/Failed Exams": [1, 16],
    "ChatGPT Will Never Replicate Students' Papers": [3],
    "Lowering the Cost of Education": [10],
    "ChatGPT Should Be Integrated to Educationional Process": [9, 13, 17],
    "How Teachers Could Use ChatGPT": [18],
    "ChatGPT Stimulates Creativity": [14],
}

# Inverse lookup: numeric topic id -> label.
topics_flipped = {
    topic_id: label
    for label, topic_ids in edu_topics.items()
    for topic_id in topic_ids
}

In [52]:
# Translate every numeric topic id into its hand-assigned label.
edu["topics_renamed"] = edu["topics_edu"].replace(to_replace=topics_flipped)

In [53]:
# Number of rows per topic label, most frequent first.
edu["topics_renamed"].value_counts()

ChatGPT Should Be Integrated to Educationional Process    27368
ChatGPT Used by Students to Write Essays and Cheat        21526
ChatGPT in Academia                                       19654
Ban ChatGPT in Educational Organizations or Not?          12084
ChatGPT Passed/Failed Exams                                9107
ChatGPT Stimulates Creativity                              7090
Other AI Tools for Education                               6422
Outliers                                                   6325
ChatGPT Will Never Replicate Students' Papers              5220
Lowering the Cost of Education                             4413
How Teachers Could Use ChatGPT                             2457
Name: topics_renamed, dtype: int64

In [169]:
# Collapse the fine-grained topics into the hand-labelled groups.
# merge_topics changes the model in place and topic ids are reassigned
# afterwards, so running this cell twice on the same object would merge the
# wrong topics. Reloading the un-merged checkpoint first keeps the cell safe
# to re-run.
topics_to_merge = list(edu_topics.values())
model = BERTopic.load("../data/for_analysis/model_edu")
model.merge_topics(list(edu["prep"]), topics_to_merge)
model.save("../data/for_analysis/model_edu_merged")

In [171]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the model's topic embeddings, exported as
# a labelled square table.
# NOTE(review): labels are taken in value_counts() order (most frequent first);
# this assumes the rows of model.topic_embeddings_ follow the same order —
# confirm, otherwise rows/columns are mislabelled.
topic_labels = edu["topics_renamed"].value_counts().index.tolist()
sim_matrix = cosine_similarity(model.topic_embeddings_)
sim_df = pd.DataFrame(sim_matrix, index=topic_labels, columns=topic_labels)

sim_df.to_excel("../analysis/edu_topics_cosine_sim_v2.xlsx")

In [61]:
# Rows whose id equals their conversation_id
# (presumably conversation-starting posts — confirm).
edu[edu["conversation_id"] == edu["id"]].shape

(86934, 41)

In [62]:
# Rows whose id differs from their conversation_id
# (presumably replies within a conversation — confirm).
edu[edu["conversation_id"] != edu["id"]].shape

(34732, 41)

In [55]:
# Drop rows written by known bot accounts, then compute volume per topic label.
# NOTE(review): `bots` is not defined in the visible part of this notebook; it
# must already exist in the kernel for this cell to run — confirm its origin.
# .copy() makes the filtered frame independent of the original, so the column
# assignment below cannot trigger a SettingWithCopyWarning.
edu = edu.loc[~edu.author_id.isin(bots)].copy()
edu['total'] = 1

volumes = edu.loc[:, ["topics_renamed", "total"]].groupby("topics_renamed").sum()
volumes = volumes.sort_values("total", ascending=False).reset_index()
# Share of all rows per topic, in percent. The column name "precent" (sic) is
# kept as-is because it is written to the exported spreadsheet.
volumes["precent"] = volumes["total"] / volumes["total"].sum() * 100
volumes.to_excel("../analysis/volume_edu_topics_v2_no_bots.xlsx")
volumes

Unnamed: 0,topics_renamed,total,precent
0,ChatGPT Should Be Integrated to Educationional...,27368,22.49437
1,ChatGPT Used by Students to Write Essays and C...,21526,17.6927
2,ChatGPT in Academia,19654,16.154061
3,Ban ChatGPT in Educational Organizations or Not?,12084,9.932109
4,ChatGPT Passed/Failed Exams,9107,7.485246
5,ChatGPT Stimulates Creativity,7090,5.827429
6,Other AI Tools for Education,6422,5.278385
7,Outliers,6325,5.198659
8,ChatGPT Will Never Replicate Students' Papers,5220,4.290434
9,Lowering the Cost of Education,4413,3.627143


In [22]:
# Sentiment distribution inside each topic: counts per (topic, score) divided
# by the topic's total, exported as a flat table.
topic_score_counts = (
    edu[["topics_renamed", "score", "total"]]
    .groupby(["topics_renamed", "score"])
    .sum()
)
topic_totals = topic_score_counts.groupby(level=0).sum()
sent_edu = topic_score_counts / topic_totals
sent_edu.reset_index().to_excel("../analysis/sent_edu_topics_v2_no_bots.xlsx")

In [56]:
# Daily sentiment shares with the "Outliers" label excluded, exported as a table.
without_outliers = edu[edu["topics_renamed"] != "Outliers"]
sent_df = (
    without_outliers[["date", "total", "score"]]
    .groupby(["date", "score"])
    .sum()
    .reset_index()
)
# Share of each sentiment within its own day.
daily_totals = sent_df.groupby("date")["total"].transform("sum")
sent_df["perc"] = sent_df["total"] / daily_totals
sent_df.columns = ["Date", "Sentiment", "Tweets per Day", "Percent"]

sent_df.to_excel("../analysis/edu_sentiment_graph.xlsx")

In [59]:
# Daily sentiment chart restricted to rows whose topic label is not "Outliers".
edu_without_outliers = edu[edu["topics_renamed"] != "Outliers"]
plot_sentiment(edu_without_outliers, "Dynamics of sentiment (Education)", date_column="date")