This notebook contains an exploratory analysis of the data:

1. Descriptive statistics for all tweets and sparking tweets only. Dynamics of the number of posts per day. Dynamics of the number of unique users.
2. Geolocation of tweets.
3. Distribution of tweet languages.

In [None]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import os
import json
import plotly.express as px

# Load the tweet table used by every cell below and show its (rows, columns).
# NOTE: unpickling can execute arbitrary code, so only load pickles from a
# trusted source.
tweets_pickle = "../data/for_analysis/data0.pkl"
data = pd.read_pickle(tweets_pickle)
data.shape

In [None]:
# Remove every tweet whose author_id appears in the bot/fake-account ID file.
# The file is read as one integer ID per line; blank lines (for example a
# trailing empty line at the end of the file) are skipped rather than being
# passed to int(), which would raise ValueError. int() itself tolerates the
# surrounding whitespace/newline.
with open("../data/for_analysis/bot_fake_ids_2.txt", "r") as f:
    bots = [int(line) for line in f if line.strip()]

data = data.loc[~data.author_id.isin(bots)]
data.shape


In [None]:
# Remove every tweet that belongs to a conversation listed in the spam/fake
# conversation ID file. The file is read as one integer ID per line; blank
# lines (for example a trailing empty line) are skipped rather than being
# passed to int(), which would raise ValueError.
with open("../data/spam_fake_conv_ids.txt", "r") as f:
    spam = [int(line) for line in f if line.strip()]

data = data.loc[~data.conversation_id.isin(spam)]
data.shape

In [None]:
# Report how many distinct author_id values are present in `data`.
# dropna=False keeps a missing author_id counted as one value, exactly like
# taking the length of .unique().
n_authors = data["author_id"].nunique(dropna=False)
print("Number of unique users: ", n_authors)

### 1. Descriptive Statistics

In [None]:
# Summary statistics of the four engagement counters over all rows of `data`.
metric_columns = [
    "public_metrics.retweet_count",
    "public_metrics.reply_count",
    "public_metrics.like_count",
    "public_metrics.quote_count",
]
pub_metrics = data[metric_columns]
print(len(pub_metrics))
# The first row of describe() is skipped (for numeric columns that is the
# "count" row, which the print above already reports); the rest is rounded
# to three decimals.
pub_metrics.describe().iloc[1:, :].round(3)

In [None]:
# Per-column median of the four engagement counters over all rows of `data`.
engagement = data[
    [
        "public_metrics.retweet_count",
        "public_metrics.reply_count",
        "public_metrics.like_count",
        "public_metrics.quote_count",
    ]
]
engagement.median(axis=0)

In [None]:
# Number of rows (with a non-null author_id) in each conversation.
tweets_per_conversation = data.groupby("conversation_id")["author_id"].count()
# A conversation is kept when it holds more than one tweet.
has_several_tweets = tweets_per_conversation > 1
print(has_several_tweets.sum())
# Plain list of the conversation IDs that passed the filter.
conversations = tweets_per_conversation.loc[has_several_tweets].index.to_list()

In [None]:
# "Sparking" tweets: rows that (a) belong to one of the multi-tweet
# conversations, (b) have id equal to conversation_id, and (c) have a
# reply_count greater than zero. Only the four engagement counters are kept.
metric_columns = [
    "public_metrics.retweet_count",
    "public_metrics.reply_count",
    "public_metrics.like_count",
    "public_metrics.quote_count",
]
in_conversation = data["conversation_id"].isin(conversations)
starts_conversation = data["id"] == data["conversation_id"]
received_reply = data["public_metrics.reply_count"] > 0

sparking = data.loc[in_conversation & starts_conversation & received_reply, metric_columns]
print(sparking.shape)
sparking.describe().round(3)

In [None]:
# Per-column median of the engagement counters of the sparking tweets.
sparking.median(axis="index")

In [None]:
# Daily tweet volume: give every row a unit counter and sum it per date.
# The "total" column stays on `data`; the language cells further down read it.
data["total"] = 1
tweets_day = data.loc[:, ["date", "total"]].groupby("date").sum().reset_index()
tweets_day.to_excel("../analysis/all_tweets_posts_dynamics.xlsx")

# Line chart of tweets per day (title typo "converstions" corrected).
fig = px.line(
    tweets_day,
    x="date",
    y="total",
    title="Dynamics of the number of Tweets that mention ChatGPT and conversations",
    labels=dict(date="Date", total="Number of Tweets per day"),
    template="plotly_white",
    width=800,
    height=600,
)

fig.show()

In [None]:
# Order the tweets chronologically so that the first date found for each
# author_id is the earliest one.
data = data.sort_values("date", ascending=True)
first_seen = data[["author_id", "date"]].groupby("author_id").first()
unique_users = first_seen.reset_index()

print(unique_users.shape)
# Count how many authors have their first date on each day.
unique_users["total"] = 1
first_dates_per_day = unique_users[["total", "date"]].groupby("date").sum()
unique_users = first_dates_per_day.reset_index()
# Cross-check: number of distinct author_id values in `data`.
data["author_id"].nunique(dropna=False)

In [None]:
# Label both daily series, stack them into one long table (the label column
# is later used as the plot's colour key) and export the result.
tweets_day["Line color"] = "Tweets"
unique_users["Line color"] = "Unique users"
df = pd.concat([unique_users, tweets_day], axis="index")
df.to_excel("../analysis/unique_users_tweets.xlsx")

In [None]:
# One line per value of "Line color", plotted over the shared date axis.
line_options = {
    "x": "date",
    "y": "total",
    "color": "Line color",
    "title": 'Dynamics of the number of unique users/ Tweets',
    "labels": {"date": "Date", "total": "Number of unique users/Tweets per day"},
    "template": "plotly_white",
    "width": 800,
    "height": 600,
}
fig = px.line(df, **line_options)

fig.show()

### 2. Geolocation

In [None]:
# Keep only the rows that carry a geo.place_id, restricted to the columns
# listed below, and show the resulting (rows, columns).
geo_columns = [
    'created_at',
    'id',
    'lang',
    'geo.place_id',
    'geo.coordinates.type',
    'geo.coordinates.coordinates',
]
has_place = data['geo.place_id'].notna()
geo = data.loc[has_place, geo_columns]
geo.shape

In [None]:
# Distinct place IDs, in order of first appearance, and how many there are.
place_ids = pd.unique(geo["geo.place_id"])
len(place_ids)

In [None]:
# Dump the distinct place IDs to a text file, one ID per line.
with open("../data/place_ids.txt", "w") as out_file:
    out_file.writelines(f"{place_id}\n" for place_id in place_ids)

In [None]:
from tqdm import tqdm

# Flatten every JSON file found in ../data/geo/ with pd.json_normalize, stack
# the results into a single table and save it as geo.json.
path = '../data/geo/'
# Sorted so the row order of the merged table does not depend on the
# arbitrary order returned by os.listdir.
files = sorted(os.listdir(path))
if not files:
    # Without this guard an empty directory would leave the previous `geo`
    # (the tweet subset) untouched and silently save it as geo.json.
    raise FileNotFoundError(f"No files to merge in {path!r}")

# Collect the per-file frames and concatenate once; concatenating inside the
# loop re-copies the accumulated table on every iteration.
frames = []
for file in tqdm(files):
    with open(os.path.join(path, file), 'r', encoding="utf-8") as f:
        frames.append(pd.json_normalize(json.load(f)))

geo = pd.concat(frames, axis=0).reset_index(drop=True)
geo.to_json("../data/for_analysis/geo.json")

In [None]:
# Reload the merged table from disk and preview its first five rows.
geo_json_path = "../data/for_analysis/geo.json"
geo = pd.read_json(geo_json_path)
geo.head(5)

In [None]:
# Lookup {"country": {id: country}} built from the `geo` table.
geo_dict = geo.loc[:, ["id", "country"]].set_index("id").to_dict()
# Take an explicit copy of the geotagged rows: adding a column to a filtered
# slice of `data` would otherwise trigger pandas' SettingWithCopyWarning.
geo_subset = data.loc[~data["geo.place_id"].isnull()].copy()
# Swap each place ID for its country; IDs missing from the lookup are left
# unchanged by replace().
geo_subset["country"] = geo_subset["geo.place_id"].replace(geo_dict["country"])

In [None]:
# Number of geotagged tweets per value of the country column, exported to Excel.
country_counts = geo_subset["country"].value_counts()
country_counts.to_frame().to_excel("../analysis/geo_no_bots.xlsx")

### 3. Languages

In [None]:
# Tweets per language code, most frequent first (relies on the unit "total"
# column already present on `data`).
langs = data[["lang", "total"]].groupby("lang").sum().reset_index()
langs = langs.sort_values("total", ascending=False)

# Convert the counts into a percentage of all tweets, two decimals.
percent_share = langs["total"] / langs["total"].sum() * 100
langs["total"] = percent_share.round(2)

# Readable names for the listed codes; any other code is left as it is.
language_names = {
    "en": "English",
    "ja": "Japanese",
    "es": "Spanish",
    "fr": "French",
    "zh": "Chinese",
    "de": "German",
    "ar": "Arabic",
    "tr": "Turkish",
    "ko": "Korean",
    "pt": "Portuguese",
    "nl": "Dutch",
}
langs["lang"] = langs["lang"].replace(language_names)

langs.to_excel("../analysis/langs_distribution.xlsx")

# NOTE(review): the plotted rows are picked by hard-coded positions in the
# sorted table (positions 1, 2, 8 and 10 are skipped); presumably those are
# codes that are not real languages. Confirm whenever the data changes.
plotted_positions = [0, 3, 4, 5, 6, 7, 9, 11, 12, 13]
fig = px.bar(
    langs.iloc[plotted_positions, :],
    x='lang',
    y='total',
    text_auto=True,
    template="plotly_white",
    labels={"lang": "Language", "total": "Per cent of Tweets"},
    title="Top 10 languages",
)

fig.show()

In [None]:
# Daily tweet counts for lang == "ja" and lang == "en", each tagged with a
# "Group" label that the plot uses as its colour key.
jap = (
    data.loc[data["lang"] == "ja", ["date", "total"]]
    .groupby("date")
    .sum()
    .reset_index()
    .assign(Group="Japanese")
)
eng = (
    data.loc[data["lang"] == "en", ["date", "total"]]
    .groupby("date")
    .sum()
    .reset_index()
    .assign(Group="English")
)

# Long-format table: Japanese rows first, then English rows.
df = pd.concat([jap, eng], axis=0)

plot_options = {
    "x": "date",
    "y": "total",
    "color": "Group",
    "title": 'Dynamics of the number of Tweets',
    "labels": {"date": "Date", "total": "Number of Tweets per day"},
    "template": "plotly_white",
    "width": 800,
    "height": 600,
}
fig = px.line(df, **plot_options)

fig.show()