In this notebook, exploratory analysis of the data was done:

1. Descriptive statistics for all tweets and sparking tweets only. Dynamics of the number of posts per day. Dynamics of the number of unique users.
2. Geolocation of tweets.
3. Language of tweets distribution.

In [1]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import os
import json
import plotly.express as px

data = pd.read_pickle("../data/for_analysis/data0.pkl")
data.shape

(16830997, 36)

In [2]:
bots=[]
with open("../data/for_analysis/bot_fake_ids_2.txt","r") as f:
    for line in f:
        bots.append(int(line.strip()))

data = data.loc[~data.author_id.isin(bots)]
data.shape


(16750608, 36)

In [3]:
spam=[]
with open("../data/spam_fake_conv_ids.txt","r") as f:
    for line in f:
        spam.append(int(line.strip()))

data = data.loc[~data.conversation_id.isin(spam)]
data.shape

(16743036, 36)

In [4]:
print("Number of unique users: ",len(data.author_id.unique()))

Number of unique users:  5537942


### 1. Descriptive Statistics

In [5]:
pub_metrics = data.loc[:,["public_metrics.retweet_count","public_metrics.reply_count","public_metrics.like_count",
                        "public_metrics.quote_count"]]
print(pub_metrics.shape[0])                       
round(pub_metrics.describe().iloc[1:,:],3)

16743036


Unnamed: 0,public_metrics.retweet_count,public_metrics.reply_count,public_metrics.like_count,public_metrics.quote_count
mean,229.981,0.391,4.568,0.036
std,1688.009,13.761,236.468,3.449
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0
75%,0.0,0.0,1.0,0.0
max,35579.0,17185.0,176848.0,4612.0


In [6]:
data.loc[:,["public_metrics.retweet_count","public_metrics.reply_count","public_metrics.like_count",
                        "public_metrics.quote_count"]].median(axis=0)

public_metrics.retweet_count    0.0
public_metrics.reply_count      0.0
public_metrics.like_count       0.0
public_metrics.quote_count      0.0
dtype: float64

In [7]:
conversations = data.groupby(["conversation_id"])["author_id"].count()
print(sum(conversations>1))
conversations = conversations[conversations>1].index.to_list()

189805


In [8]:
sparking = data.loc[data.conversation_id.isin(conversations)]
sparking = sparking.loc[(sparking.id == sparking.conversation_id)&(sparking["public_metrics.reply_count"]>0)]
sparking = sparking.loc[:,["public_metrics.retweet_count","public_metrics.reply_count","public_metrics.like_count",
                        "public_metrics.quote_count"]]
print(sparking.shape)
round(sparking.describe(),3)

(125151, 4)


Unnamed: 0,public_metrics.retweet_count,public_metrics.reply_count,public_metrics.like_count,public_metrics.quote_count
count,125151.0,125151.0,125151.0,125151.0
mean,13.026,6.556,76.837,1.708
std,186.667,100.024,972.979,27.571
min,0.0,1.0,0.0,0.0
25%,0.0,1.0,1.0,0.0
50%,0.0,1.0,4.0,0.0
75%,2.0,3.0,16.0,0.0
max,35568.0,17185.0,159790.0,4612.0


In [9]:
sparking.median(axis=0)

public_metrics.retweet_count    0.0
public_metrics.reply_count      1.0
public_metrics.like_count       4.0
public_metrics.quote_count      0.0
dtype: float64

In [10]:
data["total"] = 1
tweets_day = data.loc[:,["date","total"]].groupby("date").sum().reset_index()
tweets_day.to_excel("../analysis/all_tweets_posts_dynamics.xlsx")

fig = px.line(tweets_day, x="date", y="total", title='Dynamics of the number of Tweets that mention ChatGPT and converstions',
                labels=dict(date="Date", total="Number of Tweets per day"),template="plotly_white",
                 width=800, height=600)

fig.show()

In [11]:
data = data.sort_values("date",ascending=True)
unique_users = data.loc[:,["author_id","date"]].groupby(["author_id"]).first().reset_index()

print(unique_users.shape)
unique_users['total'] = 1
unique_users = unique_users.loc[:,["total","date"]].groupby("date").sum().reset_index()
len(data.author_id.unique())

(5537942, 2)


5537942

In [12]:
unique_users["Line color"] = "Unique users" 
tweets_day['Line color'] = "Tweets"
df = pd.concat([unique_users,tweets_day],axis=0)
df.to_excel("../analysis/unique_users_tweets.xlsx")

In [13]:
fig = px.line(df, x="date", y="total", title='Dynamics of the number of unique users/ Tweets',color="Line color",
                labels=dict(date="Date", total="Number of unique users/Tweets per day"),template="plotly_white",
                 width=800, height=600)

fig.show()

### 2. Geolocation

In [14]:
geo = data.loc[~data['geo.place_id'].isnull(),['created_at','id','lang','geo.place_id', 'geo.coordinates.type','geo.coordinates.coordinates']]
geo.shape

(160260, 6)

In [15]:
place_ids = geo["geo.place_id"].unique()
len(place_ids)

17748

In [None]:
with open("../data/place_ids.txt","w") as f:
    for place_id in place_ids:
        f.write(str(place_id))
        f.write("\n")

In [None]:
from tqdm import tqdm
path='../data/geo/'
files = os.listdir(path)

count=0
for file in tqdm(files):
    with open(path+file, 'r',encoding="utf-8") as f:
        temp = json.load(f)
        if count==0:
            geo = pd.json_normalize(temp)
            count+=1
        else:
            df = pd.json_normalize(temp)
            geo = pd.concat([geo,df],axis=0)
geo = geo.reset_index(drop=True)
geo.to_json("../data/for_analysis/geo.json")

In [16]:
geo=pd.read_json("../data/for_analysis/geo.json")
geo.head()

Unnamed: 0,id,name,full_name,country,country_code,url,place_type,centroid,contained_within,polylines,...,attributes.190533:id,attributes.567718:targetable,attributes.162763:id,attributes.162772:state_id,attributes.162813:id,attributes.162772:cousub_id,attributes.random_attr,attributes.162768:id,attributes.567718:metro,attributes.162834:id
0,1f4df2b4746ddea7,Brampton,"Brampton, Ontario",Canada,CA,https://api.twitter.com/1.1/geo/id/1f4df2b4746...,city,"[-79.743531925, 43.725062]","[{'id': '0192e4d84ed0ea4e', 'name': 'Toronto',...",[],...,,,,,,,,,,
1,80cf9987ff7e9762,Lake Elsinore,"Lake Elsinore, CA",United States,US,https://api.twitter.com/1.1/geo/id/80cf9987ff7...,city,"[-117.3451391257, 33.665765]","[{'id': 'e4315a8602024dcb', 'name': 'LOS ANGEL...",[],...,,,,,,,,,,
2,018e2bf71a3ef896,Prague,"Prague, Czech Republic",Czech Republic,CZ,https://api.twitter.com/1.1/geo/id/018e2bf71a3...,city,"[14.4464730987, 50.05957115]","[{'id': '0126ba1e341b038c', 'name': 'Prague', ...",[],...,,,,,,,,,,
3,3daa1b987e17d095,Dalton Piercy,"Dalton Piercy, England",United Kingdom,GB,https://api.twitter.com/1.1/geo/id/3daa1b987e1...,city,"[-1.2808998182, 54.67351755]","[{'id': '67bc7263f7b9047b', 'name': 'North Eas...",[],...,,,,,,,,,,
4,4ca13653c1a41e50,Nyköping,"Nyköping, Sverige",Sweden,SE,https://api.twitter.com/1.1/geo/id/4ca13653c1a...,city,"[16.9742389867, 58.72823315]","[{'id': '2258e8caf1fed45f', 'name': 'Södermanl...",[],...,,,,,,,,,,


In [17]:
geo_dict = geo.loc[:,["id","country"]].set_index("id").to_dict()
geo_subset = data.loc[~data["geo.place_id"].isnull()]
geo_subset["country"] = geo_subset["geo.place_id"].replace(geo_dict["country"])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [18]:
pd.DataFrame(geo_subset.country.value_counts()).to_excel("../analysis/geo_no_bots.xlsx")

### 3. Languages

In [19]:
langs = data.loc[:,["lang","total"]].groupby("lang").sum().reset_index()
langs = langs.sort_values("total",ascending=False)
langs["total"] = round(langs["total"]/langs["total"].sum() *100,2)
langs["lang"]=langs["lang"].replace({"en":"English","ja":"Japanese","es":"Spanish","fr":"French","zh":"Chinese",
                                        "de":"German","ar":"Arabic","tr":"Turkish","ko":"Korean","pt":"Portuguese","nl":"Dutch"})

langs.to_excel("../analysis/langs_distribution.xlsx")

fig = px.bar(langs.iloc[[0,3,4,5,6,7,9,11,12,13],:], x='lang', y='total',text_auto=True,template="plotly_white",
                labels=dict(lang="Language", total="Per cent of Tweets"),
                title = "Top 10 languages")

fig.show()

In [20]:
jap = data.loc[data.lang == "ja",["date","total"]].groupby("date").sum().reset_index()
eng = data.loc[data.lang == "en",["date","total"]].groupby("date").sum().reset_index()

jap["Group"] = "Japanese" 
eng['Group'] = "English"

df = pd.concat([jap,eng],axis=0)

fig = px.line(df, x="date", y="total", title='Dynamics of the number of Tweets',color="Group",
                labels=dict(date="Date", total="Number of Tweets per day"),template="plotly_white",
                 width=800, height=600)

fig.show()