In this notebook:

1. Multiple datasets with main tweets collected by keyword search and corresponding conversations are concatenated. 

    There are 2 datasets for the tweets downloaded by keyword search (ChatGPT): 1. data_data.json is from 30 Nov 2022 to 15 Jan 2023. 2. data_31jan_data.json is from 15 Jan 2023 to 31 Jan 2023.

    There are 3 datasets for conversations: 1. conv_data.json contains the conversation tweets from 30 Nov 2022 to 15 Jan 2023 which were replies to the sparking tweets that mentioned ChatGPT. 2. conv_extended_data.json has the conversation tweets from 30 Nov 2022 to 15 Jan 2023 which were part of a conversation where at least one reply mentioned ChatGPT (the sparking tweets did not necessarily mention ChatGPT). 3. conv_31jan_data.json contains the conversations from 16 Jan 2023 to 31 Jan 2023.

2. Retweets in English were augmented with full text.

3. Bot detection based on the number of tweets (more than 1000) and on the account description and screen name.

4. Data anonymization (author_id, tweet id, conversation_id and mentions are replaced with anonymous identifiers; e-mail addresses, phone numbers and links are replaced with placeholders).


In [None]:
import json
import os
import random
import re

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px

In [None]:
# Main tweets returned by the keyword search, one file per collection period.
data_15jan, data_31jan = (
    pd.read_json(json_path)
    for json_path in (
        "../data/for_analysis/jsons_concatenated/data_data.json",
        "../data/for_analysis/jsons_concatenated/data_31jan_data.json",
    )
)

# Conversation tweets: two files for the first period, one for the second.
conv_15jan_spark, conv_15jan_reply, conv_31jan = (
    pd.read_json(json_path)
    for json_path in (
        "../data/for_analysis/jsons_concatenated/conv_data.json",
        "../data/for_analysis/jsons_concatenated/conv_extended_data.json",
        "../data/for_analysis/jsons_concatenated/conv_31jan_data.json",
    )
)

In [None]:
def to_date(date_column, conv=False):
    """Convert a column of timestamps to calendar dates.

    Parameters
    ----------
    date_column : Series-like
        Values accepted by ``pd.to_datetime``.
    conv : bool, default False
        When True the values are parsed with ``utc=True`` (everything is
        converted to UTC before the date is taken); when False they are
        parsed with pandas' defaults.

    Returns
    -------
    Series of ``datetime.date`` objects (the ``.dt.date`` of the parsed column).
    """
    parsed = pd.to_datetime(date_column, utc=True) if conv else pd.to_datetime(date_column)
    return parsed.dt.date

# The three conversation frames and the second-period keyword frame are stacked
# together; their dates are parsed through the utc=True path of to_date.
conv = pd.concat(
    [conv_15jan_spark, conv_15jan_reply, conv_31jan, data_31jan],
    ignore_index=True,
)
conv['date'] = to_date(conv['created_at'], conv=True)

# The first-period keyword tweets are parsed with pandas' default settings.
data = data_15jan.copy()
data['date'] = to_date(data['created_at'])

# Single frame holding everything, with a fresh 0..n-1 index.
data = pd.concat([data, conv], ignore_index=True)

# Drop the references to the per-file frames now that they are merged.
conv_15jan_spark = conv_15jan_reply = conv_31jan = data_31jan = data_15jan = None

data.columns

### 1. Handling retweets

In [None]:
# Remove rows that repeat the same (text, id, conversation_id) triple.
data = data.drop_duplicates(subset=["text", "id", "conversation_id"]).reset_index(drop=True)

# A row is treated as a retweet when its text starts with "RT " AND it
# references another tweet. The mask is computed once so that `retweets` and
# `originals` are exact complements of each other.
# na=False: a row with missing text counts as "not a retweet" instead of
# putting NaN into the mask (which cannot be negated or used for indexing).
is_retweet = data.text.str.startswith("RT ", na=False) & data.referenced_tweets.notnull()

retweets = data.loc[is_retweet].copy()
# Only id/text are needed here: this frame becomes the id -> text lookup.
originals = data.loc[~is_retweet, ["id", "text"]].copy().set_index("id")

In [None]:
# Collapse the id-indexed frame into a plain {tweet id: text} dictionary.
# NOTE: this rebinds `originals` from a DataFrame to a dict, so the cell is
# meant to run exactly once after the cell that builds the frame.
originals = originals["text"].to_dict()
len(originals)

In [None]:
text_rt = []    # text of the referenced tweet, or None when it is not in `originals`
not_found = []  # id of the referenced tweet when the lookup failed, otherwise None
repl_retw = []  # "type" field of the reference that was inspected

for refs in retweets["referenced_tweets"]:
    # Only the first reference is inspected, however many the tweet carries.
    first_ref = refs[0]
    ref_id = int(first_ref['id'])
    repl_retw.append(first_ref['type'])

    # Explicit membership test instead of a bare `except:` so that unrelated
    # errors (malformed reference, bad id) surface instead of being recorded
    # as "not found".
    if ref_id in originals:
        text_rt.append(originals[ref_id])
        not_found.append(None)
    else:
        text_rt.append(None)
        not_found.append(ref_id)

In [None]:
retweets["text_rt"] = text_rt
retweets["repl_retw"] = repl_retw
# `not_found` mixes None with very large integer tweet ids. Assigning the bare
# list lets pandas infer float64 for such a mix, and float64 cannot represent
# ids of this magnitude exactly. An object-dtype Series keeps the Python ints
# untouched; isnull(), to_list() and the later astype('int64') all still work.
retweets["not_found"] = pd.Series(not_found, index=retweets.index, dtype="object")

In [None]:
# Number of retweets whose referenced tweet was not available locally.
retweets.not_found.notnull().sum()

In [None]:
# Tweets in ENGLISH only: collect the ids of referenced tweets that were not found.
missing = retweets.loc[
    retweets["not_found"].notnull() & (retweets["lang"] == "en"),
    "not_found",
].to_list()
print(len(missing))   # number of English retweets with a missing source
missing = set(missing)
print(len(missing))   # number of distinct missing ids
# One id per line.
with open("../data/missing_retweets.txt", "w") as f:
    f.writelines(f"{int(tweet_id)}\n" for tweet_id in missing)

In [None]:
# Read the id list back (one integer per line).
with open("../data/missing_retweets.txt", "r") as f:
    missing = [int(line.rstrip('\n')) for line in f]
len(missing)

In [None]:
# The reference type is not used past this point.
retweets = retweets.drop(columns="repl_retw")

### 2. Data augmentation with missing tweets

In [None]:
from tqdm import tqdm

path='../data/missing_tweets/'
# Sorted so that the row order of the result does not depend on the
# arbitrary order in which the OS lists the directory.
files = sorted(os.listdir(path))

# Collect one frame per file and concatenate once at the end; concatenating
# inside the loop re-copies the accumulated frame on every iteration.
frames = []
for file in tqdm(files):
    with open(path+file, 'r',encoding="utf-8") as f:
        temp = json.load(f)
    # The file is closed before the (potentially slow) normalisation.
    frames.append(pd.json_normalize(temp))

# pd.concat raises ValueError if the directory contained no files.
mis = pd.concat(frames, axis=0).reset_index(drop=True)
mis.to_json("../data/for_analysis/missing_retweets.json")

In [None]:
# Texts of the separately downloaded tweets, reduced to (id, text_rt).
mis = pd.read_json("../data/for_analysis/missing_retweets.json")
mis = mis.loc[:,["id","text"]]
mis.columns = ["id","text_rt"]
mis.id = mis.id.astype('int64')
# Keep one row per id: a repeated id would multiply rows in the later
# left merge of the unresolved retweets on `not_found`.
mis = mis.drop_duplicates(subset="id").reset_index(drop=True)

# Non-retweets (full rows this time): their own text doubles as `text_rt`
# and -1 marks "no missing reference".
# na=False: rows with missing text are treated as non-retweets.
is_retweet = data.text.str.startswith("RT ", na=False) & data.referenced_tweets.notnull()
originals = data.loc[~is_retweet].copy()
originals["text_rt"] = originals["text"]
originals["not_found"] = -1

# Same -1 sentinel for retweets whose source was found, so the column can be
# stored as int64.
retweets["not_found"] = retweets["not_found"].fillna(-1).astype('int64')

# Split by whether the referenced tweet still has to be looked up in `mis`.
retweets_not_found = retweets.loc[retweets.not_found != -1].copy()
retweets_found = retweets.loc[retweets.not_found == -1].copy()

In [None]:
# Row count vs. number of distinct ids in the downloaded tweets.
print(len(mis.id))
print(mis.id.nunique(dropna=False))

In [None]:
# Row count vs. number of distinct referenced ids among unresolved retweets.
print(len(retweets_not_found.not_found))
print(retweets_not_found.not_found.nunique(dropna=False))

In [None]:
# Left merge: every unresolved retweet is kept; `text_rt` from `mis` is filled
# in where the referenced id was downloaded. Overlapping column names get the
# default _x (left) / _y (right) suffixes.
merged = retweets_not_found.merge(mis, how="left", left_on="not_found", right_on="id")
merged.shape

In [None]:
# Keep the downloaded text (right side) and the retweet's own id (left side),
# then drop the suffixed helper columns produced by the merge.
merged = (
    merged
    .assign(text_rt=merged["text_rt_y"], id=merged["id_x"])
    .drop(columns=["id_x", "id_y", "text_rt_y", "text_rt_x"])
)
# Number of retweets whose text could be recovered from the download.
merged["text_rt"].notnull().sum()

In [None]:
# Retweets resolved locally plus retweets resolved (where possible) via download.
retweets_augm = pd.concat([retweets_found, merged])
# Sanity check: the merge must not have added or lost rows.
len(retweets_augm) == len(retweets)

In [None]:
# Combine the non-retweets with the AUGMENTED retweets. Using `retweets` here
# would silently discard the texts merged in from the downloaded tweets.
data_augm = pd.concat([originals, retweets_augm], axis=0)
# Sanity check: same number of rows as the input frame.
data.shape[0] == data_augm.shape[0]

In [None]:
# Rows that still have no full text after augmentation.
data_augm.text_rt.isnull().sum()

In [None]:
# Renumber the rows 0..n-1 and persist the augmented frame; the next section
# reloads it from this pickle.
data_augm = data_augm.reset_index(drop=True)
data_augm.to_pickle("../data/for_analysis/data_augm.pkl")

### 3. Bot Detection by Description

In [None]:
# Reload the augmented data written at the end of the previous section.
data = pd.read_pickle("../data/for_analysis/data_augm.pkl")
print(data.shape)
# Drop rows repeating the same (text, id) pair and renumber the index.
data = data.drop_duplicates(subset=['text','id']).reset_index(drop=True)
data.shape

In [None]:
# Number of distinct authors.
data.author_id.nunique(dropna=False)

In [None]:
# Distinct authors with at least one tweet tagged as English.
authors_eng = data.loc[data["lang"].eq("en"), "author_id"].unique()
len(authors_eng)

In [None]:
# Constant helper column: summing it per author gives that author's tweet count.
data['total'] = 1
potential_bots = data.groupby("author_id")[["total"]].sum()
# The 20 most active accounts.
potential_bots.sort_values(by="total", ascending=False).head(20)

In [None]:
# Distribution of tweet counts among accounts with more than 1000 tweets.
potential_bots.loc[potential_bots["total"] > 1000, "total"].hist()

In [None]:
# Accounts with more than 10 tweets are exported for a profile lookup.
author_ids = potential_bots.query("total > 10").index.to_list()
print("Number of accounts that tweeted more than 10 tweets: ", len(author_ids))

# One author id per line.
with open("../data/potential_bots.txt", "w") as f:
    f.writelines(f"{int(author_id)}\n" for author_id in author_ids)

In [None]:
from tqdm import tqdm

path='../data/potential_bots/'
# Sorted so that the row order of the result does not depend on the
# arbitrary order in which the OS lists the directory.
files = sorted(os.listdir(path))

# Collect one frame per file and concatenate once at the end; concatenating
# inside the loop re-copies the accumulated frame on every iteration.
frames = []
for file in tqdm(files):
    with open(path+file, 'r',encoding="utf-8") as f:
        temp = json.load(f)
    # The file is closed before the (potentially slow) normalisation.
    frames.append(pd.json_normalize(temp))

# pd.concat raises ValueError if the directory contained no files.
users = pd.concat(frames, axis=0).reset_index(drop=True)
users.to_json("../data/for_analysis/users.json")

In [None]:
# Reload the user objects written by the previous cell, so the cells below can
# run without repeating the per-file load.
users = pd.read_json("../data/for_analysis/users.json")
users.shape

In [None]:
# Flag accounts whose description contains "bot" as a separate token:
# not preceded by a letter, and followed by a non-word character OR the end of
# the text (so a description that simply ends in "bot" is caught as well).
# Descriptions that are not strings (e.g. missing) are never flagged.
users["bot"] = [
    1 if isinstance(descr, str) and re.search(r"(?<![a-zA-Z])bot(?:\W|$)", descr.lower()) else 0
    for descr in users.description
]
sum(users.bot==1)

In [None]:
# Flag accounts whose screen name contains "app" not followed by a letter.
# A negative lookahead is used so that a screen name ENDING in "app" matches
# too (a positive lookahead for a non-letter needs a following character).
# Screen names that are not strings are never flagged.
users["app"] = [
    1 if isinstance(scr, str) and re.search(r"app(?![a-zA-Z])", scr.lower()) else 0
    for scr in users.screen_name
]
sum(users["app"]==1)

In [None]:
# An account is marked as a bot if either the description or the screen-name
# rule fired.
users["bot"] = ((users["bot"] + users["app"]) > 0).astype("int64")
(users["bot"] == 1).sum()

In [None]:
# Persist the flagged user objects.
bots = users[users["bot"].eq(1)]
bots.to_json("../data/for_analysis/bots_2.json")

In [None]:
# Accounts with more than 1000 tweets are treated as bots by activity alone.
bots_active = potential_bots.query("total > 1000").index.to_list()
len(bots_active)

In [None]:
# Union of activity-based and profile-based bot ids.
# NOTE: `bots` was a DataFrame in an earlier cell; from here on it is a set of ids.
bots = set(bots_active).union(users.loc[users["bot"] == 1, "id"].to_list())
len(bots)

### 4. Anonymization

In [None]:
data["total"] = 1
print("The number of tweets: ", len(data))
print("The number of unique users: ", data.author_id.nunique(dropna=False))
# Distinct non-null conversation ids.
print("The number of conversation ids: ", data.conversation_id.nunique())

In [None]:
# 1.1. Replace the Twitter-assigned tweet, user, and conversation identifiers
#      with random, numeric, anonymous identifiers.
# Tweet ids and conversation ids share one mapping, so a conversation id that is
# also a tweet id in the data receives the same surrogate.
ids_tweets = set(data['id'].to_list()+data["conversation_id"].to_list())
ids_authors = set(data['author_id'].to_list())

# Surrogates are a random permutation of 0..n-1 drawn from the OS entropy
# source. Enumerating the set directly would tie every surrogate to the set's
# iteration order, which follows from the ids themselves rather than from a
# random source. The mapping therefore differs between runs; every artefact
# that uses the surrogate ids has to be written within the same run.
_id_rng = random.SystemRandom()
replace_tweets = dict(zip(ids_tweets, _id_rng.sample(range(len(ids_tweets)), len(ids_tweets))))
# NOTE: the name `replace_authors` is rebound to a function further down in the
# notebook, so this cell has to run before that definition.
replace_authors = dict(zip(ids_authors, _id_rng.sample(range(len(ids_authors)), len(ids_authors))))

# Keep the real identifiers next to the surrogates (used later to map the bot
# list, which is expressed in real ids).
data['author_id_real'] = data['author_id']
data['id_real'] = data['id']
data["conversation_id_real"] = data['conversation_id']


data['author_id'] = [replace_authors[i] for i in data['author_id']]
data['id'] = [replace_tweets[i] for i in data['id']]
data["conversation_id"] = [replace_tweets[i] for i in data['conversation_id']]

In [None]:
# Rows without a full text.
data['text_rt'].isnull().sum()

In [None]:
# Rows with a full text.
data.text_rt.notnull().sum()

In [None]:
# The text-cleaning helpers below operate on strings, so missing texts become "".
for _text_col in ('text_rt', 'text'):
    data[_text_col] = data[_text_col].fillna("")

In [None]:
# 1.2. Replace email addresses, links and phone numbers with placeholders (e.g., <tel> for telephone numbers).
def replace_email_phone_links(text):
    """Mask e-mail addresses, URLs and phone-like digit groups in ``text``.

    Replacements are applied in this order:

    1. e-mail addresses      -> ``<email>``
    2. http/https URLs       -> ``<link>``  (the whole URL up to the next
       whitespace, including path and query, so that e.g. a short-link code
       does not remain in the text)
    3. phone-like digit runs -> ``<tel>``   (10-digit, "(ddd) ddd-dddd" and
       7-digit forms, with optional ``-``, ``.`` or whitespace separators)

    URLs are handled before phone numbers so that digits inside a URL are not
    turned into ``<tel>`` in the middle of the link.

    Parameters
    ----------
    text : str

    Returns
    -------
    str
        ``text`` with the matches replaced by their placeholders.
    """
    # Raw strings: "\d", "\s", "\." are regex escapes, not Python string escapes.
    text = re.sub(r'([A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,})', "<email>", text)
    text = re.sub(r'https?://\S+', "<link>", text)
    text = re.sub(r'(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})', "<tel>", text)
    return text


# Mask e-mails, phone numbers and links in both text columns.
data['text_rt'] = data['text_rt'].map(replace_email_phone_links)
data['text'] = data['text'].map(replace_email_phone_links)

In [None]:
# Rows whose full text contains at least one <tel> placeholder.
data['text_rt'].str.contains("<tel>").sum()

In [None]:
# Rows whose full text contains at least one <email> placeholder.
data['text_rt'].str.contains("<email>").sum()

In [None]:
# Rows whose full text contains at least one <link> placeholder.
data['text_rt'].str.contains("<link>").sum()

In [None]:
# 1.3. Replace every mention ("@" followed by non-whitespace characters) with an
#      anonymous placeholder of the form "anon<N>".
# Collect the distinct mention strings from both text columns (text first, then text_rt).
unique_mentions = {
    mention
    for column in (data.text, data.text_rt)
    for text in column.to_list()
    for mention in re.findall(r'@\S+', text)
}
replacement = ["anon"+str(i) for i in range(len(unique_mentions))]
# Mention string -> placeholder, paired in the set's iteration order.
replace_authors_name = dict(zip(unique_mentions, replacement))

In [None]:
def replace_authors(text, mapping=None):
    """Replace every mention in ``text`` with its anonymous placeholder.

    A mention is "@" followed by one or more non-whitespace characters. Each
    match is replaced in a single pass by the placeholder of exactly that
    match, so a mention that is a prefix of another one (``@bob`` /
    ``@bobby``) can no longer corrupt the longer mention.

    Parameters
    ----------
    text : str
        Text to anonymise.
    mapping : dict, optional
        Mention string -> placeholder. Defaults to the notebook-level
        ``replace_authors_name`` dictionary.

    Returns
    -------
    str
        ``text`` with all mentions replaced (unchanged if it has none).

    Raises
    ------
    KeyError
        If a mention found in ``text`` is not a key of the mapping.
    """
    if mapping is None:
        mapping = replace_authors_name
    return re.sub(r'@\S+', lambda match: mapping[match.group(0)], text)

# Anonymise mentions in both text columns; the message marks the half-way point.
data['text'] = data['text'].map(replace_authors)
print("text is finished")
data['text_rt'] = data['text_rt'].map(replace_authors)

In [None]:
# Visual check of the anonymised text column.
data["text"]

In [None]:
# Translate the bot list (real author ids) into the anonymised author ids.
bot_fake_ids = set(data.loc[data.author_id_real.isin(bots), "author_id"].to_list())
# One anonymised id per line.
with open("../data/for_analysis/bot_fake_ids_2.txt", "w") as f:
    f.writelines(f"{i}\n" for i in bot_fake_ids)

In [None]:
# Number of tweets written by accounts on the bot list.
int(data.author_id_real.isin(bots).sum())

In [None]:
# Persist the processed frame.
# NOTE(review): at this point `data` still carries the author_id_real, id_real
# and conversation_id_real columns added during anonymisation — confirm they
# are dropped before this file is shared.
data.to_pickle("../data/for_analysis/data0.pkl")