In [None]:
import json
import pandas as pd
import scipy.stats

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick

In [None]:
# Use a larger base font so labels stay legible in the exported figures.
plt.rcParams['font.size'] = 14

In [None]:
# Palette used by the figures below: each hue is paired with a lighter tint.
blue, light_blue = '#1f77b4', '#aec7e8'
orange, light_orange = '#ff7f0e', '#ffbb78'
gray, light_gray = '#60636a', '#a5acaf'

# Load the tweets and bot scores

In [None]:
def load_json_objects(path_to_file):
    """
    Load JSON objects from a .jsons (JSON Lines) file.

    Each non-blank line of the file must be one serialized JSON object.
    Blank or whitespace-only lines (for example a trailing empty line at
    the end of the file) are skipped.

    Parameters
    ----------
    path_to_file : str or path-like
        Location of the .jsons file to read.

    Returns
    -------
    list
        The deserialized objects, in the order they appear in the file.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly: the default text encoding depends on the
    # platform and can fail on non-ASCII text.
    with open(path_to_file, encoding="utf-8") as f:
        # json.loads rejects empty strings, so blank lines are filtered out.
        return [json.loads(line) for line in f if line.strip()]

In [None]:
# $SHIB: the tweets file first, then the bot-score file.
shib_tweets, shib_bot_scores = map(
    load_json_objects,
    ("../data/shib_tweets.jsons", "../data/shib_bot_scores.jsons"),
)

In [None]:
# $FLOKI: the tweets file first, then the bot-score file.
floki_tweets, floki_bot_scores = map(
    load_json_objects,
    ("../data/floki_tweets.jsons", "../data/floki_bot_scores.jsons"),
)

In [None]:
# $AAPL: the tweets file first, then the bot-score file.
aapl_tweets, aapl_bot_scores = map(
    load_json_objects,
    ("../data/aapl_tweets.jsons", "../data/aapl_bot_scores.jsons"),
)

# Process the tweets and bot scores

In [None]:
# One [tweet id, author id, cashtag] row per tweet, in shib -> floki -> aapl order.
tweets_by_cashtag = [
    ('shib', shib_tweets),
    ('floki', floki_tweets),
    ('aapl', aapl_tweets),
]
tweet_info = [
    [status['id_str'], status['user']['id_str'], cashtag]
    for cashtag, statuses in tweets_by_cashtag
    for status in statuses
]

tweet_info_df = pd.DataFrame(tweet_info, columns=['tid', 'user_id', 'cashtag'])

In [None]:
# One [user id, majority language, english score, universal score] row per
# bot-score record, in shib -> floki -> aapl order.
bot_scores = [
    [
        record['user']['user_data']['id_str'],
        record['user']['majority_lang'],
        record['raw_scores']['english']['overall'],
        record['raw_scores']['universal']['overall'],
    ]
    for records in (shib_bot_scores, floki_bot_scores, aapl_bot_scores)
    for record in records
]

bot_scores_df = pd.DataFrame(bot_scores, columns=['user_id', 'lang', 'eng', 'uni'])

In [None]:
# Keep a single row per user (the first occurrence).
bot_scores_df = bot_scores_df.drop_duplicates(subset=['user_id'])

In [None]:
# Attach each author's bot scores to their tweets; the inner join drops tweets
# whose author has no row in bot_scores_df.
tweet_with_bot_score_df = pd.merge(
    tweet_info_df,
    bot_scores_df,
    how='inner',
    on='user_id',
)

# Analyze the data

## Check the number of tweets and unique users

In [None]:
# Distinct tweet ids and distinct authors for each cashtag.
tweet_info_df.groupby("cashtag")[['tid', 'user_id']].nunique()

## Check the use of different languages

In [None]:
# Number of deduplicated users per majority language, most frequent first.
lang_freq = bot_scores_df['lang'].value_counts()

In [None]:
# Display names for the language codes that appear in the bot-score records.
lang_code_map = {
    'en': "English",
    'ja': "Japanese",
    'und': "Unknown",
    'es': "Spanish",
    'tr': "Turkish",
    'ar': "Arabic",
    'fr': "French",
    # Twitter reports Indonesian with the legacy code 'in'; Hindi is 'hi'.
    'in': "Indonesian",
    'hi': "Hindi",
    'pt': "Portuguese",
    'it': "Italian"
}

In [None]:
# Build pie-chart labels and counts: the six most frequent languages each get
# their own slice, everything else is pooled into "Others".
lang_total = lang_freq.sum()
other_count = lang_freq.tail(-6).sum()  # tail(-6) = every row except the first six

lang_labels = []
lang_freq_count = []
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for code, freq in lang_freq.head(6).items():
    # Fall back to the raw code so an unmapped language is not labelled "None".
    lang_name = lang_code_map.get(code, code)
    lang_labels.append(f"{lang_name} ({freq / lang_total*100:.1f}%)")
    lang_freq_count.append(freq)

lang_labels.append(f"Others ({other_count / lang_total*100:.1f}%)")
lang_freq_count.append(other_count)

In [None]:
# Pastel wedge colours: one per top language plus one for the "Others" slice.
colors = [
    "#aec7e8",
    "#ffbb78",
    "#98df8a",
    "#ff9896",
    "#c5b0d5",
    "#c49c94",
    "#f7b6d2",
]

fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(
    lang_freq_count,
    labels=lang_labels,
    colors=colors,
    counterclock=False,
    startangle=-35,
)
# Equal aspect ratio keeps the pie circular.
ax.axis('equal');
# NOTE(review): the file name is spelled "freqency"; kept as-is in case it is referenced elsewhere.
fig.savefig("figures/language_freqency.pdf")

## Check the bot score distribution

In [None]:
# (panel index, cashtag key, colour, panel title) for each distribution.
dist_to_plot = [
    [1, 'shib', light_blue, '$SHIB'],
    [2, 'floki', light_orange, '$FLOKI'],
    [3, 'aapl', light_gray, '$AAPL']
]

# Three stacked histograms of the English bot score, one per cashtag.
# sharex=True makes all panels use the same x-range, so the tick labels shown
# only on the bottom panel are valid for the panels above it as well (it also
# hides the tick labels of the upper panels).
fig, axes = plt.subplots(3, 1, figsize=(5.5, 4.5), sharex=True)
for index, key, color, title in dist_to_plot:
    ax = axes[index - 1]
    ax.hist(
        tweet_with_bot_score_df.query(f'cashtag == "{key}"').eng,
        bins=50,
        color=color,
    );
    ax.annotate(title, xy=(0.03, 0.7), xycoords='axes fraction')
    if index == 2:
        # A single y-label on the middle panel serves all three.
        ax.set_ylabel("Frequency")
    if index == 3:
        ax.set_xlabel('Bot score')

    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    # Fixed y-range so bar heights are comparable across panels.
    ax.set_ylim([0, 150])
fig.tight_layout()
fig.savefig("figures/bot_score_dist.pdf")

## Boxplot

In [None]:
# Horizontal boxplots of the English bot score per cashtag, with the mean drawn
# as a small white dot inside each box.
plt.figure(figsize=(5, 4.5))
for index, key, color, title in dist_to_plot:
    box = plt.boxplot(
        [
            tweet_with_bot_score_df.query(f'cashtag == "{key}"').eng
        ],
        positions=[4 - index],  # index 1 is drawn at y=3 (top), index 3 at y=1
        widths=0.3,
        patch_artist=True,
        notch=False,
        whis=(5, 95),  # whiskers at the 5th and 95th percentiles
        vert=False,
        showmeans=True
    );
    # Paint every element in the cashtag colour, then override specific parts.
    for item in ['boxes', 'whiskers', 'fliers', 'medians', 'caps']:
        plt.setp(box[item], color=color)
    plt.setp(box['medians'], color='white', lw=1.2)
    plt.setp(box["boxes"], facecolor=color)
    plt.setp(box["fliers"], markerfacecolor=color, markeredgecolor='white', markersize=3, markeredgewidth=0.5)
    plt.setp(box['whiskers'], lw=2.5)
    plt.setp(box['caps'], lw=2.5)
    plt.setp(box['means'], marker='o', markerfacecolor='white', markeredgecolor='white', markersize=3)
plt.gca().xaxis.grid(which="major", color='gray', linestyle='--', linewidth=1, alpha=0.2)

plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['bottom'].set_visible(False)
plt.gca().spines['left'].set_visible(False)

plt.yticks([3, 2, 1], ['$SHIB', '$FLOKI', '$AAPL'])
plt.xlabel('Bot score')

plt.tight_layout()
# Saved under its own name: the following cell writes
# "figures/bot_score_boxplot.pdf", which would otherwise overwrite this figure.
plt.savefig("figures/bot_score_boxplot_with_means.pdf")

In [None]:
# Horizontal boxplots of the English bot score per cashtag (no mean marker).
fig, ax = plt.subplots(figsize=(5, 4.5))
for index, key, color, title in dist_to_plot:
    scores = tweet_with_bot_score_df.query(f'cashtag == "{key}"').eng
    box = ax.boxplot(
        [scores],
        positions=[4 - index],
        widths=0.3,
        patch_artist=True,
        notch=False,
        whis=(5, 95),
        vert=False,
    );
    # Base colour for every element first, then the per-element overrides.
    for part in ('boxes', 'whiskers', 'fliers', 'medians', 'caps'):
        plt.setp(box[part], color=color)
    plt.setp(box['medians'], color='white', lw=1.2)
    plt.setp(box["boxes"], facecolor=color)
    plt.setp(
        box["fliers"],
        markerfacecolor=color,
        markeredgecolor='white',
        markersize=3,
        markeredgewidth=0.5,
    )
    for part in ('whiskers', 'caps'):
        plt.setp(box[part], lw=2.5)

ax.xaxis.grid(which="major", color='gray', linestyle='--', linewidth=1, alpha=0.2)

# Remove the frame entirely; only the light vertical grid remains.
for side in ('top', 'right', 'bottom', 'left'):
    ax.spines[side].set_visible(False)

ax.set_yticks([3, 2, 1])
ax.set_yticklabels(['$SHIB', '$FLOKI', '$AAPL'])
ax.set_xlabel('Bot score')

fig.tight_layout()
fig.savefig("figures/bot_score_boxplot.pdf")

In [None]:
# Two-sample t-test on the English bot score: $SHIB tweets vs. $FLOKI tweets.
shib_eng = tweet_with_bot_score_df.query('cashtag == "shib"').eng
floki_eng = tweet_with_bot_score_df.query('cashtag == "floki"').eng
scipy.stats.ttest_ind(shib_eng, floki_eng)

In [None]:
# Two-sample t-test on the English bot score: $SHIB tweets vs. $AAPL tweets.
shib_eng = tweet_with_bot_score_df.query('cashtag == "shib"').eng
aapl_eng = tweet_with_bot_score_df.query('cashtag == "aapl"').eng
scipy.stats.ttest_ind(shib_eng, aapl_eng)

In [None]:
# Two-sample t-test on the English bot score: $FLOKI tweets vs. $AAPL tweets.
floki_eng = tweet_with_bot_score_df.query('cashtag == "floki"').eng
aapl_eng = tweet_with_bot_score_df.query('cashtag == "aapl"').eng
scipy.stats.ttest_ind(floki_eng, aapl_eng)

## Different thresholds

In [None]:
# Number of scored $SHIB tweets whose author's English bot score exceeds 0.5.
tweet_with_bot_score_df.query('cashtag == "shib" and eng > 0.5').shape[0]

In [None]:
# Number of scored $FLOKI tweets whose author's English bot score exceeds 0.5.
tweet_with_bot_score_df.query('cashtag == "floki" and eng > 0.5').shape[0]

In [None]:
# Number of scored $AAPL tweets whose author's English bot score exceeds 0.5.
tweet_with_bot_score_df.query('cashtag == "aapl" and eng > 0.5').shape[0]

In [None]:
# Number of scored $SHIB tweets whose author's English bot score exceeds 0.7.
tweet_with_bot_score_df.query('cashtag == "shib" and eng > 0.7').shape[0]

In [None]:
# Number of scored $FLOKI tweets whose author's English bot score exceeds 0.7.
tweet_with_bot_score_df.query('cashtag == "floki" and eng > 0.7').shape[0]

In [None]:
# Number of scored $AAPL tweets whose author's English bot score exceeds 0.7.
tweet_with_bot_score_df.query('cashtag == "aapl" and eng > 0.7').shape[0]

In [None]:
# Share of tweets per cashtag whose author's English bot score exceeds the threshold.
threshold = 0.5

# The denominator is the number of collected tweets per cashtag, read from the
# data rather than hard-coded. The numerator comes from the merged frame, so
# tweets without a bot score count as "not above the threshold".
tweets_per_cashtag = tweet_info_df['cashtag'].value_counts()
bot_pct = [
    len(tweet_with_bot_score_df.query(f'cashtag == "{key}" and eng > {threshold}'))
    / tweets_per_cashtag[key]
    for key in ['shib', 'floki', 'aapl']
]

plt.figure(figsize=(5, 4.5))
plt.barh(
    [3, 2, 1],
    bot_pct,
    height=0.3,
    color=[light_blue, light_orange, light_gray]
)

# Write the percentage in white just inside the right end of each bar.
for i, pct in enumerate(bot_pct):
    plt.text(pct-0.06, 3 - i, f"{pct*100:.1f}%", color='white', ha='center', va='center')

plt.ylim([0.5, 3.5])
plt.xlim([0, 0.62])

# bot_pct holds fractions, so xmax=1 maps 1.0 to "100%".
plt.gca().xaxis.set_major_formatter(mtick.PercentFormatter(xmax=1))

plt.gca().grid(axis='x', alpha=0.2, linestyle='--', linewidth=1,)

plt.yticks([3, 2, 1], ['$SHIB', '$FLOKI', '$AAPL'])
plt.xlabel(f"Percentage of tweets from likely bots (>{threshold})")

plt.gca().spines['top'].set_visible(False)
plt.gca().spines['left'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['bottom'].set_visible(False)

plt.tight_layout()
plt.savefig("figures/bot_percent_05.pdf")

In [None]:
# Share of tweets per cashtag whose author's English bot score exceeds the threshold.
threshold = 0.7

# The denominator is the number of collected tweets per cashtag, read from the
# data rather than hard-coded. The numerator comes from the merged frame, so
# tweets without a bot score count as "not above the threshold".
tweets_per_cashtag = tweet_info_df['cashtag'].value_counts()
bot_pct = [
    len(tweet_with_bot_score_df.query(f'cashtag == "{key}" and eng > {threshold}'))
    / tweets_per_cashtag[key]
    for key in ['shib', 'floki', 'aapl']
]

plt.figure(figsize=(5, 4.5))
plt.barh(
    [3, 2, 1],
    bot_pct,
    height=0.3,
    color=[light_blue, light_orange, light_gray]
)

# Write the percentage in white just inside the right end of each bar.
for i, pct in enumerate(bot_pct):
    plt.text(pct-0.04, 3 - i, f"{pct*100:.1f}%", color='white', ha='center', va='center')

plt.ylim([0.5, 3.5])
plt.xlim([0, 0.41])

# bot_pct holds fractions, so xmax=1 maps 1.0 to "100%".
plt.gca().xaxis.set_major_formatter(mtick.PercentFormatter(xmax=1, decimals=0))

plt.gca().grid(axis='x', alpha=0.2, linestyle='--', linewidth=1,)

plt.yticks([3, 2, 1], ['$SHIB', '$FLOKI', '$AAPL'])
plt.xlabel(f"Percentage of tweets from likely bots (>{threshold})")

plt.gca().spines['top'].set_visible(False)
plt.gca().spines['left'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['bottom'].set_visible(False)

plt.tight_layout()
plt.savefig("figures/bot_percent_07.pdf")