In [91]:
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
from plotly.subplots import make_subplots

Read the dataframe. Then, convert the date column to datetime format and the number of total engagements and number of tweets column to integer format.

In [92]:
df_news_articles_stats = pd.read_csv("data/news_articles_stats.csv")
df_news_articles_stats["Date"]= pd.to_datetime(df_news_articles_stats["Date"])
df_news_articles_stats["Total number of engagements"] = pd.to_numeric(df_news_articles_stats["Total number of engagements"])

df_number_of_tweets = pd.read_csv("data/number_of_tweets.csv")
df_number_of_tweets["Date"]= pd.to_datetime(df_number_of_tweets["Date"])
df_number_of_tweets["Number of tweets"] = pd.to_numeric(df_number_of_tweets["Number of tweets"])

Checking the data types.

In [93]:
df_news_articles_stats.info()
df_number_of_tweets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 14 columns):
 #   Column                                 Non-Null Count  Dtype         
---  ------                                 --------------  -----         
 0   Date                                   10 non-null     datetime64[ns]
 1   News agency                            10 non-null     object        
 2   Color                                  9 non-null      object        
 3   Article title                          10 non-null     object        
 4   Notes                                  2 non-null      object        
 5   Number of Facebook engagements         7 non-null      object        
 6   Number of Twitter shares               7 non-null      object        
 7   Number of Twitter retweets and linkes  1 non-null      float64       
 8   Number of Reddit engagements           6 non-null      float64       
 9   Youtube views                          1 non-null      float64      

Create a diagram of the reach of news reportings and notable Twitter posts.

In [94]:
fig = go.Figure(data=[go.Scatter(
    x = df_news_articles_stats["Date"], 
    y = df_news_articles_stats["y"],
    name = "Reach of news articles and posts",
    mode = 'markers',
    text = df_news_articles_stats["News agency"],
    marker = dict(
        color = df_news_articles_stats["Color"],
        size = df_news_articles_stats["Total number of engagements"],
        sizemode = 'area',
        sizeref = 2.*max(df_news_articles_stats["Total number of engagements"])/(40.**2),
        sizemin = 4
    ),
    hovertemplate = "News agency: %{text} <br>Reach: %{marker.size} <extra></extra>"

)])

fig.update_yaxes(visible=False)
fig.update_layout(paper_bgcolor="rgb(0,0,0,0)")
fig.update_layout(plot_bgcolor="rgb(0,0,0,0)")
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='grey')
fig.show()
fig.write_html("html/reach.html", include_plotlyjs="cdn", full_html = False)

Create a combined reach of news articles and number of tweets plot.

In [95]:
trace1 = go.Scatter(
    x = df_news_articles_stats["Date"], 
    y = df_news_articles_stats["y"],
    name = "Reach of news articles and posts",
    mode = 'markers',
    text = df_news_articles_stats["News agency"],
    marker = dict(
        color = df_news_articles_stats["Color"],
        size = df_news_articles_stats["Total number of engagements"],
        sizemode = 'area',
        sizeref = 2.*max(df_news_articles_stats["Total number of engagements"])/(40.**2),
        sizemin = 4
    ),
    hovertemplate = "News agency: %{text} <br>Reach: %{marker.size} <extra></extra>"
)


trace2 = go.Scatter(
    x = df_number_of_tweets["Date"], 
    y = df_number_of_tweets["Number of tweets"],
    marker_color="rgba(152, 0, 0, .8)",
    name = "Number of tweets"
)

fig = go.Figure()
fig.add_trace(trace1)
fig.add_trace(trace2)
fig.update_layout(
    hoverlabel_align = 'right')
fig.update_layout(hovermode="x unified")
fig.show()

In [96]:
fig.write_html("html/combined.html", include_plotlyjs="cdn", full_html = False)

Next, the number of unique accounts posting about the topic each day is counted. On days where there was a surge of tweets, the data only returns tweets for up to a certain time instead of the full day. Due to these limitations, the tweet data will be combined with the number of tweets dataset to estimate the number of unique accounts posting on that day. 

In [97]:
df_number_of_tweets["Date"] = df_number_of_tweets["Date"].astype(str)
df_number_of_tweets[['Day','Time']] = df_number_of_tweets.Date.str.split(expand=True)

df_number_of_tweets.head()


Unnamed: 0,Date,Number of tweets,Day,Time
0,2022-11-20 10:00:00,250,2022-11-20,10:00:00
1,2022-11-20 11:00:00,1053,2022-11-20,11:00:00
2,2022-11-20 12:00:00,949,2022-11-20,12:00:00
3,2022-11-20 13:00:00,879,2022-11-20,13:00:00
4,2022-11-20 14:00:00,711,2022-11-20,14:00:00


In [98]:
df_tweets_by_day = df_number_of_tweets.groupby("Day")["Number of tweets"].sum().reset_index(name='Number of tweets')

df_tweets_by_day.head(8)

Unnamed: 0,Day,Number of tweets
0,2022-11-20,9181
1,2022-11-21,131370
2,2022-11-22,99559
3,2022-11-23,44112
4,2022-11-24,45116
5,2022-11-25,33825
6,2022-11-26,15376
7,2022-11-27,2340


Read the tweet data to dataframes. Like for the wordclouds, only the data with tweets from accounts with over 20 followers is kept to get rid of spam.

In [99]:
dataframe = {}

for date in range(20, 28):
    file = "data/" + str(date) +"_nov_archive.csv"
    df = pd.read_csv(file)
    key = str("df_" + str(date) + "_nov")
    dataframe[key] = df

for date in dataframe:
    dataframe[date] = dataframe[date][dataframe[date]["user_followers_count"] > 49] 

Using the 'df_number_of_tweets_by_day' dataframe, add a column with the time of last tweet in tweet dataset and number of unique users. The dataset is sorted from descending time, so 


In [100]:
print(dataframe["df_20_nov"]["from_user"].unique().size)

6624


In [101]:
time_of_last_tweet = []
number_of_unique_users_in_dataset = []

for date in dataframe:
    df = dataframe[date]
    df[["Day","Time"]] = df.time.str.split(expand=True)
    time_of_last_tweet.append(df.iloc[-1]["Time"])
    number_of_unique_users_in_dataset.append(df["from_user"].unique().size)

df_tweets_by_day["Time of last tweet"] = time_of_last_tweet
df_tweets_by_day["Number of unique users"] = number_of_unique_users_in_dataset

df_tweets_by_day.head(8)

Unnamed: 0,Day,Number of tweets,Time of last tweet,Number of unique users
0,2022-11-20,9181,10:00:16,6624
1,2022-11-21,131370,21:48:01,8964
2,2022-11-22,99559,19:00:30,9025
3,2022-11-23,44112,14:44:55,10482
4,2022-11-24,45116,13:10:21,9995
5,2022-11-25,33825,08:38:08,9177
6,2022-11-26,15376,00:15:19,8385
7,2022-11-27,2340,00:00:10,1426


Only the data for 20 and 27 November is complete. So the number of seconds that we have data for will be calculated. Then, the number of tweets per second and number of unique users per second is calcuated to get the number of unique users per tweet.

In [102]:

df_tweets_by_day["Time of last tweet"]= pd.to_timedelta(df_tweets_by_day["Time of last tweet"])
df_tweets_by_day["Seconds"] = df_tweets_by_day["Time of last tweet"].dt.total_seconds()

df_tweets_by_day["Tweets per second"] = df_tweets_by_day["Number of tweets"]/(24*60*60)
df_tweets_by_day["Unique users per second"] = df_tweets_by_day["Number of unique users"]/df_tweets_by_day["Seconds"]
df_tweets_by_day["Total unique users"] = (df_tweets_by_day["Unique users per second"]/df_tweets_by_day["Tweets per second"])*(df_tweets_by_day["Number of tweets"])

df_tweets_by_day.loc[0, "Total unique users"] = df_tweets_by_day.loc[0, "Number of unique users"]
df_tweets_by_day.loc[7, "Total unique users"] = df_tweets_by_day.loc[7, "Number of unique users"]

print(df_tweets_by_day)

          Day  Number of tweets Time of last tweet  Number of unique users  \
0  2022-11-20              9181    0 days 10:00:16                    6624   
1  2022-11-21            131370    0 days 21:48:01                    8964   
2  2022-11-22             99559    0 days 19:00:30                    9025   
3  2022-11-23             44112    0 days 14:44:55                   10482   
4  2022-11-24             45116    0 days 13:10:21                    9995   
5  2022-11-25             33825    0 days 08:38:08                    9177   
6  2022-11-26             15376    0 days 00:15:19                    8385   
7  2022-11-27              2340    0 days 00:00:10                    1426   

   Seconds  Tweets per second  Unique users per second  Total unique users  
0  36016.0           0.106262                 0.183918         6624.000000  
1  78481.0           1.520486                 0.114219         9868.498108  
2  68430.0           1.152303                 0.131887        1139