# How to gather and visualize hashtag co-occurence data from Tweets with Twitter API v2

## Setup

In this first section, we import the necessary packages and define functions that we will use later.

NOTE: `searchtweets-v2` package must be installed

In [1]:
import os
import json
import time
from collections import defaultdict

import requests
import numpy as np
import pandas as pd
from tqdm import tqdm
import networkx as nx
from datetime import datetime

import plotly.graph_objects as go

import matplotlib.pyplot as plt
import matplotlib.colors
cmap = plt.cm.cividis

from searchtweets import gen_request_parameters, load_credentials, collect_results

In [2]:
# See documentation at https://github.com/twitterdev/search-tweets-python
# for info on credential handling
search_args = load_credentials()

cannot read file ~/.twitter_keys.yaml
Error parsing YAML file; searching for valid environment variables


## Querying Tweets from the API with `searchtweets-v2`

In this section, we craft the query and the information we want returned from the API. Here, we pick two parenting and COVID-related hashtags and time periods from Nov 2021 - Feb 2022. However, the query can be changed to query any topic of interest.

For more info on constructing queries, see [here](https://developer.twitter.com/en/docs/twitter-api/tweets/search/integrate/build-a-query)

For more info on fields available for Tweets, see [here](https://developer.twitter.com/en/docs/twitter-api/fields).

Keep in mind that any crafted query and subsequent data handling must comply with the [Twitter Developer Terms](https://developer.twitter.com/en/developer-terms/agreement-and-policy)

Also note that if you do not have Academic Access to the API, you should use the `recent` endpoint (see how to specify endpoint when loading credentials in the cell above) as you will not have access to historical Tweets

In [3]:
start_dates = ["2021-11-01", "2021-11-16", "2021-12-01", "2021-12-16", "2022-01-01", "2022-01-16", "2022-02-01", "2022-02-15"]
end_dates = ["2021-11-15", "2021-11-30", "2021-12-15", "2021-12-31", "2022-01-15", "2022-01-31", "2022-02-14", "2022-02-28"]

search_query = "has:hashtags lang:en -is:retweet (#teamreality OR #urgencyofnormal)"

tweet_fields="id,text,author_id,created_at,geo,lang,possibly_sensitive,public_metrics,entities"
user_fields="verified"
result_limit = 500
max_tweets = 10000

In [4]:
all_df = [None for _ in start_dates]

for i_date, (start_date, end_date) in enumerate(zip(start_dates, end_dates)):
    print(f"Doing time frame from {start_date} to {end_date}")
    rule = gen_request_parameters(
        search_query,
        granularity=None,
        results_per_call=result_limit,
        start_time=start_date,
        end_time=end_date,
        tweet_fields=tweet_fields,
        user_fields=user_fields,
        stringify=False
    )
    
    tweets = collect_results(rule, result_stream_args=search_args, max_tweets=max_tweets)
    
    n_tweets = 0

    for chunk in tweets:
        n_tweets += chunk['meta']['result_count']

    print("Total Tweets:", n_tweets)
    
    data_dict = {
        'tweet_id': [None for _ in range(n_tweets)],
        'author_id': [None for _ in range(n_tweets)],
        'tweet_text': [None for _ in range(n_tweets)],
        'hashtags': [None for _ in range(n_tweets)],
        'entities': [None for _ in range(n_tweets)],
        'possibly_sensitive_link': [None for _ in range(n_tweets)],
        'n_retweets': [None for _ in range(n_tweets)],
        'n_replies': [None for _ in range(n_tweets)],
        'n_likes': [None for _ in range(n_tweets)],
        'n_quotes': [None for _ in range(n_tweets)]
    }

    prev_tweets = 0

    for chunk in tqdm(tweets):
        for ind,tweet in enumerate(chunk["data"]):
            i = ind+prev_tweets
            data_dict['tweet_id'][i] = tweet['id']
            data_dict['author_id'][i] = tweet['author_id']
            data_dict['tweet_text'][i] = tweet['text']

            hashtags=""
            entities=""

            if "entities" in tweet:
                if "hashtags" in tweet['entities']:
                    for h in tweet['entities']['hashtags']:
                        hashtags += f"{h['tag']},"

                if "annotations" in tweet["entities"]:
                    for e in tweet['entities']['annotations']:
                        entities += f"{e['normalized_text']},"

                if "mentions" in tweet['entities']:
                    for m in tweet['entities']['mentions']:
                        entities += f"{m['username']},"

            data_dict["hashtags"][i] = hashtags
            data_dict["entities"][i] = entities

            if "possibly_sensitive" in tweet:
                data_dict['possibly_sensitive_link'][i] = tweet["possibly_sensitive"]
            else:
                data_dict['possibly_sensitive_link'][i] = False

            data_dict["n_retweets"][i] = tweet['public_metrics']['retweet_count']
            data_dict["n_replies"][i] = tweet['public_metrics']['reply_count']
            data_dict["n_likes"][i] = tweet['public_metrics']['like_count']
            data_dict["n_quotes"][i] = tweet['public_metrics']['quote_count']
        prev_tweets += chunk['meta']['result_count']
        
    curr_df = pd.DataFrame(data_dict)
    curr_df['start'] = start_date
    curr_df['end'] = end_date
    all_df[i_date] = curr_df
    time.sleep(1)

Doing time frame from 2021-11-01 to 2021-11-15
Total Tweets: 42


100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 3146.51it/s]


Doing time frame from 2021-11-16 to 2021-11-30
Total Tweets: 43


100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 4144.57it/s]


Doing time frame from 2021-12-01 to 2021-12-15
Total Tweets: 163


100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 947.01it/s]


Doing time frame from 2021-12-16 to 2021-12-31
Total Tweets: 209


100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 971.13it/s]


Doing time frame from 2022-01-01 to 2022-01-15
Total Tweets: 310


100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 521.16it/s]


Doing time frame from 2022-01-16 to 2022-01-31
Total Tweets: 1332


100%|████████████████████████████████████████████| 3/3 [00:00<00:00, 483.29it/s]


Doing time frame from 2022-02-01 to 2022-02-14
Total Tweets: 2322


100%|████████████████████████████████████████████| 5/5 [00:00<00:00, 542.07it/s]


Doing time frame from 2022-02-15 to 2022-02-28
Total Tweets: 1281


100%|████████████████████████████████████████████| 3/3 [00:00<00:00, 558.57it/s]


In [5]:
df = pd.concat(all_df)

## Getting hashtag co-occurence from Tweets

Now that we have our Tweets nicely collected into a dataframe, let's extract an adjacency matrix and visualize it

In [6]:
def count_cs_list(in_list):
    if type(in_list) != str and np.isnan(in_list):
        return 0
    return len(in_list.split(',')) - 1

df['num_hashtags'] = df['hashtags'].apply(count_cs_list)
df['num_entities'] = df['entities'].apply(count_cs_list)

First, we select Tweets that have more than one hashtag

In [7]:
df_hashtags = df[df['num_hashtags'] > 1]

Next, we count all hashtag co-occurences

In [8]:
hashtag_counts = defaultdict(int)
hashtag_coinc = defaultdict(lambda: defaultdict(int))
    
def increment_hashtags(row):
    # NOTE: this replacement is specific to the query we are doing
    # as we want to collapse covid19, covid_19, covid, etc. to 
    # the same hashtag. You should modify this for your query
    hashtags = sorted(row.lower().replace("_", "").replace("19", "").split(',')[:-1])
    n_hash = len(hashtags)
    for i in range(n_hash):
        hashtag_counts[hashtags[i]] += 1
        for j in range(n_hash):
            hashtag_coinc[hashtags[i]][hashtags[j]] += 1

df_hashtags['hashtags'].apply(increment_hashtags)

ct_series = pd.Series(hashtag_counts)
frequent_hashes = ct_series[ct_series >= 10].index
all_hash_to_use = list(set(frequent_hashes))

We then do the same thing for each timeframe individually

In [9]:
grpd = df_hashtags.groupby('start')

data_dict = {
    "start": [],
    "end": [],
    "tweet_count": [],
    "first_hashtag": [],
    "second_hashtag": []
}

for start, grp_df in grpd:
    hashtag_counts_local = defaultdict(int)
    hashtag_coinc_local = defaultdict(lambda: defaultdict(int))
    
    def increment_hashtags_local(row):
        # NOTE: this replacement is specific to the query we are doing
        # as we want to collapse covid19, covid_19, covid, etc. to 
        # the same hashtag. You should modify this for your query
        hashtags = sorted(row.lower().replace("_", "").replace("19", "").split(',')[:-1])
        n_hash = len(hashtags)
        for i in range(n_hash):
            hashtag_counts_local[hashtags[i]] += 1
            for j in range(n_hash):
                hashtag_coinc_local[hashtags[i]][hashtags[j]] += 1
    
    grp_df['hashtags'].apply(increment_hashtags_local)
    
    for i in range(len(all_hash_to_use)):
        first_hash = all_hash_to_use[i]
        for j in range(len(all_hash_to_use)):
            second_hash = all_hash_to_use[j]
            data_dict["start"].append(start)
            data_dict["end"].append(grp_df.iloc[0]['end'])
            data_dict["first_hashtag"].append(first_hash)
            data_dict["second_hashtag"].append(second_hash)
            if i == j:
                data_dict["tweet_count"].append(hashtag_counts_local[first_hash])
            else:
                data_dict["tweet_count"].append(hashtag_coinc_local[first_hash][second_hash])

adj_df = pd.DataFrame(data_dict)

In [10]:
adj_df

Unnamed: 0,start,end,tweet_count,first_hashtag,second_hashtag
0,2021-11-01,2021-11-15,0,greatbarringtondeclaration,greatbarringtondeclaration
1,2021-11-01,2021-11-15,0,greatbarringtondeclaration,longcovidkids
2,2021-11-01,2021-11-15,0,greatbarringtondeclaration,donoharm
3,2021-11-01,2021-11-15,0,greatbarringtondeclaration,singleissuevoter
4,2021-11-01,2021-11-15,0,greatbarringtondeclaration,keepschoolsopen
...,...,...,...,...,...
19203,2022-02-15,2022-02-28,0,urgencyofequity,rationalground
19204,2022-02-15,2022-02-28,2,urgencyofequity,covid
19205,2022-02-15,2022-02-28,0,urgencyofequity,unmaskourkids
19206,2022-02-15,2022-02-28,0,urgencyofequity,pandemic


Now that we have our adjacency matrix, let's visualize it!

## Visualizing hashtag co-occurence

First, we do some preliminary vis setup. The template below and some of the visualization choices are chosen for aesthetic purposes, but it can be modified as you see fit.

In [18]:
template = go.layout.Template({
  'layout': {
    'font': {'family': 'Helvetica Neue, sans-serif'},
    'xaxis': {'color': '#aab8c2',
              'title': {'font': {'color': '#657786', 'family': 'Helvetica Neue, sans-serif'}}},
    'yaxis': {'color': '#aab8c2',
              'title': {'font': {'color': '#657786', 'family': 'Helvetica Neue, sans-serif'}}}
  }
})

In [12]:
data = adj_df

In [21]:
COLOR_LEVELS = [0.2, 0.1, 0.05, 0.01, 0]
COLOR_LEGEND = ['> 20%', '10% - 20%', '5% - 10%', '1% - 5%', '< 1%']
COLOR_RAMP = ['#F91880', '#E94494', '#D971A9', '#C99DBE', '#B9CAD3'] # Gradient colors Gray - Magenta: highest contrast!

def edge_color_discrete(val):
  for index, level in enumerate(COLOR_LEVELS):
    if val >= level:
      return {
        'id': index,
        'legend': COLOR_LEGEND[index],
        'color': COLOR_RAMP[index]
      }
  return {
    'id': len(COLOR_LEGEND) - 1,
    'legend': COLOR_LEGEND[-1],
    'color': COLOR_RAMP[-1]
  }

def get_coocurrence_score(intersect_weight, node1_weight, node2_weight):
  denominator = intersect_weight + node1_weight + node2_weight
  return intersect_weight / (intersect_weight + node1_weight + node2_weight) if denominator != 0 else 0

def weight_to_width_calc(weight, max_weight):
  """
  Calculate normalized line/stroke width based on weights.
  Return a number.
  """
  return 50 * np.sqrt(weight / max_weight)

def format_node_tooltip_str(node, graph):
  """
  Construct a string for the node tooltip.
  Return a str.
  """  
  graph_copy = graph.copy()
  graph_copy.remove_edges_from(nx.selfloop_edges(graph_copy))
  adj_edges = list(graph_copy.edges(node, data=True))
  tooltip_node_str = f"<b> {node} </b>: {graph[node][node]['tweet_count']:.0f} <br><br><b>Co-occurrences:</b><br>" if node in graph[node] else ''
  edges_processed =sorted(graph_copy.edges(node, data=True), key=lambda t: t[2].get('tweet_count', 1), reverse=True)
  edges_str = ['+ ' + e[1] + ': ' + str(e[2]['tweet_count']) + ' (' + "{:.1%}".format(get_coocurrence_score(e[2]['tweet_count'],
                                                                                                            graph[e[1]][e[1]]['tweet_count'] if e[1] in graph[e[1]] else 0, 
                                                                                                            graph[e[0]][e[0]]['tweet_count'] if e[0] in graph[e[0]] else 0
                                                                                                           )) + ')' for i, e in enumerate(edges_processed)]
  tooltip_edges_str = '<br>'.join(edges_str)
  
  return tooltip_node_str + tooltip_edges_str

def prepare_graph_vis(graph, max_weight, node_pos):
  """
  Prepare the trace data for drawing a Plotly graph vis based on an adjacency matrix.
  Return a list of traces including NxN line traces for edges and one scatter trace for nodes.
  """
  node_x = []
  node_y = []
  node_tooltip = []
  node_weight = []
  traces = []

  # Prepare the edge traces
  line_trace_map = {}
  for index1, node1 in enumerate(graph.nodes()):
    for index2, node2 in enumerate(graph.nodes()):
      if index1 >= index2 or node1 == node2:
        continue
      x0, y0 = node_pos[node1]
      x1, y1 = node_pos[node2]
      if node2 in graph[node1]:
        weight = graph[node1][node2]['tweet_count']
      elif node1 in graph[node2]:
        weight = graph[node2][node1]['tweet_count']
      else:
        weight = 0
      weight_node1 = graph[node1][node1]['tweet_count'] if node1 in graph[node1] else 0
      weight_node2 = graph[node2][node2]['tweet_count'] if node2 in graph[node2] else 0
      coocurrence_score = get_coocurrence_score(weight, weight_node1, weight_node2)
      color_level = edge_color_discrete(coocurrence_score)
      traces.append(
        go.Scatter(
          name=color_level['legend'],
          legendgroup=f"group{color_level['id']}",
          showlegend=False,
          x=[x0, x1],
          y=[y0, y1],
          hoverinfo='none',
          line=dict(
            width=weight_to_width_calc(weight, max_weight),
            color= color_level['color']
          ),
          mode='lines',
        )
      )

  # Add the color legend by adding invisible traces (Warning: super hacky)
  for index, legend_level in enumerate(COLOR_LEGEND):
    traces.append(
      go.Scatter(
        x=[0.5], y=[0.5],
        name=legend_level,
        legendgroup=f"group{index}",
        hoverinfo='none',
        showlegend=True,
        mode='markers',
        marker=dict(
          color=COLOR_RAMP[index],
          symbol='square',
          # size=0
        )
      )
    )
  
  # Prepare the node trace
  for node in graph.nodes():
    x, y = node_pos[node]
    weight = graph[node][node]['tweet_count'] if node in graph[node] else 0
    node_x.append(x)
    node_y.append(y)
    node_tooltip.append(format_node_tooltip_str(node, graph))
    node_weight.append(weight_to_width_calc(weight, max_weight))

  node_trace = go.Scatter(
    name='nodes',
    x=node_x, y=node_y,
    mode='markers',
    text=node_tooltip,
    textposition='top right',
    textfont=dict(size=15, color='#0F1419'),
    hoverinfo='text',
    marker=dict(
      size=[x + 15 if x > 0 else 0 for x in node_weight],
      line_width=node_weight,
      line_color='#829AAB',
      color='#6BC9FB'
    ),
    showlegend=False,
  )
  traces.append(node_trace)
  return traces

def draw_timeline_graph_vis(data, time_col, frequency_threshold=0, network_layout='shell'):
  """
  Draw a timeline graph vis.
  Arguments:
    data: pd.DataFrame
    time_col: str
  Return void.
  """
  dates = []

  graphs = []
  max_weight = max(data['tweet_count'])

  # Initialize the frames & slider steps (for all dates) for the graph vis
  frames = []
  slider_steps = []

  # Prepare node-related info (just one time, iteration not needed)
  graph_overall = nx.from_pandas_edgelist(data.query(f"tweet_count > {frequency_threshold}"), 
                                   source='first_hashtag', 
                                   target='second_hashtag', 
                                   edge_attr=['tweet_count'])
  
  if network_layout == 'shell':
    node_pos = nx.shell_layout(graph_overall)
  elif network_layout == 'force_directed':
    node_pos = nx.nx_pydot.graphviz_layout(graph_overall)
  elif network_layout == 'spring':
    node_pos = nx.spring_layout(graph_overall)
  node_labels = [dict(x=node_pos[node][0],
                      y=node_pos[node][1],
                      text=node,
                      showarrow=False,
                      xanchor='center',
                      font=dict(
                        size=16,
                      )
                     ) for node in graph_overall.nodes()]

  data_group_by_day = data.groupby(time_col)
  for start_date, group in data_group_by_day:
    date = start_date
    dates.append(date)
    graph = nx.from_pandas_edgelist(group.query(f"tweet_count > {frequency_threshold}"), 
                                   source='first_hashtag', 
                                   target='second_hashtag', 
                                   edge_attr=['tweet_count'])
    graph_sorted_nodes = nx.Graph()
    graph_sorted_nodes.add_nodes_from(sorted(graph_overall.nodes(data=True)))
    graph_sorted_nodes.add_edges_from(graph.edges(data=True))
    graphs.append(graph_sorted_nodes)

  # Iterate through the dates and prepare every frame
  for index, date in enumerate(dates):
    # Prepare the frames
    data_traces = prepare_graph_vis(
      graphs[index],
      max_weight,
      node_pos)
    frames.append({
      "data": data_traces,
      "name": date
    })

    # Prepare the slider steps
    slider_steps.append(
      {"args": [
          [date],
          {"frame": {"duration": 700, "redraw": False},
           "mode": "immediate",
           "transition": {"duration": 300}}],
        "label": date,
        "method": "animate"
    })

  # Set the layout variables
  slider_layout = {
    "active": 0,
    "yanchor": "top",
    "xanchor": "left",
    "currentvalue": {
        "font": {"size": 20},
        "prefix": "Date: ",
        "visible": True,
        "xanchor": "right"
    },
    "transition": {"duration": 300, "easing": "cubic-in-out"},
    "pad": {"b": 10, "t": 50},
    "len": 0.9,
    "x": 0.1,
    "y": 0,
    "steps": slider_steps
  }

  updatemenu_layout = {
    "buttons": [
      {
        "args": [None, {"frame": {"duration": 700, "redraw": False},
                        "fromcurrent": True, "transition": {"duration": 300,
                                                            "easing": "quadratic-in-out"}}],
        "label": "Play",
        "method": "animate"},
      {
        "args": [[None], {"frame": {"duration": 0, "redraw": False},
                          "mode": "immediate",
                          "transition": {"duration": 0}}],
        "label": "Pause",
        "method": "animate"}
      ],
    "direction": "left",
    "pad": {"r": 10, "t": 87},
    "showactive": False,
    "type": "buttons",
    "x": 0.1,
    "xanchor": "right",
    "y": 0,
    "yanchor": "top"
  }

  # Draw the entire figure
  fig = go.Figure(
    data=frames[0]['data'],
    frames=frames,
    layout=go.Layout(
      sliders=[slider_layout],
      annotations=node_labels,
      title='Which two tags often appear together?',
      titlefont_size=20,
      legend=dict(
        itemsizing='constant',
        title_text='Co-ocurrence Index',
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.01
      ),
      hovermode='closest',
      margin=dict(b=20,l=5,r=5,t=50),
      xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
      yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
      template=template,
      width=1000, height=1000,
      updatemenus=[updatemenu_layout]
    ),

  )
  fig.show()
  return fig

In [22]:
fig = draw_timeline_graph_vis(data, time_col='start', frequency_threshold=0, network_layout='force_directed')