In [1]:
import pandas as pd
import numpy as np

# Reading JSON
We saved the json as records (i.e., each row is a separate json object) remember?
So we set `lines = True` to signal this to pandas.

In [30]:
# Load the tweets saved by the previous notebook (one JSON record per line).
# Note: the following may take over a minute
try:
    df = pd.read_json('data/twitter_data_chatgpt_v2.json.bz2', lines=True)
    df = df.set_index('id')
except (OSError, ValueError, KeyError):
    # Only fall back for genuine loading problems:
    #   OSError    - file missing or unreadable (FileNotFoundError is a subclass)
    #   ValueError - content could not be parsed as JSON
    #   KeyError   - the file has no `id` column to index on
    # A bare `except:` would also hide typos and swallow a KeyboardInterrupt.
    print('You did not run the previous notebook! Reading the csv')
    df = pd.read_csv('data/twitter_data_chatgpt.csv.bz2')
    from helpers.analysis_preprocessor import preprocess  # imports the preprocess from helpers/analysis_preprocessor.py
    # NOTE(review): presumably preprocess() returns a frame shaped like the JSON one
    # (indexed by id, with engagement_count/hashtags) - confirm against the helper.
    df = preprocess(df)

# Order the rows by index and display the frame
df = df.sort_index()
df

Unnamed: 0_level_0,created_at,text,username,like_count,retweet_count,engagement_count,hashtags
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1610535734758219778,2023-01-04 07:16:56+00:00,I used chat gpt to get gym workout program and...,pnik91,0,0,0,[]
1610535786017091584,2023-01-04 07:17:08+00:00,I'm quite amazed by Chat GPT. A really promisi...,manumurali369,1,0,1,[]
1610535837363486720,2023-01-04 07:17:20+00:00,all my twitter feed is about ChatGPT and @Open...,mcp350,3,1,4,[#ChatGPT]
1610535961670172674,2023-01-04 07:17:50+00:00,#ChatGPT So much #Censorship. Never trust a...,TryingToOffend,2,0,2,"[#ChatGPT, #Censorship.\n\nNever]"
1610536038094757888,2023-01-04 07:18:08+00:00,@GoogleAI #LAMDA Versus @OpenAI #ChatGPT ?! Wh...,Pup_In_Cup,1,0,1,"[#LAMDA, #ChatGPT]"
...,...,...,...,...,...,...,...
1641213003260633088,2023-03-29 22:57:26+00:00,Most people haven't heard of Chat GPT yet. Fir...,nikocosmonaut,0,0,0,[]
1641213110915571715,2023-03-29 22:57:52+00:00,"AI muses: ""In the court of life, we must all f...",ChatGPT_Thinks,0,0,0,"[#OutOfContextAI, #AILifeLessons, #ChatGPT]"
1641213115684536323,2023-03-29 22:57:53+00:00,https://t.co/FjJSprt0te - Chat with any PDF! C...,yjleon1976,0,0,0,"[#chatpdf, #ChatGPT]"
1641213218520481805,2023-03-29 22:58:18+00:00,@MecoleHardman4 Chat GPT says it’s 15. 😂,AmyLouWho321,0,0,0,[]


# Handling dates
Pandas uses the datetime64 data type to store date-time information efficiently. It provides rich functionality through .dt accessor and DatetimeIndex.

If a column that represents a date is in a correct and Pandas-recognizable format, converting that column to a datetime is straightforward:

In [3]:
# Parse `created_at` into pandas datetimes.
# utc=True guarantees a timezone-aware UTC column whatever form the values arrived in
# (already parsed, strings, or tz-naive). The cells below rely on that: `.dt.tz_convert`
# and subtracting a tz-aware reference date both fail on tz-naive data.
df['created_at'] = pd.to_datetime(df['created_at'], utc=True)

Some data sources include timezone information in their date-time fields. For instance, Twitter/X (as well as Reddit and YouTube) store datetime data in UTC (which is a fancy name for GMT). 

This means that if a user tweets from Japan at 11:00 AM Japan Standard Time (which is UTC+9), the timestamp will appear as 2:00 AM UTC—which can be confusing. If you're working with data from a specific country or region, it's a good idea to convert the timestamps to the local timezone using `tz_convert`.

In [None]:
# Show the timestamps expressed in Japanese local time (the result is only displayed, not stored)
df["created_at"].dt.tz_convert(tz="Asia/Tokyo")

Use .dt accessor to access each date component: `dt.date, dt.year, dt.month, dt.day, dt.hour, dt.day_name()`

Let's create new columns to represent the month and the day of the week, and hour of the day each tweet was created

In [5]:
# Derive calendar components from the tweet timestamp via the .dt accessor
tweet_time = df['created_at'].dt
df['month'] = tweet_time.month              # month number
df['day_of_week'] = tweet_time.day_name()   # weekday name, e.g. 'Monday'
df['hour'] = tweet_time.hour                # hour of the day

You can filter on datetime columns:

In [None]:
# Filter tweets from January 2023
# (check the year as well as the month, so a January from another year can never slip in)
jan_2023_tweets = df[(df['created_at'].dt.year == 2023) & (df['created_at'].dt.month == 1)]
print(f"Number of tweets from January: {len(jan_2023_tweets)}")

# Filter tweets from weekends
weekend_tweets = df[df['day_of_week'].isin(['Saturday', 'Sunday'])]
print(f"Number of tweets from weekends: {len(weekend_tweets)}")

# Filter tweets from business hours (9 AM to 5 PM)
# `hour` is the whole hour of the timestamp, so hour == 17 means 17:00-17:59.
# The window 09:00-16:59 therefore needs a strict `< 17`; `<= 17` would run until 6 PM.
# `hour` is taken from `created_at` as stored, i.e. in that column's timezone.
business_hours_tweets = df[(df['hour'] >= 9) & (df['hour'] < 17)]
print(f"Number of tweets during business hours: {len(business_hours_tweets)}")

# Filter tweets after 1st of March 2023
# (the string is parsed as midnight at the start of March 1st, so March 1st itself is included)
after_first_of_march = df[df['created_at'] > '2023-03-01']
print(f"Number of tweets after the first of March: {len(after_first_of_march)}")


You can group by a specific date component, e.g., group by the name of the day and show the size of each group, which gives the number of tweets posted on each day of the week.

In [None]:
# Number of tweets for each weekday name (the groups are keyed by the day name of `created_at`)
print(df.groupby(df['created_at'].dt.day_name()).size())


You can compute the time difference between the datetime and a predetermined date

In [None]:
# Display the timestamp column
df['created_at']

In [None]:
# Example: how much time lies between each tweet and a fixed reference moment.
# The reference is timezone-aware (UTC), like the tweet timestamps it is subtracted from.
reference_date = pd.Timestamp('2023-01-01', tz='UTC')
print('Time passed since 2023 new years:')
df['created_at'] - reference_date

Alternatively, you can subtract between consecutive rows using `diff()`

In [None]:
# Time elapsed between each tweet and the one in the row before it (first row has no predecessor)
df['created_at'].diff(periods=1)

## Resampling
Resampling is changing the frequency of your time series data. It is especially useful when your data is irregular, which is the case with our Twitter dataset. Tweets are stored as they are posted, and there is no guarantee that we will have at least one tweet for every day, hour, or a second (probably for Twitter we would have, but think about TruthSocial).

You resample when you aggregate the data over time, e.g., the count of tweets per day, per hour, per 3 hours, etc. Thus, first, you reshape the data so that the index is the datetime and the values show some aggregated statistic such as the count.

In [None]:
# Set the datetime column as index and return a new dataframe (df itself is left unchanged)
df_time = df.set_index('created_at')

# Resample data to different time frequencies
# 'D' (daily) and 'W' (weekly) are stable alias strings.
# The month-end and hourly aliases were renamed in pandas 2.2 ('M' -> 'ME', 'H' -> 'h');
# the old spellings are deprecated there and the new ones are unknown to older versions.
# An offset object such as pd.offsets.MonthEnd() (or pd.offsets.Hour()) means the same
# thing and is accepted by every version, so it is used for the monthly case.
daily_tweets = df_time.resample('D').size()
weekly_tweets = df_time.resample('W').size()
monthly_tweets = df_time.resample(pd.offsets.MonthEnd()).size()

# Print the resampled data
print("Daily tweet counts:")
print(daily_tweets.head())
print("\nWeekly tweet counts:")
print(weekly_tweets.head())
print("\nMonthly tweet counts:")
print(monthly_tweets.head())


Or, more custom frequencies and different aggregations over different columns

In [None]:
# 2-hour intervals
# pd.offsets.Hour(2) is the offset-object form of the '2H' alias. The string alias was
# renamed to '2h' in pandas 2.2 (the old spelling is deprecated), whereas the object
# works on old and new versions alike.
two_hour_tweets = df_time.resample(pd.offsets.Hour(2))['retweet_count'].mean()

# 3-day intervals
three_day_tweets = df_time.resample('3D')['like_count'].sum()

# Business days only (Monday-Friday)
business_day_tweets = df_time.resample('B').size()

# Print the resampled data
print("Mean retweet count in each 2-hour interval:")
print(two_hour_tweets.head())
print("\nTotal like count in 3-day intervals:")
print(three_day_tweets.head())
print("\nBusiness day tweet counts:")
print(business_day_tweets.head())


## Rolling Window Analysis
A rolling window allows us to calculate statistics over a sliding window of time. This is useful for smoothing out noise and identifying trends
We create a rolling window using .rolling() on a column (or a Series) with **DateTimeIndex**. 

In the code below we first resample the data to days (`.resample('D')`) to show the number of rows (tweets) using `.size()` per day
Then we create a rolling window of length 7 and length 30. See how the lines differ

In [None]:
# Tweets per calendar day: a Series indexed by date, one row per day
daily_tweets = df_time.resample('D').size()

# Trailing (non-centred) rolling averages over the last 7 and the last 30 daily counts
rolling_7d = daily_tweets.rolling(7).mean()
rolling_30d = daily_tweets.rolling(30).mean()

# Draw the raw daily counts first, then overlay both rolling means on the same axes
ax = daily_tweets.plot(figsize=(12, 6), alpha=0.5, label='Daily Counts', legend=True)
rolling_7d.plot(ax=ax, label='7-day Rolling Mean', linewidth=2, legend=True)
rolling_30d.plot(ax=ax, label='30-day Rolling Mean', linewidth=2, legend=True)

- The daily tweet counts (blue line) show high variability with many spikes.
- The 7-day rolling mean (orange line) smooths out daily fluctuations while still capturing weekly patterns
- The 30-day rolling mean (green line) shows the longer-term trend, smoothing out both daily and weekly variation. 
It reveals a more stable upward trend compared to the noisier daily data


# Basic Visualizations
Visualization is a crucial tool in data analysis to understand the data, identify patterns, relationships and communicate findings. Common visualization techniques and the questions they answer:

- **Line Plot**: How does a variable change over time?	
- **Histogram**: How are values distributed?
- **Bar Plot**: How do groups compare?
- **Box Plot**: How values are distributed AND groups compare (the Swiss army knife of plots)
- **Scatter**: How are two metrics related?
- **Pie Chart:** Don't use this crap

I will not cover this topic extensively here because:  1) this notebook is already too long 2) the topic itself is too long 3) pandas & matplotlib is not the only way to create plots. I actually used to use 3rd party software to create plots and now use AI.

Refer to the data visualization guide by the Royal Statistical Society if you wish to go deeper: https://royal-statistical-society.github.io/datavisguide/

Pandas, in fact, comes with built-in plotting functions that internally use Matplotlib, making it incredibly convenient to visualize data. Creating a plot can be as simple as calling df.plot() on a DataFrame.

Depending on the type of plot, you may need to provide additional parameters. 

In [None]:
# The default plot kind is a line plot, with the dataframe index on the x-axis
df.plot(kind='line')

#### Line Plot
Default plot when you call df.plot(). By default, it uses the index of the dataframe as the x-axis and draws one line per (numeric) column.

Our dataframe's index is id, which does not make sense for the line plot or any kind of plot as you see above, so we set the x-axis to be created_at. 

In [None]:
# Line plot of the first 1000 rows (selected by position), with the tweet timestamp on the x-axis
df.iloc[:1000].plot(x='created_at')

The line plot connects one point per tweet, because we have not aggregated the data (which is why I called it only on the first 1000 rows).

We set the index to be `created_at`, resample by day, show the total number of tweets

In [None]:
# Index by timestamp, bucket into calendar days, count the rows per bucket, and plot the counts
(
    df.set_index('created_at')
      .resample('D')
      .size()
      .plot(title='Tweet Count Over Time')
)

#### Histogram
Choose column to see its distribution of values. For instance, let's do retweet count:

In [None]:
# Histogram of the raw retweet counts
df['retweet_count'].plot.hist(
    figsize=(10, 6),
    title='Distribution of Retweet Counts',
)

Looks awful, doesn't it? The data clearly does not follow a normal distribution. Most tweets receive no retweets, leading to a large spike at zero. Meanwhile, a small number of tweets receive very high retweet counts—these are outliers. This results in a highly skewed distribution with a long tail to the right.

There are several common ways to deal with this kind of skewed data:

- Remove or cap outliers to focus on the bulk of the data.
- Manually define retweet count categories, such as 0, 1–10, 11–1000, and 1000+, to summarize the data more meaningfully.
- Redefine histogram bins to better capture the shape of the distribution.
- Apply a logarithmic scale to the x-axis and/or y-axis to compress the range and make the distribution more interpretable.



I usually prefer the last one:

In [None]:
# Histogram of log1p-transformed retweet counts, with a logarithmic y-axis.
# y ticks sit at the powers of ten 1 ... 1,000,000; x ticks at the integers 0 ... 10.
df['retweet_count'].apply(np.log1p).plot(
    kind='hist',
    logy=True,
    yticks=[10 ** exponent for exponent in range(7)],
    xticks=list(range(11)),
)

We used `apply(np.log1p)` to log-transform the retweet counts, i.e., to compute `ln(1 + x)` for each value. The highest retweet count in the dataset is 16,080 (approximately `e^9.7`), while the lowest values—0 and 1—are mapped to 0 and roughly 0.69 respectively, so both land in the first bin. So we could squeeze in the entire range of retweet counts to 0-10.   
(Note: since `ln(0)` is undefined, `np.log1p(0)` conveniently returns 0.)

We also applied a logarithmic scale to the **y-axis** (not to the actual values), so it increases exponentially (1, 10, 100, …) instead of linearly. So we won't see a tower of 10^6 and a tiny bar representing 10^5.     
Unfortunately, Pandas does not support setting a log scale on the **x-axis** directly, so we had to transform the x-values manually. (Try `logx=True` to see the abomination)

#### Bar Plot




Bar plot is good when you are comparing groups (i.e., categorical variables) over a single value (or multiple if you are using side-by-side bars.)

Essentially, what we did with histograms was automatically creating groups by binning.
But we could define categories of retweet counts manually and show their frequencies with a bar plot.
Below, we first define `bins` and name them with `labels`.
We create categories using the pandas function `cut`, which puts the values (`df['retweet_count']` in this case) into the bins you define.
We then compute frequencies using `value_counts`.
And then call `plot(kind='bar')`.

In [None]:
# Define retweet count categories.
# pd.cut builds right-closed intervals (lower, upper] by default. The first edge must
# therefore lie BELOW 0, otherwise tweets with zero retweets fall outside every bin
# (they become NaN and vanish from the counts) and the label '0' would actually hold
# the tweets with exactly 1 retweet. With the edges below the bins are:
#   (-1, 0] -> '0'   (0, 10] -> '1-10'   (10, 100] -> '11-100'
#   (100, 1000] -> '101-1000'   (1000, inf] -> '1000+'
bins = [-1, 0, 10, 100, 1000, float('inf')]
labels = ['0', '1-10', '11-100', '101-1000', '1000+']

# Create categories using pd.cut()
retweet_categories = pd.cut(df['retweet_count'], bins=bins, labels=labels)

# Count tweets in each category.
# sort=False keeps the categories in their defined (range) order instead of ordering
# them by frequency, so the bars read from '0' up to '1000+'.
retweet_counts = retweet_categories.value_counts(sort=False)

# Create bar plot
retweet_counts.plot(kind='bar',
                    figsize=(10, 6),
                    title='Distribution of Retweet Counts',
                    xlabel='Retweet Count Range',
                    ylabel='Number of Tweets')


You don't always need to define the categories manually. You can use any value as categories as long as it makes sense.    
For instance, let's see the number of tweets posted on each day of the week.    
Don't forget to aggregate!   

In [None]:
# Count how many tweets fall on each weekday name (most frequent day first)
weekday_names = df['created_at'].dt.day_name()
tweet_counts_per_day = weekday_names.value_counts()

# One bar per weekday
tweet_counts_per_day.plot.bar(title='Number of Tweets by Day of Week')

Most tweets are posted in mid-week and not on weekends where people have more free time. 
Not good for productivity!

#### Pie Chart

In [None]:
# A tiny two-row frame: one slice per answer
data = {
    'Category': ['NO', 'NO but in yellow'],
    'Count': [7, 3],
}
df_pie = pd.DataFrame(data)

# Pie chart of the `Count` column, labelled by category, with percentages printed on the slices
df_pie.plot.pie(
    y='Count',
    labels=df_pie['Category'],
    colors=['blue', 'yellow'],
    autopct='%1.1f%%',
    figsize=(8, 8),
    title='Should You Use Pie Charts?',
)


#### Box Plot

Box plots show a distribution (and compare multiple distributions) without aggregation (finally, yay)    
They do so by computing the minimum, the maximum, the median, the first quartile (25th percentile) and the third quartile (75th percentile).         
The box itself represents the interquartile range (IQR), which is the distance between the first and third quartiles.       
The "minimum" and the "maximum" (the whisker limits) are computed by subtracting 1.5 times the IQR from the first quartile and adding 1.5 times the IQR to the third quartile respectively.  
**They may not be the same as the minimum and the maximum value in the data**   
Any data points that fall outside 1.5 times the IQR from the edges of the box are considered **outliers** and are typically shown as individual points. 

The image below summarizes it well:

<img src="./data/box.jpg" width="600">

The retweet count is mostly 0s, which means the minimum and the maximum will be 0s and everything else is an outlier. So a box plot does not make sense:

In [None]:
# Box plot of the raw retweet counts
df['retweet_count'].plot.box()

Instead, let's investigate our previous finding which indicates that most tweets were posted mid-week. 
Could this result be driven by outliers? For instance, did a bot go rogue and post a million times on a Wednesday even though we have only 500k tweets, and inflate the numbers for Wednesday? 
Are **all** Wednesdays consistently busy on Twitter or Xverse? Or was this a one-off anomaly?
Let's find out

 This time, we cannot rely on `df['day_of_week'].value_counts()` because we do not want to aggregate by the day of the week (remember box plots do not operate on aggregated data.)    
 Still, we need to compute the number of rows/tweets for each day, and thus, need to aggregate by the date. Thus we group by the date and have a dataframe that has only the date and the number of tweets.      
 We still need to **retain the day of the week** information. One option is to regenerate it after grouping.       
 But a simple trick that decreases the amount of code is to add the categories to the `groupby` even though we do not intend to group by them.     
 If we run `groupby(['day_of_week', 'date'])` the data will be first grouped by the day of the week and then the date, and the resulting values will be the same as grouping by the date, but we retain the "day_of_week" column (in the index).  

In [None]:
# Add the calendar date of every tweet as its own column
df['date'] = df['created_at'].dt.date

# Count tweets per (weekday, date) pair. Every date has exactly one weekday, so these are
# simply the per-date counts, with the weekday carried along as an extra column.
daily_counts = (
    df.groupby(['day_of_week', 'date'])
      .size()
      .rename('tweet_count')
      .reset_index()
)
daily_counts

In [None]:
# One box per weekday, each built from that weekday's per-date tweet counts
daily_counts.boxplot(
    by='day_of_week',
    column='tweet_count',
    ylabel='Number of Tweets per Day',
    figsize=(12, 6),
)

My suspicions were right - Wednesday does indeed have an outlier that inflates the total number of tweets on Wednesday.     
It makes more sense to conclude that people tweet the most about ChatGPT on Tuesdays.       
That said, the overall insight still holds: people tend to tweet more during the middle of the week.  
Perhaps they surf on TikTok over the weekends instead?

#### Scatter Plot
Scatter plot is good when you want to see the relationship between two variables.    
Like box plots, we do not aggregate and show individual data points instead, including outliers (yikes.)

Let's see if there is a relationship between the number of likes and the number of retweets.    

In [None]:
# Scatter plot of likes against retweets on plain linear axes (one point per tweet)
df.plot.scatter(x='retweet_count', y='like_count')

Well, I'm not sure if there is a relationship, because there are some points with many retweets but few likes, and vice versa. These create outliers along both axes.

To address this, I will log-transform both axes.   
I will also set `alpha = 0.1` to make the points more transparent, so overlapping data points form denser regions.   
Finally, I’ll set `figsize=(6,6)` to produce a square plot, which makes it easier to interpret the relationship between the variables.   

In [None]:
# Likes against retweets again, this time with logarithmic x and y axes.
# Both axes are limited to 1 ... 10,000, the figure is square, and the points are
# drawn mostly transparent so that overlapping points show up as darker regions.
df.plot.scatter(
    x='retweet_count',
    y='like_count',
    logx=True,
    logy=True,
    xlim=(1, 10_000),
    ylim=(1, 10_000),
    alpha=0.1,
    grid=True,
    figsize=(6, 6),
    title='Likes vs Retweets (Log Scale)',
)


Yes there is a relationship between retweet counts and likes count (thank you captain obvious) but it’s not perfectly linear — tweets receive more likes than retweets.

# Correlation
Correlation measures the strength and direction of a linear relationship between two continuous variables.   
It is essentially quantifying what we have just observed on the scatter plot.

Pandas has a built-in function to compute correlation by `df.corr`. 
It computes the **Pearson correlation coefficient** by default but can also compute Kendall's tau (`method='kendall'`) and Spearman's rank correlation (`method='spearman'`).
All take values between -1 and 1. Higher absolute values indicate stronger correlation; the sign shows the direction (positive or negative).

I leave going through their formulas and the logic to you as exercise.  
But TL;DR: Pearson is based on values (e.g., like and retweet counts) and Spearman is based on rank (e.g., is the most liked tweet also the most retweeted)
Spearman is generally the more appropriate choice in a skewed dataset like ours. But it does not hurt if we compute and report all three.

In [None]:
# Print the retweet/like correlation matrix once per method, each under its own heading
# ('pearson' is what `corr()` uses when no method is given)
engagement_columns = ['retweet_count', 'like_count']
headed_methods = [
    ("Pearson correlation (default):", 'pearson'),
    ("\nSpearman correlation:", 'spearman'),
    ("\nKendall correlation:", 'kendall'),
]
for heading, method in headed_methods:
    print(heading)
    print(df[engagement_columns].corr(method=method))

In [None]:
# Create different engagement thresholds: 0, 10, 20, ..., 1000.
# linspace counts POINTS, not intervals, so 100 equal steps of 10 need 101 points;
# with 100 points the step is 1000/99 = 10.101..., i.e. fractional thresholds on integer counts.
thresholds = np.linspace(0, 1000, 101)

# Spearman correlation between retweets and likes, computed only on the tweets whose
# engagement count exceeds each threshold. `.iloc[0, 1]` picks the off-diagonal entry
# of the 2x2 correlation matrix, i.e. the retweet-vs-like coefficient.
correlations = [
    df.loc[df['engagement_count'] > threshold, ['retweet_count', 'like_count']]
      .corr(method='spearman')
      .iloc[0, 1]
    for threshold in thresholds
]

# Create DataFrame for plotting
corr_df = pd.DataFrame({
    'threshold': thresholds,
    'correlation': correlations
})

# Plot correlations
corr_df.plot(
    x='threshold',
    y='correlation',
    kind='line',
    marker='o',
    title='Spearman Correlation by Engagement Threshold',
    xlabel='Minimum Engagement Threshold',
    ylabel='Correlation Coefficient',
    figsize=(10, 6),
    grid=True
)


Interestingly, the relationship declines for popular tweets that have more than 400 engagements. Weird.

That's the end of the analysis worksheet, hope you enjoyed it!