# Biden Tweets Collection and Exploratory Analysis

## Loading Biden's Tweets

In [None]:
## load libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statistics
import nltk
# Tokenizer data needed by nltk.word_tokenize. NLTK 3.8.2+ loads the
# "punkt_tab" resource instead of the older pickled "punkt" models, so fetch
# both to keep the notebook working across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
def tokenize(phrase):
    '''
    Split a string into its individual word tokens.
    phrase: The string to be tokenized
    Returns the tokens produced by nltk.word_tokenize for phrase
    '''
    return nltk.word_tokenize(phrase)

In [None]:
# Read the collected Biden tweets from the CSV file into a DataFrame
biden_tweets = pd.read_csv("biden_tweets.csv")
# One row per tweet, so the row count is the number of tweets we collected
number_of_biden_tweets = biden_tweets.shape[0]
# Preview the first rows, then report how many tweets were loaded
print(biden_tweets.head())
print("Number of tweets: ", number_of_biden_tweets, sep="")

## Summary Statistics for Selected Variables

In [None]:
# Word-count summary statistics for the "full_text" column

# Token count of every tweet, in the same order as the DataFrame rows
biden_word_counts_list = []
for tweet in biden_tweets["full_text"]:
    tweet_length = len(tokenize(tweet))
    biden_word_counts_list.append(tweet_length)

# Total number of words across all collected tweets
biden_total_words = sum(biden_word_counts_list)
print("Biden words tweeted: ", biden_total_words, sep="")

# Central tendency
biden_mean_wordcount = np.mean(biden_word_counts_list)
print("Mean number of words in Biden tweets: ", biden_mean_wordcount, sep="")
biden_median_wordcount = np.median(biden_word_counts_list)
print("Median number of words in Biden tweets: ", biden_median_wordcount, sep="")
biden_mode_wordcount = statistics.mode(biden_word_counts_list)
print("Mode number of words in Biden tweets: ", biden_mode_wordcount, sep="")

# Range
biden_minimum_words = min(biden_word_counts_list)
print("The least number of words Biden used in a tweet was: ", biden_minimum_words, sep="")
biden_maximum_words = max(biden_word_counts_list)
print("The most number of words Biden used in a tweet was: ", biden_maximum_words, sep="")

# Spread
biden_stddev_wordcount = np.std(biden_word_counts_list)
print("Standard deviation of number of words in Biden tweets: ", biden_stddev_wordcount, sep="")

In [None]:
# Summary stats for the "favourite" column
biden_favorites = biden_tweets["favourite"]

# Total favorites across all collected tweets
biden_total_favorites = biden_favorites.sum()
print("Biden tweet favorites: " + str(biden_total_favorites))

# Central tendency
biden_mean_favorites = np.mean(biden_favorites)
print("Mean number of favorites for Biden tweets: " + str(biden_mean_favorites))
biden_median_favorites = np.median(biden_favorites)
print("Median number of favorites for Biden tweets: " + str(biden_median_favorites))
# The mode is intentionally not reported: it may not be meaningful due to the
# wide range between the minimum and maximum number of favorites.

# Range
biden_minimum_favorites = min(biden_favorites)
print("The least number of favorites a Biden tweet received was: " + str(biden_minimum_favorites))
biden_maximum_favorites = max(biden_favorites)
print("The most number of favorites a Biden tweet received was: " + str(biden_maximum_favorites))

# Spread (the label previously said "retweets" by mistake)
biden_stddev_favorites = np.std(biden_favorites)
print("Standard deviation of number of favorites for Biden tweets: " + str(biden_stddev_favorites))

In [None]:
# Retweet summary statistics, taken from the "retweets" column
biden_retweets = biden_tweets["retweets"]

# Total retweets across all collected tweets
biden_total_retweets = biden_retweets.sum()
print("Biden retweets: ", biden_total_retweets, sep="")

# Central tendency
biden_mean_retweets = np.mean(biden_retweets)
print("Mean number of retweets for Biden tweets: ", biden_mean_retweets, sep="")
biden_median_retweets = np.median(biden_retweets)
print("Median number of retweets for Biden tweets: ", biden_median_retweets, sep="")
# The mode is left out on purpose: with such a wide gap between the minimum
# and maximum number of retweets it may not be meaningful.

# Range
biden_minimum_retweets = min(biden_retweets)
print("The least number of retweets a Biden tweet received was: ", biden_minimum_retweets, sep="")
biden_maximum_retweets = max(biden_retweets)
print("The most number of retweets a Biden tweet received was: ", biden_maximum_retweets, sep="")

# Spread
biden_stddev_retweets = np.std(biden_retweets)
print("Standard deviation of number of retweets for each Biden tweet: ", biden_stddev_retweets, sep="")

In [None]:
# Breakdown of tweets by the "language" column

# Count the tweets per language once and reuse the result for both reports
biden_language_counts = biden_tweets["language"].value_counts()
print("Raw number of tweets in each language: ", "\n", biden_language_counts, sep="")
# Share of all collected tweets (as a fraction, not a percentage) per language
print("Fraction of tweets in each language: ", "\n",
      biden_language_counts / number_of_biden_tweets, sep="")



Thoughts: The sample size for tweets in languages other than English is likely too small to draw any meaningful conclusions about those tweets.

## Distributions of Variables

### Histograms

In [None]:
# Histogram of the number of words per tweet

plt.hist(biden_word_counts_list)
plt.title("Distribution of Number of Words in Biden Tweets")
plt.xlabel("Number of Words")
# Bar heights are how many tweets fall in each word-count bin
plt.ylabel("Number of Tweets")

In [None]:
# Histogram of the number of favorites per tweet

plt.hist(biden_favorites)
plt.title("Distribution of Number of Favorites for Biden Tweets")
plt.xlabel("Favorites")
# Bar heights are how many tweets fall in each favorites bin
plt.ylabel("Number of Tweets")

In [None]:
# Histogram of the number of retweets per tweet

plt.hist(biden_retweets)
plt.title("Distribution of Number of Retweets for Each Biden Tweet")
plt.xlabel("Number of Retweets")
# Bar heights are how many tweets fall in each retweet-count bin
plt.ylabel("Number of Tweets")

### Boxplots

In [None]:
# Boxplot of the per-tweet word counts

plt.boxplot(x=biden_word_counts_list)
plt.title(label="Distribution of Number of Words in Biden Tweets")
plt.ylabel(ylabel="Number of Words")

In [None]:
# Boxplot of the per-tweet favorites counts

plt.boxplot(biden_favorites)
plt.title("Distribution of Number of Favorites for Biden Tweets")
# The y-axis shows favorites (the label previously said "Number of Words")
plt.ylabel("Number of Favorites")

In [None]:
# Boxplot of the per-tweet retweet counts

plt.boxplot(x=biden_retweets)
plt.title(label="Distribution of Number of Retweets for Each Biden Tweet")
plt.ylabel(ylabel="Number of Retweets")

### Plotting Multiple Variables Together

In [None]:
# Scatter plot: number of words (x) vs. favorites (y), one point per tweet

# alpha < 1 makes overlapping points visible as darker regions
plt.scatter(x = biden_word_counts_list, y = biden_favorites, alpha = 0.6)
plt.title("Number of Words vs. Favorites for Biden Tweets")
plt.xlabel("Number of Words")
plt.ylabel("Number of Favorites")

In [None]:
# Scatter plot: number of words (x) vs. retweets (y), one point per tweet

# alpha < 1 makes overlapping points visible as darker regions
plt.scatter(x = biden_word_counts_list, y = biden_retweets, alpha = 0.6)
plt.title("Number of Words vs. Retweets for Biden Tweets")
plt.xlabel("Number of Words in Tweet")
plt.ylabel("Number of Retweets")

In [None]:
# Scatter plot: favorites (x) vs. retweets (y), one point per tweet

# alpha < 1 makes overlapping points visible as darker regions
plt.scatter(x = biden_favorites, y = biden_retweets, alpha = 0.6)
plt.title("Number of Favorites vs. Retweets for Biden Tweets")
# The x-axis is favorites (the label previously said "Number of Words in Tweet")
plt.xlabel("Number of Favorites")
plt.ylabel("Number of Retweets")