In [1]:
import pandas as pd
import re

In [2]:
# Load the raw tweet export into a DataFrame.
tweets = pd.read_csv(filepath_or_buffer="data/realdonaldtrump.csv")

In [3]:
# Look at the raw timestamp string of the first row (index label 0).
tweets.loc[0, "date"]

'2009-05-04 13:54:25'

In [4]:
# Try the HH:MM:SS pattern on the first timestamp and show the first match.
first_timestamp = tweets.loc[0, "date"]
re.findall(r"\d{2}:\d{2}:\d{2}", first_timestamp)[0]

'13:54:25'

In [5]:
# Split the date column into two columns: YYYY-MM-DD and HH:MM:SS.
# To do so, define two functions that take a row and return its date part
# and its time part respectively.
    
def date(row):
    """Return the first ``YYYY-MM-DD`` substring found in ``row["date"]``.

    ``row`` is anything indexable by the key ``"date"`` whose value is a
    string. An ``IndexError`` is raised when the value contains no such
    substring.
    """
    day_matches = re.findall(r"\d{4}-\d{2}-\d{2}", row["date"])
    return day_matches[0]

def time(row):
    """Return the first ``HH:MM:SS`` substring found in ``row["date"]``.

    ``row`` is anything indexable by the key ``"date"`` whose value is a
    string. An ``IndexError`` is raised when the value contains no such
    substring.
    """
    clock_matches = re.findall(r"\d{2}:\d{2}:\d{2}", row["date"])
    return clock_matches[0]

    # Create the columns YYYY-MM-DD and HH:MM:SS and populate them from the
    # original date column. The patterns are the same ones used by date() and
    # time() above, but they are applied to the whole column at once with the
    # vectorized .str.extract accessor instead of a row-by-row apply, which
    # avoids one Python-level function call per row.

tweets["YYYY-MM-DD"] = tweets["date"].str.extract(r"(\d{4}-\d{2}-\d{2})", expand=False)
tweets["HH:MM:SS"] = tweets["date"].str.extract(r"(\d{2}:\d{2}:\d{2})", expand=False)

# .str.extract yields NaN where the pattern does not match (the row-wise
# helpers would raise instead), so fail loudly if any timestamp is malformed.
assert tweets[["YYYY-MM-DD", "HH:MM:SS"]].notna().all().all(), \
    "date column contains a value without a YYYY-MM-DD or HH:MM:SS part"

# Also compute the engagement of each tweet: retweets plus favorites.

tweets["engagement"] = tweets["retweets"] + tweets["favorites"]

In [6]:
# Preview the first five rows, including the three new columns.
tweets.head(n=5)

Unnamed: 0,id,link,content,date,retweets,favorites,mentions,hashtags,YYYY-MM-DD,HH:MM:SS,engagement
0,1698308935,https://twitter.com/realDonaldTrump/status/169...,Be sure to tune in and watch Donald Trump on L...,2009-05-04 13:54:25,510,917,,,2009-05-04,13:54:25,1427
1,1701461182,https://twitter.com/realDonaldTrump/status/170...,Donald Trump will be appearing on The View tom...,2009-05-04 20:00:10,34,267,,,2009-05-04,20:00:10,301
2,1737479987,https://twitter.com/realDonaldTrump/status/173...,Donald Trump reads Top Ten Financial Tips on L...,2009-05-08 08:38:08,13,19,,,2009-05-08,08:38:08,32
3,1741160716,https://twitter.com/realDonaldTrump/status/174...,New Blog Post: Celebrity Apprentice Finale and...,2009-05-08 15:40:15,11,26,,,2009-05-08,15:40:15,37
4,1773561338,https://twitter.com/realDonaldTrump/status/177...,"""My persona will never be that of a wallflower...",2009-05-12 09:07:28,1375,1945,,,2009-05-12,09:07:28,3320


In [7]:
# Once date has been separated into date and time, reorder the columns:
# first date, YYYY-MM-DD and HH:MM:SS, then the tweet content with its
# engagement, retweets and favorites, then mentions and hashtags, and
# finally the link and the id.

column_order = ["date", "YYYY-MM-DD", "HH:MM:SS", "content",
                "engagement", "retweets", "favorites",
                "mentions", "hashtags", "link", "id"]
tweets = tweets[column_order]

# NOTE(review): to_csv writes the row index as an unnamed first column by
# default; pass index=False if downstream readers do not need it - confirm.
tweets.to_csv('data/tweets_clean.csv', sep = ';', decimal = ',')

# Keep the preview as the last expression of the cell: a bare expression in
# the middle of a cell is evaluated but never displayed.
tweets.head()

In [8]:
# Check how many non-null values there are in each column

tweets.info()

# The only two columns with many missing values are mentions and
# hashtags, which is probably because most tweets simply do not use
# these features rather than because of a gap in our data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43352 entries, 0 to 43351
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   date        43352 non-null  object
 1   YYYY-MM-DD  43352 non-null  object
 2   HH:MM:SS    43352 non-null  object
 3   content     43352 non-null  object
 4   engagement  43352 non-null  int64 
 5   retweets    43352 non-null  int64 
 6   favorites   43352 non-null  int64 
 7   mentions    20386 non-null  object
 8   hashtags    5583 non-null   object
 9   link        43352 non-null  object
 10  id          43352 non-null  int64 
dtypes: int64(4), object(7)
memory usage: 3.6+ MB


In [9]:
# Aggregate per calendar day: number of tweets ("content" count) and total engagement.
daily_tweets = (
    tweets
    .groupby("YYYY-MM-DD", as_index = False)
    .agg({"content": "count", "engagement": "sum"})
)

# Average engagement per tweet for each day.
daily_tweets["engagement/tweet"] = daily_tweets["engagement"].div(daily_tweets["content"])

daily_tweets.to_csv('data/daily_tweets.csv', sep = ';', decimal = ',')