In [1]:
import json
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

## Create Shortened DataFrame

In [2]:
# Load Data...
# The file is read as JSON Lines: one JSON document per line.
# The context manager closes the file handle as soon as parsing finishes
# (a bare open() inside the comprehension would leave it open), and blank
# lines are skipped because json.loads cannot parse an empty string.
with open("./data/data.jsonl", 'r', encoding='utf-8') as jsonl_file:
    data = [json.loads(line) for line in jsonl_file if line.strip()]
df = pd.DataFrame(data)


### Create Summary DataFrame for Reference
# Features kept for the analysis, selected by NAME rather than by position so
# the selection does not depend on the order in which the columns happen to
# appear in df.
keep_features = {
    'created_at',
    'id_str',
    'full_text',
    'display_text_range',
    'entities',
    'source',
    'user',
    'is_quote_status',
    'retweet_count',
    'favorite_count',
    'possibly_sensitive',
    'lang',
    'retweeted_status',
}

# Features deliberately not used ("maybe" marks candidates to revisit):
#   id, truncated, in_reply_to_status_id, in_reply_to_status_id_str,
#   in_reply_to_user_id, in_reply_to_user_id_str, in_reply_to_screen_name,
#   geo, coordinates, place (maybe), contributors, quoted_status_id,
#   quoted_status_id_str, quoted_status_permalink, quoted_status (maybe),
#   favorited, retweeted, extended_entities (maybe), withheld_in_countries

# Fail loudly if an expected feature is absent instead of silently keeping less
missing_features = sorted(keep_features - set(df.columns))
if missing_features:
    raise KeyError(f'Expected columns missing from data: {missing_features}')

# Create list for columns used, 1 for used feature, 0 for not used feature
# (one flag per column, in the same order as df.columns)
use = [1 if column in keep_features else 0 for column in df.columns]

# Build the per-column summary: feature name, number of missing values,
# number of present values, and the 1/0 use flag.
null_counts = df.isna().sum().values
non_null_counts = df.notnull().sum().values
dict_ = {
    'feature': list(df.columns),
    'nulls': null_counts,
    'non_nulls': non_null_counts,
    'use': use,
}
summary_df = pd.DataFrame(dict_)


### Shorten Columns
# names of the features flagged as not used (use != 1)
drop_col = summary_df.loc[summary_df.use != 1, 'feature'].tolist()

# rebind df to a frame without the unused columns
df = df.drop(columns=drop_col)

# names of the features flagged as used (use == 1)
use_col = summary_df.loc[summary_df.use == 1, 'feature'].tolist()

# sanity check: the remaining columns are exactly the used ones, in order
print(f'Used Columns Are Present: {use_col == list(df.columns)}')

# parse the created_at values into a new datetime column
df['datetime'] = pd.to_datetime(df.created_at)

# created_at is now duplicated by datetime, so remove it
df = df.drop(columns='created_at')

Used Columns Are Present: True


In [3]:
df.head()

Unnamed: 0,id_str,full_text,display_text_range,entities,source,user,is_quote_status,retweet_count,favorite_count,possibly_sensitive,lang,retweeted_status,datetime
0,1311093385688735744,Biden a professional debater?? Hardly!! More ...,"[0, 71]","{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""https://mobile.twitter.com"" rel=""nofo...","{'id': 117904481, 'id_str': '117904481', 'name...",True,0,0,False,en,,2020-09-29 23:59:56+00:00
1,1311093394400333824,RT @DarDar458: #PresidentialDebate #Debates202...,"[0, 73]","{'hashtags': [{'text': 'PresidentialDebate', '...","<a href=""http://twitter.com/download/iphone"" r...","{'id': 30140760, 'id_str': '30140760', 'name':...",True,1,0,,und,{'created_at': 'Tue Sep 29 23:55:42 +0000 2020...,2020-09-29 23:59:58+00:00
2,1311093401652277252,How can Biden stand a chance against Trump in ...,"[0, 280]","{'hashtags': [{'text': 'VarneyCo', 'indices': ...","<a href=""https://studio.twitter.com"" rel=""nofo...","{'id': 281610530, 'id_str': '281610530', 'name...",False,25,100,False,en,,2020-09-30 00:00:00+00:00
3,1311093410737070086,RT @abbydphillip: Very reminiscent of pre-2018...,"[0, 108]","{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""http://twitter.com/download/iphone"" r...","{'id': 88223320, 'id_str': '88223320', 'name':...",True,351,0,,en,{'created_at': 'Tue Sep 29 23:00:28 +0000 2020...,2020-09-30 00:00:02+00:00
4,1311093419100561408,RT @RudyGiuliani: Why was Corrupt Biden’s son ...,"[0, 110]","{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""http://twitter.com/download/iphone"" r...","{'id': 44295160, 'id_str': '44295160', 'name':...",False,15672,0,,en,{'created_at': 'Tue Sep 29 19:07:59 +0000 2020...,2020-09-30 00:00:04+00:00


In [4]:
df.columns

Index(['id_str', 'full_text', 'display_text_range', 'entities', 'source',
       'user', 'is_quote_status', 'retweet_count', 'favorite_count',
       'possibly_sensitive', 'lang', 'retweeted_status', 'datetime'],
      dtype='object')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140337 entries, 0 to 140336
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype              
---  ------              --------------   -----              
 0   id_str              140337 non-null  object             
 1   full_text           140337 non-null  object             
 2   display_text_range  140337 non-null  object             
 3   entities            140337 non-null  object             
 4   source              140337 non-null  object             
 5   user                140337 non-null  object             
 6   is_quote_status     140337 non-null  bool               
 7   retweet_count       140337 non-null  int64              
 8   favorite_count      140337 non-null  int64              
 9   possibly_sensitive  32333 non-null   object             
 10  lang                140337 non-null  object             
 11  retweeted_status    106900 non-null  object             
 12  datetime        

## Min and Max Dates

In [6]:
print(f'Earliest date: {df.datetime.min()}')
print(f'Latest date: {df.datetime.max()}')

Earliest date: 2020-09-29 23:59:56+00:00
Latest date: 2020-10-02 23:59:54+00:00


## Create columns with numbers

## id_str column review

In [7]:
# there are 140,337 unique tweet ids, the same as the number of rows (140,337 entries)
df.id_str.nunique()

140337

### display_text_range column to int()

In [8]:
type(df.display_text_range.iloc[0])

list

In [9]:
df.display_text_range.iloc[0][-1]

71

In [10]:
def text_length(lst):
    """
    Returns the last element of a display_text_range entry,
    e.g. [0, 15] gives 15.

    Input: lst, sequence such as a display_text_range value
    Output: the final element of lst (an int for the [start, end] lists
            shown in this notebook)
    """
    end_position = lst[-1]
    return end_position

In [11]:
df['text_length'] = df.display_text_range.apply(text_length)

In [12]:
# display_text_range has been reduced to text_length, so remove the original column
df = df.drop(columns='display_text_range')

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140337 entries, 0 to 140336
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype              
---  ------              --------------   -----              
 0   id_str              140337 non-null  object             
 1   full_text           140337 non-null  object             
 2   entities            140337 non-null  object             
 3   source              140337 non-null  object             
 4   user                140337 non-null  object             
 5   is_quote_status     140337 non-null  bool               
 6   retweet_count       140337 non-null  int64              
 7   favorite_count      140337 non-null  int64              
 8   possibly_sensitive  32333 non-null   object             
 9   lang                140337 non-null  object             
 10  retweeted_status    106900 non-null  object             
 11  datetime            140337 non-null  datetime64[ns, UTC]
 12  text_length     

### entities column (come back to this with Julia and Mathias)

In [14]:
df.entities.iloc[0]

{'hashtags': [],
 'symbols': [],
 'user_mentions': [],
 'urls': [{'url': 'https://t.co/cQuxzexXYt',
   'expanded_url': 'https://twitter.com/IngrahamAngle/status/1311081374510710784',
   'display_url': 'twitter.com/IngrahamAngle/…',
   'indices': [72, 95]}]}

In [15]:
df.entities.iloc[1]

{'hashtags': [{'text': 'PresidentialDebate', 'indices': [15, 34]},
  {'text': 'Debates2020', 'indices': [35, 47]},
  {'text': 'TrumpIsANationalDisgrace', 'indices': [48, 73]}],
 'symbols': [],
 'user_mentions': [{'screen_name': 'DarDar458',
   'name': 'DarDar45 🌊 🆘',
   'id': 1084234892840484864,
   'id_str': '1084234892840484864',
   'indices': [3, 13]}],
 'urls': []}

In [16]:
df.entities.iloc[100]

{'hashtags': [],
 'symbols': [],
 'user_mentions': [{'screen_name': 'HizbkKhan',
   'name': 'Hizbullah Khan',
   'id': 1015029427284127745,
   'id_str': '1015029427284127745',
   'indices': [3, 13]}],
 'urls': [],
 'media': [{'id': 1310731423184191493,
   'id_str': '1310731423184191493',
   'indices': [93, 116],
   'media_url': 'http://pbs.twimg.com/ext_tw_video_thumb/1310731423184191493/pu/img/kXb4Oq2Wb0p8on5F.jpg',
   'media_url_https': 'https://pbs.twimg.com/ext_tw_video_thumb/1310731423184191493/pu/img/kXb4Oq2Wb0p8on5F.jpg',
   'url': 'https://t.co/XnGBT6jpLl',
   'display_url': 'pic.twitter.com/XnGBT6jpLl',
   'expanded_url': 'https://twitter.com/HizbkKhan/status/1310731479748669440/video/1',
   'type': 'photo',
   'sizes': {'thumb': {'w': 150, 'h': 150, 'resize': 'crop'},
    'medium': {'w': 628, 'h': 360, 'resize': 'fit'},
    'large': {'w': 628, 'h': 360, 'resize': 'fit'},
    'small': {'w': 628, 'h': 360, 'resize': 'fit'}},
   'source_status_id': 1310731479748669440,
   'sourc

### source column

In [17]:
df.source.iloc[0]

'<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>'

In [18]:
# write a function to parse between <a></a>

from bs4 import BeautifulSoup

def btw_a(txt):
    """
    Returns the text between the first <a></a> pair of an HTML snippet.

    Input: txt, str of HTML markup (e.g. a tweet's source value)
    Output: str, text content of the first <a> element, or None when the
            snippet contains no <a> element. Plain text that has already
            been parsed therefore also yields None, so do not apply this
            function twice to the same column.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser is installed (and warns), so results could differ per machine.
    soup = BeautifulSoup(txt, 'html.parser')
    first_anchor = soup.find('a')
    if first_anchor is None:
        return None
    return first_anchor.get_text()

In [19]:
# test the function btw_a
txt = '<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>'
btw_a(txt)

'Twitter Web App'

In [20]:
# apply it to source column
df.source = df.source.apply(btw_a)

In [21]:
# check work
df.source.head()

0         Twitter Web App
1      Twitter for iPhone
2    Twitter Media Studio
3      Twitter for iPhone
4      Twitter for iPhone
Name: source, dtype: object

In [22]:
# find the unique values
df.source.value_counts()

Twitter for iPhone         62080
Twitter for Android        38641
Twitter Web App            29655
Twitter for iPad            8046
TweetDeck                    459
                           ...  
buzybuzz3                      1
De-duped Economist             1
Clicky Sound                   1
CoSchedule                     1
Guanduania Humanitarian        1
Name: source, Length: 310, dtype: int64

In [23]:
len(df.source.value_counts())

310

In [24]:
source_lst = df.source.value_counts().index
source_lst

Index(['Twitter for iPhone', 'Twitter for Android', 'Twitter Web App',
       'Twitter for iPad', 'TweetDeck', 'Tweetbot for iΟS', 'WordPress.com',
       'dlvr.it', 'IFTTT', 'Twitter for Mac',
       ...
       'Financial Performance Rater', 'KXChg', 'PublikaMD', 'SocialGest',
       'Alerta Digital', 'buzybuzz3', 'De-duped Economist', 'Clicky Sound',
       'CoSchedule', 'Guanduania Humanitarian'],
      dtype='object', length=310)

### user column

In [25]:
df.user.iloc[0]

{'id': 117904481,
 'id_str': '117904481',
 'name': 'SharonS',
 'screen_name': 'sfshores53',
 'location': 'America ',
 'description': 'Conservative/#MAGA/ #WWG1WGA /MbrNRA/#Trump2020/Pro-Life/Pro-2nd Amend/parler@SharonA53/ NO DMs-/Patriots-will follow all/will unfollow those who do not follow',
 'url': None,
 'entities': {'description': {'urls': []}},
 'protected': False,
 'followers_count': 4065,
 'friends_count': 4998,
 'listed_count': 16,
 'created_at': 'Sat Feb 27 00:36:23 +0000 2010',
 'favourites_count': 3659,
 'utc_offset': None,
 'time_zone': None,
 'geo_enabled': True,
 'verified': False,
 'statuses_count': 6156,
 'lang': None,
 'contributors_enabled': False,
 'is_translator': False,
 'is_translation_enabled': False,
 'profile_background_color': '0099B9',
 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme4/bg.gif',
 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme4/bg.gif',
 'profile_background_tile': False,
 'profile_ima

In [26]:
type(df.user.iloc[0])

dict

In [27]:
# helper: read the account id out of a tweet's user dictionary
def user_id(dict_):
    """
    Looks up the 'id_str' entry of a user dictionary.

    Input: dict_, user dictionary (must contain the key 'id_str')
    Output: value stored under 'id_str' (a str in the sample user shown
            in this notebook)
    """
    account_id = dict_['id_str']
    return account_id

In [28]:
# create user id column
df['user_id'] = df.user.apply(user_id)

In [29]:
# there are 121,040 unique user ids
df.user_id.nunique()

121040

In [30]:
# user followers_count sample
df.user.iloc[0]['followers_count']

4065

In [31]:
# helper: read the follower total out of a tweet's user dictionary
def followers_count(dict_):
    """
    Looks up the 'followers_count' entry of a user dictionary.

    Input: dict_, user dictionary (must contain the key 'followers_count')
    Output: value stored under 'followers_count' (an int in the sample
            user shown in this notebook)
    """
    n_followers = dict_['followers_count']
    return n_followers

In [32]:
# create followers count column
df['followers_count'] = df.user.apply(followers_count)

In [33]:
# check data type
type(df.followers_count.iloc[0])

numpy.int64

In [34]:
# user friends_count sample
df.user.iloc[0]['friends_count']

4998

In [35]:
# helper: read the friends total out of a tweet's user dictionary
def friends_count(dict_):
    """
    Looks up the 'friends_count' entry of a user dictionary.

    Input: dict_, user dictionary (must contain the key 'friends_count')
    Output: value stored under 'friends_count' (an int in the sample
            user shown in this notebook)
    """
    n_friends = dict_['friends_count']
    return n_friends

In [36]:
# create friends count column
df['friends_count'] = df.user.apply(friends_count)

In [37]:
# check data type
type(df.friends_count.iloc[0])

numpy.int64

In [38]:
# user location sample
df.user.iloc[0]['location']

'America '

In [39]:
# helper: read the profile location out of a tweet's user dictionary
def location(dict_):
    """
    Looks up the 'location' entry of a user dictionary.

    Input: dict_, user dictionary (must contain the key 'location')
    Output: value stored under 'location', returned unchanged (a str such
            as 'America ' in the sample user shown in this notebook)
    """
    profile_location = dict_['location']
    return profile_location

In [40]:
# create user locations column
df['user_location'] = df.user.apply(location)

In [41]:
# check data type
type(df.user_location.iloc[0])

str

In [42]:
# user created sample
df.user.iloc[0]['created_at']

'Sat Feb 27 00:36:23 +0000 2010'

In [43]:
# helper: read the account creation timestamp out of a tweet's user dictionary
def acc_created(dict_):
    """
    Looks up the 'created_at' entry of a user dictionary.

    Input: dict_, user dictionary (must contain the key 'created_at')
    Output: value stored under 'created_at', returned unchanged (a str such
            as 'Sat Feb 27 00:36:23 +0000 2010' in the sample user shown
            in this notebook; it is not converted to a datetime)
    """
    created_stamp = dict_['created_at']
    return created_stamp

In [44]:
# create account created column
df['acct_created'] = df.user.apply(acc_created)

In [45]:
# check data type
type(df.acct_created.iloc[0])

str

In [46]:
# user statuses_count sample
df.user.iloc[0]['statuses_count']

6156

In [47]:
# helper: read the statuses total out of a tweet's user dictionary
def statuses_count(dict_):
    """
    Looks up the 'statuses_count' entry of a user dictionary.

    Input: dict_, user dictionary (must contain the key 'statuses_count')
    Output: value stored under 'statuses_count' (an int in the sample
            user shown in this notebook)
    """
    n_statuses = dict_['statuses_count']
    return n_statuses

In [48]:
# create user statuses_count column
df['statuses_count'] = df.user.apply(statuses_count)

In [49]:
# check data type
type(df.statuses_count.iloc[0])

numpy.int64

In [50]:
# user following sample
df.user.iloc[0]['following']

False

In [51]:
# helper: read the 'following' flag out of a tweet's user dictionary
def following(dict_):
    """
    Looks up the 'following' entry of a user dictionary.

    Input: dict_, user dictionary (must contain the key 'following')
    Output: value stored under 'following' (False in the sample user
            shown in this notebook)
    """
    is_following = dict_['following']
    return is_following

In [52]:
# create user following column
df['following'] = df.user.apply(following)

In [53]:
# check data type
type(df.following.iloc[0])

numpy.bool_

In [54]:
# user follow_request_sent sample
df.user.iloc[0]['follow_request_sent']

False

In [55]:
# helper: read the 'follow_request_sent' flag out of a tweet's user dictionary
def following_request(dict_):
    """
    Looks up the 'follow_request_sent' entry of a user dictionary.

    Input: dict_, user dictionary (must contain the key 'follow_request_sent')
    Output: value stored under 'follow_request_sent' (False in the sample
            user shown in this notebook)
    """
    request_sent = dict_['follow_request_sent']
    return request_sent

In [56]:
# create user following_request column
df['following_request'] = df.user.apply(following_request)

In [57]:
# check data type
type(df.following_request.iloc[0])

numpy.bool_

In [58]:
# user notifications sample
df.user.iloc[0]['notifications']

False

In [59]:
# helper: read the 'notifications' flag out of a tweet's user dictionary
def notifications(dict_):
    """
    Looks up the 'notifications' entry of a user dictionary.

    Input: dict_, user dictionary (must contain the key 'notifications')
    Output: value stored under 'notifications' (False in the sample user
            shown in this notebook)
    """
    notify_flag = dict_['notifications']
    return notify_flag

In [60]:
# create user notifications column
df['notifications'] = df.user.apply(notifications)

In [61]:
# check data type
type(df.notifications.iloc[0])

numpy.bool_

In [62]:
# user description sample
df.user.iloc[0]['description']

'Conservative/#MAGA/ #WWG1WGA /MbrNRA/#Trump2020/Pro-Life/Pro-2nd Amend/parler@SharonA53/ NO DMs-/Patriots-will follow all/will unfollow those who do not follow'

In [63]:
# helper: read the profile description out of a tweet's user dictionary
def user_descriptions(dict_):
    """
    Looks up the 'description' entry of a user dictionary.

    Input: dict_, user dictionary (must contain the key 'description')
    Output: value stored under 'description', returned unchanged (a str in
            the sample user shown in this notebook)
    """
    profile_text = dict_['description']
    return profile_text

In [64]:
# create user descriptions column
df['user_descriptions'] = df.user.apply(user_descriptions)

In [65]:
# check data type
type(df.user_descriptions.iloc[0])

str

## Creating an Influence Score

In [66]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140337 entries, 0 to 140336
Data columns (total 23 columns):
 #   Column              Non-Null Count   Dtype              
---  ------              --------------   -----              
 0   id_str              140337 non-null  object             
 1   full_text           140337 non-null  object             
 2   entities            140337 non-null  object             
 3   source              140319 non-null  object             
 4   user                140337 non-null  object             
 5   is_quote_status     140337 non-null  bool               
 6   retweet_count       140337 non-null  int64              
 7   favorite_count      140337 non-null  int64              
 8   possibly_sensitive  32333 non-null   object             
 9   lang                140337 non-null  object             
 10  retweeted_status    106900 non-null  object             
 11  datetime            140337 non-null  datetime64[ns, UTC]
 12  text_length     

## The int() columns seem interesting, investigate further

In [67]:
# retweet_count stats:
df.retweet_count.mean(), df.retweet_count.median(), df.retweet_count.std()

(21028.81549413198, 859.0, 66438.94603761076)

In [68]:
# favorite_count stats:
df.favorite_count.mean(), df.favorite_count.median(), df.favorite_count.std()

(4.9153395041934775, 0.0, 661.1712932699155)

In [69]:
# text_length stats:
df.text_length.mean(), df.text_length.median(), df.text_length.std()

(118.03955478597946, 139.0, 52.76082452811671)

In [70]:
# followers_count stats:
df.followers_count.mean(), df.followers_count.median(), df.followers_count.std()

(6257.545629449111, 423.0, 349213.32380061835)

In [72]:
# friends_count stats:
df.friends_count.mean(), df.friends_count.median(), df.friends_count.std()

(2503.2387253539696, 690.0, 7632.612775724196)