In [1]:
import json

import matplotlib.pyplot as plt
import pandas as pd
from bs4 import BeautifulSoup
plt.style.use('fivethirtyeight')

## Create Shortened DataFrame

In [2]:
# Load Data...
# Parse the JSON Lines file one record per line. The `with` block closes the
# file handle as soon as parsing is done (the bare open() call never closed
# it explicitly), and whitespace-only lines are skipped so they cannot make
# json.loads raise.
with open("./data/data.jsonl", 'r', encoding='utf-8') as jsonl_file:
    data = [json.loads(line) for line in jsonl_file if line.strip()]
df = pd.DataFrame(data)


### Create Summary DataFrame for Reference
# Features kept for the analysis, selected by name so the choice does not
# depend on the order in which the columns happen to appear in `df`.
keep_features = {
    'created_at',
    'id_str',
    'full_text',
    'display_text_range',
    'entities',
    'source',
    'user',
    'is_quote_status',
    'retweet_count',
    'favorite_count',
    'possibly_sensitive',
    'lang',
    'retweeted_status',
}

# Features deliberately left out (kept here for reference):
#   id, truncated, in_reply_to_status_id, in_reply_to_status_id_str,
#   in_reply_to_user_id, in_reply_to_user_id_str, in_reply_to_screen_name,
#   geo, coordinates, place (maybe), contributors, quoted_status_id,
#   quoted_status_id_str, quoted_status_permalink, quoted_status (maybe),
#   favorited, retweeted, extended_entities (maybe), withheld_in_countries

# Fail early, with a readable message, if the data lacks a feature we rely on
missing = sorted(keep_features - set(df.columns))
assert not missing, f'Expected columns missing from data: {missing}'

# Create list for columns used, 1 for used feature, 0 for not used feature;
# built from df.columns so each flag is guaranteed to line up with its column
use = [1 if col in keep_features else 0 for col in df.columns]

# Create a dictionary with summary information (one entry per column of df)
dict_ = {'feature': list(df.columns)
         , 'nulls': df.isna().sum().values
         , 'non_nulls': df.notnull().sum().values
         , 'use': use
        }
summary_df = pd.DataFrame(dict_)


### Shorten Columns
# Names of the features flagged as not used
drop_col = summary_df.loc[summary_df['use'] != 1, 'feature'].tolist()

# Remove the unused columns; the trimmed frame is rebound to `df`
df = df.drop(columns=drop_col)

# Names of the features flagged as used
use_col = summary_df.loc[summary_df['use'] == 1, 'feature'].tolist()

# Report whether the remaining columns match the used-feature list
print(f'Used Columns Are Present: {use_col == list(df.columns)}')

# Parse created_at into a new datetime column, then discard the raw column
df['datetime'] = pd.to_datetime(df['created_at'])
df = df.drop(columns='created_at')

Used Columns Are Present: True


In [3]:
df.head()

Unnamed: 0,id_str,full_text,display_text_range,entities,source,user,is_quote_status,retweet_count,favorite_count,possibly_sensitive,lang,retweeted_status,datetime
0,1311093385688735744,Biden a professional debater?? Hardly!! More ...,"[0, 71]","{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""https://mobile.twitter.com"" rel=""nofo...","{'id': 117904481, 'id_str': '117904481', 'name...",True,0,0,False,en,,2020-09-29 23:59:56+00:00
1,1311093394400333824,RT @DarDar458: #PresidentialDebate #Debates202...,"[0, 73]","{'hashtags': [{'text': 'PresidentialDebate', '...","<a href=""http://twitter.com/download/iphone"" r...","{'id': 30140760, 'id_str': '30140760', 'name':...",True,1,0,,und,{'created_at': 'Tue Sep 29 23:55:42 +0000 2020...,2020-09-29 23:59:58+00:00
2,1311093401652277252,How can Biden stand a chance against Trump in ...,"[0, 280]","{'hashtags': [{'text': 'VarneyCo', 'indices': ...","<a href=""https://studio.twitter.com"" rel=""nofo...","{'id': 281610530, 'id_str': '281610530', 'name...",False,25,100,False,en,,2020-09-30 00:00:00+00:00
3,1311093410737070086,RT @abbydphillip: Very reminiscent of pre-2018...,"[0, 108]","{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""http://twitter.com/download/iphone"" r...","{'id': 88223320, 'id_str': '88223320', 'name':...",True,351,0,,en,{'created_at': 'Tue Sep 29 23:00:28 +0000 2020...,2020-09-30 00:00:02+00:00
4,1311093419100561408,RT @RudyGiuliani: Why was Corrupt Biden’s son ...,"[0, 110]","{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""http://twitter.com/download/iphone"" r...","{'id': 44295160, 'id_str': '44295160', 'name':...",False,15672,0,,en,{'created_at': 'Tue Sep 29 19:07:59 +0000 2020...,2020-09-30 00:00:04+00:00


In [4]:
df.columns

Index(['id_str', 'full_text', 'display_text_range', 'entities', 'source',
       'user', 'is_quote_status', 'retweet_count', 'favorite_count',
       'possibly_sensitive', 'lang', 'retweeted_status', 'datetime'],
      dtype='object')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140337 entries, 0 to 140336
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype              
---  ------              --------------   -----              
 0   id_str              140337 non-null  object             
 1   full_text           140337 non-null  object             
 2   display_text_range  140337 non-null  object             
 3   entities            140337 non-null  object             
 4   source              140337 non-null  object             
 5   user                140337 non-null  object             
 6   is_quote_status     140337 non-null  bool               
 7   retweet_count       140337 non-null  int64              
 8   favorite_count      140337 non-null  int64              
 9   possibly_sensitive  32333 non-null   object             
 10  lang                140337 non-null  object             
 11  retweeted_status    106900 non-null  object             
 12  datetime        

## Min and Max Dates

In [24]:
# Report the time span covered by the tweets in df.datetime
earliest_ts, latest_ts = df['datetime'].min(), df['datetime'].max()
print(f'Earliest date: {earliest_ts}')
print(f'Latest date: {latest_ts}')

Earliest date: 2020-09-29 23:59:56+00:00
Latest date: 2020-10-02 23:59:54+00:00


## Create columns with numbers

### display_text_range column to int()

In [6]:
type(df.display_text_range.iloc[0])

list

In [7]:
df.display_text_range.iloc[0][-1]

71

In [13]:
def text_length(lst):
    """
    Return the last element of a sequence, e.g. [0, 15] gives 15.

    Input: lst, column entry as from display_text_range
    Output: int, the final value of the entry, used as the tweet length
    """
    end_value = lst[-1]
    return end_value

In [17]:
# Store the end value of each display_text_range entry as the tweet length.
# NOTE(review): the column name 'text_lengh' is misspelled; it is left as-is
# here because other cells may already refer to it by this name.
df['text_lengh'] = df['display_text_range'].apply(text_length)

In [18]:
# Drop display_text_range (the trimmed frame is rebound to `df`)
df = df.drop(columns='display_text_range')

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140337 entries, 0 to 140336
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype              
---  ------              --------------   -----              
 0   id_str              140337 non-null  object             
 1   full_text           140337 non-null  object             
 2   entities            140337 non-null  object             
 3   source              140337 non-null  object             
 4   user                140337 non-null  object             
 5   is_quote_status     140337 non-null  bool               
 6   retweet_count       140337 non-null  int64              
 7   favorite_count      140337 non-null  int64              
 8   possibly_sensitive  32333 non-null   object             
 9   lang                140337 non-null  object             
 10  retweeted_status    106900 non-null  object             
 11  datetime            140337 non-null  datetime64[ns, UTC]
 12  text_lengh      

### entities column (come back to this with Julia and Mathias)

In [19]:
df.entities.iloc[0]

{'hashtags': [],
 'symbols': [],
 'user_mentions': [],
 'urls': [{'url': 'https://t.co/cQuxzexXYt',
   'expanded_url': 'https://twitter.com/IngrahamAngle/status/1311081374510710784',
   'display_url': 'twitter.com/IngrahamAngle/…',
   'indices': [72, 95]}]}

In [21]:
df.entities.iloc[1]

{'hashtags': [{'text': 'PresidentialDebate', 'indices': [15, 34]},
  {'text': 'Debates2020', 'indices': [35, 47]},
  {'text': 'TrumpIsANationalDisgrace', 'indices': [48, 73]}],
 'symbols': [],
 'user_mentions': [{'screen_name': 'DarDar458',
   'name': 'DarDar45 🌊 🆘',
   'id': 1084234892840484864,
   'id_str': '1084234892840484864',
   'indices': [3, 13]}],
 'urls': []}

In [22]:
df.entities.iloc[100]

{'hashtags': [],
 'symbols': [],
 'user_mentions': [{'screen_name': 'HizbkKhan',
   'name': 'Hizbullah Khan',
   'id': 1015029427284127745,
   'id_str': '1015029427284127745',
   'indices': [3, 13]}],
 'urls': [],
 'media': [{'id': 1310731423184191493,
   'id_str': '1310731423184191493',
   'indices': [93, 116],
   'media_url': 'http://pbs.twimg.com/ext_tw_video_thumb/1310731423184191493/pu/img/kXb4Oq2Wb0p8on5F.jpg',
   'media_url_https': 'https://pbs.twimg.com/ext_tw_video_thumb/1310731423184191493/pu/img/kXb4Oq2Wb0p8on5F.jpg',
   'url': 'https://t.co/XnGBT6jpLl',
   'display_url': 'pic.twitter.com/XnGBT6jpLl',
   'expanded_url': 'https://twitter.com/HizbkKhan/status/1310731479748669440/video/1',
   'type': 'photo',
   'sizes': {'thumb': {'w': 150, 'h': 150, 'resize': 'crop'},
    'medium': {'w': 628, 'h': 360, 'resize': 'fit'},
    'large': {'w': 628, 'h': 360, 'resize': 'fit'},
    'small': {'w': 628, 'h': 360, 'resize': 'fit'}},
   'source_status_id': 1310731479748669440,
   'sourc

### source column

In [23]:
df.source.iloc[0]

'<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>'

In [38]:
# Function to parse the text between <a></a>

def btw_a(txt):
    """
    Returns the text between the first <a></a> pair in an HTML snippet.

    Input: txt, str containing HTML (e.g. an entry of the source column)
    Output: str with the text of the first <a> element, or None when txt
            contains no <a> element
    """
    # Name the parser explicitly: BeautifulSoup(txt) on its own emits a
    # GuessedAtParserWarning and uses whichever parser happens to be
    # installed, so results could differ from one environment to another.
    soup = BeautifulSoup(txt, 'html.parser')
    # find() returns the first match or None, which is what the original
    # "loop and return on the first iteration" expressed.
    anchor = soup.find('a')
    return anchor.get_text() if anchor is not None else None

In [40]:
# test the function btw_a on a sample source value; the cell should display
# the anchor text, 'Twitter Web App'
txt = '<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>'
btw_a(txt)

'Twitter Web App'

In [41]:
# Replace each source entry with the text of its <a> element.
# NOTE: not idempotent. btw_a returns None for text without an <a> tag, so
# re-running this cell on the already-cleaned column would turn it into None.
df['source'] = df['source'].apply(btw_a)

In [42]:
# check work
df.source.head()

0         Twitter Web App
1      Twitter for iPhone
2    Twitter Media Studio
3      Twitter for iPhone
4      Twitter for iPhone
Name: source, dtype: object

In [43]:
# find the unique values
df.source.value_counts()

Twitter for iPhone           62080
Twitter for Android          38641
Twitter Web App              29655
Twitter for iPad              8046
TweetDeck                      459
                             ...  
Merchant Media Bot               1
Chirp for Twitter watchOS        1
SecDevBot                        1
akidthaine-Yugioh                1
tbapptwt                         1
Name: source, Length: 310, dtype: int64

In [44]:
len(df.source.value_counts())

310

In [45]:
# Distinct source names, most frequent first. Despite the name, this is a
# pandas Index rather than a Python list.
source_lst = df['source'].value_counts().index
source_lst

Index(['Twitter for iPhone', 'Twitter for Android', 'Twitter Web App',
       'Twitter for iPad', 'TweetDeck', 'Tweetbot for iΟS', 'WordPress.com',
       'dlvr.it', 'IFTTT', 'Twitter for Mac',
       ...
       'ambul news bot', 'gabge', 'Breaking News & Religion',
       'Share_Twitter_Office365', 'resistnews', 'Merchant Media Bot',
       'Chirp for Twitter watchOS', 'SecDevBot', 'akidthaine-Yugioh',
       'tbapptwt'],
      dtype='object', length=310)