In [1]:
import json
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

## Create Shortened DataFrame for Memory Considerations

In [8]:
# Load Data...
data = [json.loads(line) for line in open("./data/data.jsonl", 'r', encoding='utf-8')]
df = pd.DataFrame(data)


### Create Summary DataFrame for Reference
# Create list for columns used, 1 for used feature, 0 for not used feature
use = [1      # created_at
       , 0    # id
       , 1    # id_str
       , 1    # full_text
       , 0    # truncated
       , 1    # display_text_range
       , 1    # entities
       , 1    # source
       , 0    # in_reply_to_status_id
       , 0    # in_reply_to_status_id_str
       , 0    # in_reply_to_user_id
       , 0    # in_reply_to_user_id_str
       , 0    # in_reply_to_screen_name
       , 1    # user
       , 0    # geo 
       , 0    # coordinates
       , 0    # place (maybe)
       , 0    # contributors
       , 1    # is_quote_status
       , 0    # quoted_status_id 
       , 0    # quoted_status_id_str 
       , 0    # quoted_status_permalink
       , 0    # quoted_status (maybe)
       , 1    # retweet_count 
       , 1    # favorite_count
       , 0    # favorited
       , 0    # retweeted
       , 1    # possibly_sensitive 
       , 1    # lang
       , 1    # retweeted_status
       , 0    # extended_entities (maybe)
       , 0    # withheld_in_countries
       ]

# Create a dictionary with summary information
dict_ = {'feature': list(df.columns)
         , 'nulls': df.isna().sum().values
         , 'non_nulls': df.notnull().sum().values
         , 'use': use
        }
summary_df = pd.DataFrame(dict_)


### Shorten Columns
# create a list of dropped column names
drop_col = list(summary_df.feature[summary_df.use !=1])

# create a df_short dataframe with only the columns we need
df = df.drop(columns = drop_col)

# create a list of kept column names
use_col = list(summary_df.feature[summary_df.use==1])

# verify used columns are present
print(f'Used Columns Are Present: {use_col == list(df.columns)}')


### Save df as .csv to load to readuce memory useage
df.to_csv('./data/data_short.csv'
          , index=False
         )

Used Columns Are Present: True


In [9]:
df.shape

(140337, 13)

In [10]:
df.isna().sum()

created_at                 0
id_str                     0
full_text                  0
display_text_range         0
entities                   0
source                     0
user                       0
is_quote_status            0
retweet_count              0
favorite_count             0
possibly_sensitive    108004
lang                       0
retweeted_status       33437
dtype: int64

### Load data_short.csv From Now On...

In [41]:
df = pd.read_csv('./data/data_short.csv'
                 , low_memory=False
                )

In [42]:
df.shape

(140338, 13)

### A Look at Nulls

In [43]:
df.head()

Unnamed: 0,created_at,id_str,full_text,display_text_range,entities,source,user,is_quote_status,retweet_count,favorite_count,possibly_sensitive,lang,retweeted_status
0,Tue Sep 29 23:59:56 +0000 2020,1311093385688735744,Biden a professional debater?? Hardly!! More ...,"[0, 71]","{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""https://mobile.twitter.com"" rel=""nofo...","{'id': 117904481, 'id_str': '117904481', 'name...",True,0,0,False,en,
1,Tue Sep 29 23:59:58 +0000 2020,1311093394400333824,RT @DarDar458: #PresidentialDebate #Debates202...,"[0, 73]","{'hashtags': [{'text': 'PresidentialDebate', '...","<a href=""http://twitter.com/download/iphone"" r...","{'id': 30140760, 'id_str': '30140760', 'name':...",True,1,0,,und,{'created_at': 'Tue Sep 29 23:55:42 +0000 2020...
2,Wed Sep 30 00:00:00 +0000 2020,1311093401652277252,How can Biden stand a chance against Trump in ...,"[0, 280]","{'hashtags': [{'text': 'VarneyCo', 'indices': ...","<a href=""https://studio.twitter.com"" rel=""nofo...","{'id': 281610530, 'id_str': '281610530', 'name...",False,25,100,False,en,
3,Wed Sep 30 00:00:02 +0000 2020,1311093410737070086,RT @abbydphillip: Very reminiscent of pre-2018...,"[0, 108]","{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""http://twitter.com/download/iphone"" r...","{'id': 88223320, 'id_str': '88223320', 'name':...",True,351,0,,en,{'created_at': 'Tue Sep 29 23:00:28 +0000 2020...
4,Wed Sep 30 00:00:04 +0000 2020,1311093419100561408,RT @RudyGiuliani: Why was Corrupt Biden’s son ...,"[0, 110]","{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""http://twitter.com/download/iphone"" r...","{'id': 44295160, 'id_str': '44295160', 'name':...",False,15672,0,,en,{'created_at': 'Tue Sep 29 19:07:59 +0000 2020...


In [44]:
df.isna().sum()

created_at                 0
id_str                     0
full_text                  0
display_text_range         1
entities                   1
source                    19
user                       1
is_quote_status            1
retweet_count              1
favorite_count             1
possibly_sensitive    108006
lang                       2
retweeted_status       33438
dtype: int64

In [45]:
# Pandas created an extra row
df[df.user.isna()]

Unnamed: 0,created_at,id_str,full_text,display_text_range,entities,source,user,is_quote_status,retweet_count,favorite_count,possibly_sensitive,lang,retweeted_status
110181,Fri Oct 02 08:23:01 +0000 2020,1311944766872059904,@rudybouma The Cornell University study: '#Cor...,,,,,,,,,,


In [46]:
df.drop(110181, axis=0, inplace=True).reset_index()
df.shape

AttributeError: 'NoneType' object has no attribute 'reset_index'

In [40]:
# Check extra row is gon
df[df.user.isna()]

Unnamed: 0,created_at,id_str,full_text,display_text_range,entities,source,user,is_quote_status,retweet_count,favorite_count,possibly_sensitive,lang,retweeted_status
