In [1]:
import pandas as pd

In [2]:
# load input JSON file into a dataframe
df = pd.read_json('AOC_recent_tweets.txt')
print (df)

                    created_at                   id               id_str  \
0    2021-02-06 20:22:38+00:00  1358149122264563712  1358149122264563712   
1    2021-02-06 20:16:39+00:00  1358147616400408576  1358147616400408576   
2    2021-02-06 20:07:35+00:00  1358145332316667909  1358145332316667904   
3    2021-02-06 20:07:07+00:00  1358145218407759875  1358145218407759872   
4    2021-02-06 20:03:06+00:00  1358144207333036040  1358144207333036032   
...                        ...                  ...                  ...   
3242 2019-10-09 14:14:19+00:00  1181935928249606146  1181935928249606144   
3243 2019-10-09 14:00:54+00:00  1181932554552827905  1181932554552827904   
3244 2019-10-09 14:00:32+00:00  1181932460516478976  1181932460516478976   
3245 2019-10-09 13:41:17+00:00  1181927615340453899  1181927615340453888   
3246 2019-10-09 05:32:34+00:00  1181804625588051968  1181804625588051968   

                                              full_text  truncated  \
0     RT @RepEsco

In [3]:
# set the 'id' column as index. Without this, pandas assigns a auto-incrementing integer index,
# as seen in the previous print command
df.set_index('id', inplace=True)
print(df)

                                   created_at               id_str  \
id                                                                   
1358149122264563712 2021-02-06 20:22:38+00:00  1358149122264563712   
1358147616400408576 2021-02-06 20:16:39+00:00  1358147616400408576   
1358145332316667909 2021-02-06 20:07:35+00:00  1358145332316667904   
1358145218407759875 2021-02-06 20:07:07+00:00  1358145218407759872   
1358144207333036040 2021-02-06 20:03:06+00:00  1358144207333036032   
...                                       ...                  ...   
1181935928249606146 2019-10-09 14:14:19+00:00  1181935928249606144   
1181932554552827905 2019-10-09 14:00:54+00:00  1181932554552827904   
1181932460516478976 2019-10-09 14:00:32+00:00  1181932460516478976   
1181927615340453899 2019-10-09 13:41:17+00:00  1181927615340453888   
1181804625588051968 2019-10-09 05:32:34+00:00  1181804625588051968   

                                                             full_text  \
id             

In [4]:
# when pandas loads the input JSON file, the created_at column is already converted
# to datetime.datetime type, so there's no need to do conversion from string
def time_in_hours(dt):
    return dt.hour + dt.minute/60 + dt.second/(60*60)

In [5]:
#extract column 'created_at'
created_at = df['created_at']
print(created_at)
type(created_at)

id
1358149122264563712   2021-02-06 20:22:38+00:00
1358147616400408576   2021-02-06 20:16:39+00:00
1358145332316667909   2021-02-06 20:07:35+00:00
1358145218407759875   2021-02-06 20:07:07+00:00
1358144207333036040   2021-02-06 20:03:06+00:00
                                 ...           
1181935928249606146   2019-10-09 14:14:19+00:00
1181932554552827905   2019-10-09 14:00:54+00:00
1181932460516478976   2019-10-09 14:00:32+00:00
1181927615340453899   2019-10-09 13:41:17+00:00
1181804625588051968   2019-10-09 05:32:34+00:00
Name: created_at, Length: 3247, dtype: datetime64[ns, UTC]


pandas.core.series.Series

In [6]:
# apply function to every element in series 'created_at' and save it to 'hours'
hours = created_at.apply(time_in_hours)
print(hours)
type(hours)

id
1358149122264563712    20.377222
1358147616400408576    20.277500
1358145332316667909    20.126389
1358145218407759875    20.118611
1358144207333036040    20.051667
                         ...    
1181935928249606146    14.238611
1181932554552827905    14.015000
1181932460516478976    14.008889
1181927615340453899    13.688056
1181804625588051968     5.542778
Name: created_at, Length: 3247, dtype: float64


pandas.core.series.Series

In [7]:
# add series 'hours' to df as column 'hours'
df['hours'] = hours
print(df)

                                   created_at               id_str  \
id                                                                   
1358149122264563712 2021-02-06 20:22:38+00:00  1358149122264563712   
1358147616400408576 2021-02-06 20:16:39+00:00  1358147616400408576   
1358145332316667909 2021-02-06 20:07:35+00:00  1358145332316667904   
1358145218407759875 2021-02-06 20:07:07+00:00  1358145218407759872   
1358144207333036040 2021-02-06 20:03:06+00:00  1358144207333036032   
...                                       ...                  ...   
1181935928249606146 2019-10-09 14:14:19+00:00  1181935928249606144   
1181932554552827905 2019-10-09 14:00:54+00:00  1181932554552827904   
1181932460516478976 2019-10-09 14:00:32+00:00  1181932460516478976   
1181927615340453899 2019-10-09 13:41:17+00:00  1181927615340453888   
1181804625588051968 2019-10-09 05:32:34+00:00  1181804625588051968   

                                                             full_text  \
id             

In [8]:
# new dataframe with select columns from df
df2 = df[['created_at', 'hours', 'full_text']]
print(df2)

                                   created_at      hours  \
id                                                         
1358149122264563712 2021-02-06 20:22:38+00:00  20.377222   
1358147616400408576 2021-02-06 20:16:39+00:00  20.277500   
1358145332316667909 2021-02-06 20:07:35+00:00  20.126389   
1358145218407759875 2021-02-06 20:07:07+00:00  20.118611   
1358144207333036040 2021-02-06 20:03:06+00:00  20.051667   
...                                       ...        ...   
1181935928249606146 2019-10-09 14:14:19+00:00  14.238611   
1181932554552827905 2019-10-09 14:00:54+00:00  14.015000   
1181932460516478976 2019-10-09 14:00:32+00:00  14.008889   
1181927615340453899 2019-10-09 13:41:17+00:00  13.688056   
1181804625588051968 2019-10-09 05:32:34+00:00   5.542778   

                                                             full_text  
id                                                                      
1358149122264563712  RT @RepEscobar: Our country has the moral obli...  


In [9]:
# Save df2 to CSV file, without the 'id' index
df2.to_csv('AOC_recent_tweets.csv', index=False)