In [1]:
from time import time
import pandas as pd
from IPython.display import display # use display() instead of print to make DataFrames look pretty

In [2]:
# Build a small table in which two of the five rows are entirely missing (NaN).
nan = float("nan")
table = [
    [1, 2],
    [nan, nan],
    [5, 6],
    [nan, nan],
    [9, 10],
]
# Wrap the nested list in a DataFrame whose columns are labelled A and B.
# (See "10 minutes to pandas" or the DataFrame documentation for help.)
df = pd.DataFrame(data=table, columns=["A", "B"])
display(df)

Unnamed: 0,A,B
0,1.0,2.0
1,,
2,5.0,6.0
3,,
4,9.0,10.0


In [3]:
# Drop the all-NaN rows with Boolean (logical) indexing.
# See "10 minutes to pandas" and the pandas notnull documentation for help.
# pd.notnull is a module-level function: give it a Series (here column A) and
# it returns a Boolean Series that is True wherever a value is present.
# Indexing with that mask does not modify the original DataFrame; it produces
# a new one, so the result has to be assigned back to a name to be kept.
df = df.loc[pd.notnull(df["A"])]
display(df)

Unnamed: 0,A,B
0,1.0,2.0
2,5.0,6.0
4,9.0,10.0


In [4]:
# Promote column A to be the index (see the pandas set_index documentation).
# set_index is a method: it is called on the DataFrame itself rather than
# receiving the DataFrame as an argument.
# As called here it leaves the original DataFrame untouched and returns a new
# one, which is why the result is assigned back to df.
df = df.set_index(keys="A")
display(df)

Unnamed: 0_level_0,B
A,Unnamed: 1_level_1
1.0,2.0
5.0,6.0
9.0,10.0


In [5]:
# time() reports the current time as a float: the number of seconds elapsed
# since the Unix epoch (January 1, 1970).
t = time()
print(t)

1615721487.6886802


In [6]:
# Turn the epoch timestamp into a pandas Timestamp with pd.to_datetime.
# unit="s" tells pandas that t counts seconds.
# For later time zone conversion the Timestamp must be timezone-aware, i.e. it
# must know that it represents a UTC time.
# utc=False is used here on purpose: the value printed below is for a
# Timestamp that is NOT timezone-aware (note the missing UTC offset).
dt = pd.to_datetime(t, unit="s", utc=False)
print(dt)

2021-03-14 11:31:27.688680172


In [7]:
# tz_convert translates a timezone-AWARE Timestamp into a different time zone.
# List of timezones: https://en.wikipedia.org/wiki/List_of_tz_database_time_zones
# dt was built without timezone information (it does not know it is in UTC),
# so pandas refuses the conversion and raises TypeError.
# That failure is the point of this cell, so it is caught and printed instead
# of being left to abort a top-to-bottom run of the notebook.
try:
    dt2 = dt.tz_convert("US/Pacific")
    print(dt2)
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

TypeError: Cannot convert tz-naive Timestamp, use tz_localize to localize

In [8]:
# Same conversion as before, but with utc=True the resulting Timestamp IS
# timezone-aware: the printed value carries an explicit +00:00 UTC offset.
dt3 = pd.to_datetime(t, unit="s", utc=True)
print(dt3)

2021-03-14 11:31:27.688680172+00:00


In [9]:
# dt3 already knows it is in UTC, so tz_convert only needs the zone to
# convert TO.
# Here the target is US Pacific time; in the run recorded below the result
# carries a -07:00 offset, i.e. seven hours behind UTC.
dt4 = dt3.tz_convert("US/Pacific")
print(dt4)

2021-03-14 04:31:27.688680172-07:00


In [10]:
# Sample data to work with: one timestamp every six hours from 2013-01-01 00:00
# through 2013-01-06 00:00 (21 timestamps), paired with the integers 0..20.
# The hour alias is written in lowercase ("6h") because the uppercase "H"
# spelling is deprecated since pandas 2.2.
dates = pd.date_range("20130101", "20130106", freq="6h")
numbers = list(range(21))
df = pd.DataFrame(numbers, index=dates, columns=["A"])
display(df)

Unnamed: 0,A
2013-01-01 00:00:00,0
2013-01-01 06:00:00,1
2013-01-01 12:00:00,2
2013-01-01 18:00:00,3
2013-01-02 00:00:00,4
2013-01-02 06:00:00,5
2013-01-02 12:00:00,6
2013-01-02 18:00:00,7
2013-01-03 00:00:00,8
2013-01-03 06:00:00,9


In [11]:
# Use the DataFrame's asfreq method to re-sample onto a 9-hour grid (the data
# was generated at 6-hour intervals).
# asfreq does not compute in-between values. For a new timestamp that has no
# matching row, method="pad" (forward fill) carries the most recent earlier
# observation forward.
# The hour alias is written in lowercase ("9h") because the uppercase "H"
# spelling is deprecated since pandas 2.2.
df = df.asfreq("9h", method="pad")
display(df)

Unnamed: 0,A
2013-01-01 00:00:00,0
2013-01-01 09:00:00,1
2013-01-01 18:00:00,3
2013-01-02 03:00:00,4
2013-01-02 12:00:00,6
2013-01-02 21:00:00,7
2013-01-03 06:00:00,9
2013-01-03 15:00:00,10
2013-01-04 00:00:00,12
2013-01-04 09:00:00,13


In [12]:
# Derive a new column B holding the remainder of A divided by two:
# 0 marks an even value in A, 1 marks an odd value.
df["B"] = df["A"] % 2
display(df)

Unnamed: 0,A,B
2013-01-01 00:00:00,0,0
2013-01-01 09:00:00,1,1
2013-01-01 18:00:00,3,1
2013-01-02 03:00:00,4,0
2013-01-02 12:00:00,6,0
2013-01-02 21:00:00,7,1
2013-01-03 06:00:00,9,1
2013-01-03 15:00:00,10,0
2013-01-04 00:00:00,12,0
2013-01-04 09:00:00,13,1


In [13]:
# Split the rows by the value in column B using the DataFrame groupby method.
# Iterating over the GroupBy object yields (key, sub-DataFrame) pairs, so the
# loop displays two separate DataFrames: the rows in which B is 0, then the
# rows in which B is 1.
gb = df.groupby(by="B")
for _key, rows in gb:
    display(rows)

Unnamed: 0,A,B
2013-01-01 00:00:00,0,0
2013-01-02 03:00:00,4,0
2013-01-02 12:00:00,6,0
2013-01-03 15:00:00,10,0
2013-01-04 00:00:00,12,0
2013-01-05 03:00:00,16,0
2013-01-05 12:00:00,18,0


Unnamed: 0,A,B
2013-01-01 09:00:00,1,1
2013-01-01 18:00:00,3,1
2013-01-02 21:00:00,7,1
2013-01-03 06:00:00,9,1
2013-01-04 09:00:00,13,1
2013-01-04 18:00:00,15,1
2013-01-05 21:00:00,19,1


In [14]:
# Use the GroupBy object's aggregate method to calculate the sum of column A
# within each group.
# aggregate applies the requested reduction separately to each group, then
# reports one row per group.
# The sum of all the entries in column A of group 0 is 66
# The sum of all the entries in column A of group 1 is 67
# The reduction is named by the string "sum" rather than by passing Python's
# built-in sum, which recent pandas versions warn about.
gb.aggregate("sum")

Unnamed: 0_level_0,A
B,Unnamed: 1_level_1
0,66
1,67


In [15]:
# Collect the sub-DataFrame of every group into a list, discarding the keys.
# The groups are now two genuinely separate DataFrames: a[0] and a[1].
gb = df.groupby(by="B")
a = [rows for _key, rows in gb]
display(a[0], a[1])

Unnamed: 0,A,B
2013-01-01 00:00:00,0,0
2013-01-02 03:00:00,4,0
2013-01-02 12:00:00,6,0
2013-01-03 15:00:00,10,0
2013-01-04 00:00:00,12,0
2013-01-05 03:00:00,16,0
2013-01-05 12:00:00,18,0


Unnamed: 0,A,B
2013-01-01 09:00:00,1,1
2013-01-01 18:00:00,3,1
2013-01-02 21:00:00,7,1
2013-01-03 06:00:00,9,1
2013-01-04 09:00:00,13,1
2013-01-04 18:00:00,15,1
2013-01-05 21:00:00,19,1


In [16]:
# Stitch the per-group DataFrames back into a single DataFrame with concat.
# The pieces are stacked in list order, so every B == 0 row comes before the
# B == 1 rows (the index is no longer in chronological order).
df = pd.concat(objs=a)
display(df)

Unnamed: 0,A,B
2013-01-01 00:00:00,0,0
2013-01-02 03:00:00,4,0
2013-01-02 12:00:00,6,0
2013-01-03 15:00:00,10,0
2013-01-04 00:00:00,12,0
2013-01-05 03:00:00,16,0
2013-01-05 12:00:00,18,0
2013-01-01 09:00:00,1,1
2013-01-01 18:00:00,3,1
2013-01-02 21:00:00,7,1
