In [1]:
# pandas.pydate.org
# mybinder.org 
#numfocus.org

In [41]:
import pandas as pd
import numpy as np

### pandas module 

In [42]:
# Dictionary with raw data
data = {'day_of_year': [1,2,3,4,5],
       'wind_speed': [2.2, 3.2, -9999.0, 4.1, 2.9],
       'wind_direction': ['E', 'NW', 'NW', 'N', 'S'],
       'precipitation': [0,18,25, 2, 0]}

In [43]:
# Create DataFrame
df = pd.DataFrame(data)

In [44]:
print(df)

   day_of_year  wind_speed wind_direction  precipitation
0            1         2.2              E              0
1            2         3.2             NW             18
2            3     -9999.0             NW             25
3            4         4.1              N              2
4            5         2.9              S              0


In [45]:
df.head()

Unnamed: 0,day_of_year,wind_speed,wind_direction,precipitation
0,1,2.2,E,0
1,2,3.2,NW,18
2,3,-9999.0,NW,25
3,4,4.1,N,2
4,5,2.9,S,0


In [46]:
df.tail(3) #last 3 roles 

Unnamed: 0,day_of_year,wind_speed,wind_direction,precipitation
2,3,-9999.0,NW,25
3,4,4.1,N,2
4,5,2.9,S,0


In [47]:
df.columns

Index(['day_of_year', 'wind_speed', 'wind_direction', 'precipitation'], dtype='object')

In [48]:
#for columns in df.columns

In [49]:
df.dtypes

day_of_year         int64
wind_speed        float64
wind_direction     object
precipitation       int64
dtype: object

In [50]:
df.size # total number of elements in dataframe

20

In [51]:
df.shape #no pararenthesis bc it's an attribute not a method

(5, 4)

In [89]:
df.shape[0]

5

In [53]:
indx_missing = df.isin([-9999.0])
print(indx_missing)

   day_of_year  wind_speed  wind_direction  precipitation
0        False       False           False          False
1        False       False           False          False
2        False        True           False          False
3        False       False           False          False
4        False       False           False          False


In [54]:
df["wind_speed"] == -9999.0

0    False
1    False
2     True
3    False
4    False
Name: wind_speed, dtype: bool

In [55]:
type(np.nan) #not a number

float

In [56]:
df[indx_missing] = np.nan  #logical slicing 
df 

Unnamed: 0,day_of_year,wind_speed,wind_direction,precipitation
0,1,2.2,E,0
1,2,3.2,NW,18
2,3,,NW,25
3,4,4.1,N,2
4,5,2.9,S,0


In [57]:
df.describe()

Unnamed: 0,day_of_year,wind_speed,precipitation
count,5.0,4.0,5.0
mean,3.0,3.1,9.0
std,1.581139,0.787401,11.7047
min,1.0,2.2,0.0
25%,2.0,2.725,0.0
50%,3.0,3.05,2.0
75%,4.0,3.425,18.0
max,5.0,4.1,25.0


In [60]:
print(df['wind_speed'].mean())
print(df['wind_speed'].min())
print(df['wind_speed'].max())
print(df['wind_speed'].std())
print(df['wind_speed'].median())
print(df['wind_speed'].quantile(0.5))

3.1
2.2
4.1
0.7874007874011809
3.05
3.05


In [62]:
df['precipitation'].cumsum() #calling a data #string 

0     0
1    18
2    43
3    45
4    45
Name: precipitation, dtype: int64

In [63]:
df.precipitation.cumsum() # not as clear

0     0
1    18
2    43
3    45
4    45
Name: precipitation, dtype: int64

In [64]:
df['wind_direction'].unique()

array(['E', 'NW', 'N', 'S'], dtype=object)

In [66]:
#select roles
df[0:3] #does not include the last value 

Unnamed: 0,day_of_year,wind_speed,wind_direction,precipitation
0,1,2.2,E,0
1,2,3.2,NW,18
2,3,,NW,25


In [68]:
#slect columns
df[['wind_speed','wind_direction']]

Unnamed: 0,wind_speed,wind_direction
0,2.2,E
1,3.2,NW
2,,NW
3,4.1,N
4,2.9,S


In [72]:
#interger loc or iloc notation (here we ONLY use the row and column number)

df.iloc[0:3, 2:4]


Unnamed: 0,day_of_year,precipitation
0,1,0
1,2,18
2,3,25


In [73]:
#non-inclusive in call

df.iloc[0:3, [0,3]]

Unnamed: 0,day_of_year,precipitation
0,1,0
1,2,18
2,3,25


In [74]:
#non-inclusive in call

df.iloc[:,2]

0     E
1    NW
2    NW
3     N
4     S
Name: wind_direction, dtype: object

In [75]:
#location or loc (here we use the column names)

df.loc[0:3,['wind_speed','precipitation']]

Unnamed: 0,wind_speed,precipitation
0,2.2,0
1,3.2,18
2,,25
3,4.1,2


In [78]:
indx_wind = df['wind_speed'] > 3
indx_wind

0    False
1     True
2    False
3     True
4    False
Name: wind_speed, dtype: bool

In [80]:
df.loc[indx_wind,'precipitation']

1    18
3     2
Name: precipitation, dtype: int64

In [90]:
dates = pd.date_range('20200101', periods = df.shape[0], freq = 'm')
dates

DatetimeIndex(['2020-01-31', '2020-02-29', '2020-03-31', '2020-04-30',
               '2020-05-31'],
              dtype='datetime64[ns]', freq='M')

In [91]:
df.insert(0,'dates',dates)
df

Unnamed: 0,dates,day_of_year,wind_speed,wind_direction,precipitation
0,2020-01-31,1,2.2,E,0
1,2020-02-29,2,3.2,NW,18
2,2020-03-31,3,,NW,25
3,2020-04-30,4,4.1,N,2
4,2020-05-31,5,2.9,S,0


In [92]:
df.drop(columns=['dates'])

Unnamed: 0,day_of_year,wind_speed,wind_direction,precipitation
0,1,2.2,E,0
1,2,3.2,NW,18
2,3,,NW,25
3,4,4.1,N,2
4,5,2.9,S,0
