In [3]:
import pandas as pd
import numpy as np

# Creating DataFrame

In [26]:
df = pd.DataFrame({
        'one': pd.Series(np.random.randn(3), index=['a', 'b', 'c']),
        'two': pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']),
        'three': pd.Series(np.random.randn(3), index=['b', 'c', 'd'])})
df

Unnamed: 0,one,two,three
a,0.246619,-0.570219,
b,0.67631,0.482506,-0.150805
c,-1.317104,0.135745,-1.575886
d,,-0.262067,-0.540236


In [27]:
df2 = df.copy()
df.gt(df2)

Unnamed: 0,one,two,three
a,False,False,False
b,False,False,False
c,False,False,False
d,False,False,False


In [28]:
# np.nan == np.nan returns False.
df2.ne(df)

Unnamed: 0,one,two,three
a,False,False,True
b,False,False,False
c,False,False,False
d,True,False,False


# Boolean Comparisons
- eq (equivalent to ==) — equals to
- ne (equivalent to !=) — not equals to
- le (equivalent to <=) — less than or equals to
- lt (equivalent to <) — less than
- ge (equivalent to >=) — greater than or equals to
- gt (equivalent to >) — greater tha

In [29]:
(df > 0).all()

one      False
two      False
three    False
dtype: bool

In [30]:
(df > 0).any()

one       True
two       True
three    False
dtype: bool

In [31]:
(df > 0).any().any()

True

In [32]:
(df > 0).any().all()

False

In [33]:
pd.Series(['foo', 'bar', 'baz']) == 'foo'


0     True
1    False
2    False
dtype: bool

In [34]:
pd.Series(['foo', 'bar', 'baz']) == pd.Index(['foo', 'bar', 'qux'])

0     True
1     True
2    False
dtype: bool

### Create a DataFrame by passing a Numpy array with a datetime index and labeled columns.

In [64]:
dates = pd.date_range('20130101', periods=6)
print(dates)
type(dates)

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')


pandas.core.indexes.datetimes.DatetimeIndex

In [40]:
dates = pd.date_range('20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [41]:
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.134772,0.670491,0.614065,-0.079269
2013-01-02,-0.51516,0.29328,-0.621966,0.592382
2013-01-03,0.852644,0.726139,-0.115264,-1.664506
2013-01-04,0.338062,1.170494,0.068594,-0.232622
2013-01-05,-0.874027,-0.31534,-0.515381,-0.734613
2013-01-06,0.756373,-0.255169,0.692513,-0.931905


### Create a DataFrame from a dict of objects

In [46]:
df2 = pd.DataFrame({'A': 1.0,
                        'B': pd.Timestamp('20130102'),
                        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
                        'D': np.array([3] * 4, dtype='int32'),
                        'E': pd.Categorical(["test", "train", "test", "train"]),
                        'F': 'foo'})
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [47]:
 df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [49]:
df2.columns

Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

In [50]:
df['A'] #or df.A

2013-01-01   -0.134772
2013-01-02   -0.515160
2013-01-03    0.852644
2013-01-04    0.338062
2013-01-05   -0.874027
2013-01-06    0.756373
Freq: D, Name: A, dtype: float64

In [52]:
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-0.134772,0.670491,0.614065,-0.079269
2013-01-02,-0.51516,0.29328,-0.621966,0.592382
2013-01-03,0.852644,0.726139,-0.115264,-1.664506


In [53]:
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,-0.51516,0.29328,-0.621966,0.592382
2013-01-03,0.852644,0.726139,-0.115264,-1.664506
2013-01-04,0.338062,1.170494,0.068594,-0.232622


### Convert DataFrame to numpy array.

In [69]:
df = pd.DataFrame({
        'one': pd.Series(np.random.randn(3), index=['a', 'b', 'c']),
        'two': pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']),
        'three': pd.Series(np.random.randn(3), index=['b', 'c', 'd'])})
df

Unnamed: 0,one,two,three
a,-0.099115,-0.223328,
b,0.317007,2.491321,-0.159752
c,1.119023,-0.013469,0.293467
d,,0.092789,0.300903


In [70]:
df.to_numpy()

array([[-0.09911548, -0.22332781,         nan],
       [ 0.31700675,  2.49132077, -0.15975224],
       [ 1.11902297, -0.01346897,  0.2934674 ],
       [        nan,  0.0927889 ,  0.30090333]])

Tranpose your data

In [71]:
df.T

Unnamed: 0,a,b,c,d
one,-0.099115,0.317007,1.119023,
two,-0.223328,2.491321,-0.013469,0.092789
three,,-0.159752,0.293467,0.300903


In [79]:
#sort by column name
df.sort_index(axis=1, ascending=False)

Unnamed: 0,two,three,one
a,-0.223328,,-0.099115
b,2.491321,-0.159752,0.317007
c,-0.013469,0.293467,1.119023
d,0.092789,0.300903,


In [80]:
#sort by row name
df.sort_index(axis=0, ascending=False)

Unnamed: 0,one,two,three
d,,0.092789,0.300903
c,1.119023,-0.013469,0.293467
b,0.317007,2.491321,-0.159752
a,-0.099115,-0.223328,


In [83]:
df.sort_values(by='three')

Unnamed: 0,one,two,three
b,0.317007,2.491321,-0.159752
c,1.119023,-0.013469,0.293467
d,,0.092789,0.300903
a,-0.099115,-0.223328,


# how to access the data in Pandas

Cheat Sheet:
![Pandas cheat-sheet](/pics/Get-values-DataFrame.png)

## Selection by Label

In [54]:
df.loc["2013-01-01"]

A   -0.134772
B    0.670491
C    0.614065
D   -0.079269
Name: 2013-01-01 00:00:00, dtype: float64

In [55]:
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2013-01-01,-0.134772,0.670491
2013-01-02,-0.51516,0.29328
2013-01-03,0.852644,0.726139
2013-01-04,0.338062,1.170494
2013-01-05,-0.874027,-0.31534
2013-01-06,0.756373,-0.255169


In [56]:
df.loc['20130102':'20130104', ['A', 'B']]

Unnamed: 0,A,B
2013-01-02,-0.51516,0.29328
2013-01-03,0.852644,0.726139
2013-01-04,0.338062,1.170494


In [57]:
df.loc['20130102', ['A', 'B']]

A   -0.51516
B    0.29328
Name: 2013-01-02 00:00:00, dtype: float64

In [59]:
df.loc[dates[0], 'A']

-0.13477219424408182

In [60]:
df.iloc[3]

A    0.338062
B    1.170494
C    0.068594
D   -0.232622
Name: 2013-01-04 00:00:00, dtype: float64

In [63]:
df.iloc[3,2]

0.0685936682504372

In [61]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,0.338062,1.170494
2013-01-05,-0.874027,-0.31534


In [62]:
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D
2013-01-02,-0.51516,0.29328,-0.621966,0.592382
2013-01-03,0.852644,0.726139,-0.115264,-1.664506


## Selection by dtype

In [84]:
df = pd.DataFrame({'string': list('abc'),
                       'int64': list(range(1, 4)),
                       'uint8': np.arange(3, 6).astype('u1'),
                       'float64': np.arange(4.0, 7.0),
                       'bool1': [True, False, True],
                       'bool2': [False, True, False],
                       'dates': pd.date_range('now', periods=3),
                       'category': pd.Series(list("ABC")).astype('category')})
df

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
0,a,1,3,4.0,True,False,2023-02-19 02:00:09.667609,A
1,b,2,4,5.0,False,True,2023-02-20 02:00:09.667609,B
2,c,3,5,6.0,True,False,2023-02-21 02:00:09.667609,C


In [85]:
df.select_dtypes(include=[bool])

Unnamed: 0,bool1,bool2
0,True,False
1,False,True
2,True,False


## Boolean Indexing

In [86]:
df[df['float64'] >= 5]

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
1,b,2,4,5.0,False,True,2023-02-20 02:00:09.667609,B
2,c,3,5,6.0,True,False,2023-02-21 02:00:09.667609,C


In [88]:
df2 = df.copy()
df2

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category
0,a,1,3,4.0,True,False,2023-02-19 02:00:09.667609,A
1,b,2,4,5.0,False,True,2023-02-20 02:00:09.667609,B
2,c,3,5,6.0,True,False,2023-02-21 02:00:09.667609,C


### Create a new column

In [89]:
df2['E'] = ['one', 'two', 'three']
df2

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category,E
0,a,1,3,4.0,True,False,2023-02-19 02:00:09.667609,A,one
1,b,2,4,5.0,False,True,2023-02-20 02:00:09.667609,B,two
2,c,3,5,6.0,True,False,2023-02-21 02:00:09.667609,C,three


In [90]:
#Now we can use function isin() to take only rows where E is one or two.

df2[df2['E'].isin(['one','two'])]

Unnamed: 0,string,int64,uint8,float64,bool1,bool2,dates,category,E
0,a,1,3,4.0,True,False,2023-02-19 02:00:09.667609,A,one
1,b,2,4,5.0,False,True,2023-02-20 02:00:09.667609,B,two
