# Data Creation
There are several ways to create a DataFrame. The examples below build one from a list of rows and then name the columns:

In [3]:
import pandas as pd

# Build a 2x4 DataFrame: each inner list is one row, and the column
# labels are supplied directly to the constructor.
a = [1, 2, 3, 4]
b = [21, 22, 23, 24]
name = ["C1", "C2", "C3", "C4"]
df1 = pd.DataFrame(data=[a, b], columns=name)
df1

Unnamed: 0,C1,C2,C3,C4
0,1,2,3,4
1,21,22,23,24


or, with a mix of numeric and string columns:

In [46]:
# Rows may mix types: the first column holds integers, the other two strings.
a = [1, "a", "a1"]
b = [21, "b", "b1"]
c = [31, "c", "c1"]
name = ["C1", "D1", "D2"]
df2 = pd.DataFrame(data=[a, b, c], columns=name)
df2

Unnamed: 0,C1,D1,D2
0,1,a,a1
1,21,b,b1
2,31,c,c1


In [47]:
# A 3x3 all-integer frame; it is reused by the selection and filtering cells.
a = [1, 11, 111]
b = [21, 22, 222]
c = [41, 44, 444]
name = ["E1", "E2", "E3"]
df3 = pd.DataFrame(data=[a, b, c], columns=name)
df3

Unnamed: 0,E1,E2,E3
0,1,11,111
1,21,22,222
2,41,44,444


In [180]:
# The last row is one value short, so the frame gets a missing value (NaN)
# in column F3 for that row; this frame is used later to demonstrate notna().
a = [1, 11, 111]
b = [21, 22, 222]
c = [51, 52]
name = ["F1", "F2", "F3"]
df4 = pd.DataFrame(data=[a, b, c], columns=name)
df4

Unnamed: 0,F1,F2,F3
0,1,11,111.0
1,21,22,222.0
2,51,52,


# Basic Properties of the Data
The number of rows and columns of the dataframe:

In [118]:
# Show the frame, a separator, then its (rows, columns) tuple, one item per line.
print(df1, '-------------', df1.shape, sep='\n')

   C1  C2  C3  C4
0   1   2   3   4
1  21  22  23  24
-------------
(2, 4)


Get more detailed information about the data structure:

In [69]:
# info() prints the index range, each column's non-null count and dtype,
# and the frame's memory usage.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   C1      2 non-null      int64
 1   C2      2 non-null      int64
 2   C3      2 non-null      int64
 3   C4      2 non-null      int64
dtypes: int64(4)
memory usage: 192.0 bytes


Get basic summary statistics for each column:

In [68]:
# describe() tabulates count, mean, std, min, the 25%/50%/75% quantiles and max
# for every column of df1.
df1.describe()

Unnamed: 0,C1,C2,C3,C4
count,2.0,2.0,2.0,2.0
mean,11.0,12.0,13.0,14.0
std,14.142136,14.142136,14.142136,14.142136
min,1.0,2.0,3.0,4.0
25%,6.0,7.0,8.0,9.0
50%,11.0,12.0,13.0,14.0
75%,16.0,17.0,18.0,19.0
max,21.0,22.0,23.0,24.0


# Select Data
Select a column:

In [124]:
# Two equivalent ways to pull one column out of the frame as a Series.
print(
    df3,
    '--------',
    df3['E1'],  # bracket indexing by column label
    '--------',
    df3.E1,     # attribute access with the same column label
    sep='\n',
)

   E1  E2   E3
0   1  11  111
1  21  22  222
2  41  44  444
--------
0     1
1    21
2    41
Name: E1, dtype: int64
--------
0     1
1    21
2    41
Name: E1, dtype: int64


Select a row:

In [94]:
# A plain slice inside [] selects rows; 1:2 yields only the row at
# position 1 because the end of the slice is excluded.
print(df3, '--------', df3[1:2], sep='\n')

   E1  E2   E3
0   1  11  111
1  21  22  222
2  41  44  444
--------
   E1  E2   E3
1  21  22  222


Subset of the data:

In [113]:
print(
    df3,
    '--------',
    df3.iloc[0:2, 1:2],  # positional: rows 0 and 1, column 1 (slice ends are excluded)
    '--------',
    sep='\n',
)
# .loc is label-based, not positional: the slice 1:2 selects the rows whose
# index labels run from 1 through 2, and the end label IS included.
df3.loc[1:2, ['E1', 'E3']]

   E1  E2   E3
0   1  11  111
1  21  22  222
2  41  44  444
--------
   E2
0  11
1  22
--------


Unnamed: 0,E1,E3
1,21,222
2,41,444


Specific item in the dataframe:

In [181]:
print(
    df3,
    '--------',
    df3.iloc[1, 1],  # positional lookup: row position 1, column position 1
    '--------',
    sep='\n',
)
# Label-based lookup of a single cell: row label 1, column label 'E2'.
# df3's row labels are 0, 1, 2, so this is the same cell as iloc[1, 1].
df3.loc[1, 'E2']

   E1  E2   E3
0   1  11  111
1  21  22  222
2  41  44  444
--------
22
--------


22

# Manipulation of Data

In [141]:
# Work on a deep copy so df1 itself is left untouched; a plain assignment
# (df_tmp = df1) would only bind a second name to the same object.
df_tmp = df1.copy(deep=True)
print(df_tmp)
# Add one column: one value per existing row.
df_tmp = df_tmp.assign(C5=[5, 25])
print(df_tmp)
# Drop the scratch frame so it does not linger in the kernel namespace.
del df_tmp

   C1  C2  C3  C4
0   1   2   3   4
1  21  22  23  24
   C1  C2  C3  C4  C5
0   1   2   3   4   5
1  21  22  23  24  25


In [156]:
# Work on a copy so df1 itself is left untouched.
df_tmp = df1.copy()
print(df_tmp)
# Add one row. The slice assignment tried originally (df_tmp[2:3] = [1, 2, 3, 4])
# cannot do this: the frame only has row positions 0 and 1, so the slice 2:3
# selects no existing rows. Assigning through .loc to a label that does not
# exist yet enlarges the frame instead; len(df_tmp) is the next unused label
# of the default 0..n-1 index, so df_tmp gains a row labelled 2.
df_tmp.loc[len(df_tmp)] = [1, 2, 3, 4]

   C1  C2  C3  C4
0   1   2   3   4
1  21  22  23  24


# Filtering Data

Specifying conditions for filtering data:

In [187]:
# Filter rows with a boolean mask: df3[mask] keeps the rows where mask is True.
print(
    df3,
    '------------------',
    df3[df3['E2'] > 20],              # rows whose E2 value is greater than 20
    '------------------',
    df3[df3['E2'].isin([11, 44])],    # rows whose E2 value is one of the listed values
    sep='\n',
)

   E1  E2   E3
0   1  11  111
1  21  22  222
2  41  44  444
------------------
   E1  E2   E3
1  21  22  222
2  41  44  444
------------------
   E1  E2   E3
0   1  11  111
2  41  44  444


Remove rows that contain missing ("not available", NaN) values:

In [189]:
# notna() is True where a value is present (i.e. not NA / not missing),
# so the mask keeps only the rows that have a value in F3.
print(df4, '--------', df4[df4['F3'].notna()], sep='\n')

   F1  F2     F3
0   1  11  111.0
1  21  22  222.0
2  51  52    NaN
--------
   F1  F2     F3
0   1  11  111.0
1  21  22  222.0


# Combination of Dataframes
## Join
`data_frame_a.join(data_frame_b)` adds the columns of `data_frame_b` to `data_frame_a`, matching rows by their index:

In [48]:
print(df1, df2, sep='\n')
# join() matches rows by index label and, as a left join, keeps only df1's rows.
# Both frames have a column named C1, so suffixes are required to tell the two
# apart; omitting them raises an error. (The original note observes that passing
# the same suffix for both sides is accepted, too.)
df1.join(df2, how='left', lsuffix='_left', rsuffix='_right')

   C1  C2  C3  C4
0   1   2   3   4
1  21  22  23  24
   C1 D1  D2
0   1  a  a1
1  21  b  b1
2  31  c  c1


Unnamed: 0,C1_left,C2,C3,C4,C1_right,D1,D2
0,1,2,3,4,1,a,a1
1,21,22,23,24,21,b,b1


## Merge


In [43]:
# NOTE(review): the saved output of this cell shows df1/df2 values that differ
# from the ones defined at the top of the notebook (the execution counts are out
# of order) -- re-run the notebook top to bottom to refresh it.
print(df1, df2, sep='\n')
# With no key given, merge() joins on the columns the two frames share (only C1
# here) and keeps just the key values found in both frames (inner join). Both
# defaults are spelled out explicitly.
df1.merge(df2, how='inner', on='C1')

   C1  C2  C3  C4
0  11  22  33  44
1  91  92  93  94
   C1 D1  D2
0  11  a  a1
1  91  b  b1
2  11  c  c1


Unnamed: 0,C1,C2,C3,C4,D1,D2
0,11,22,33,44,a,a1
1,11,22,33,44,c,c1
2,91,92,93,94,b,b1


## Concatenate

In [44]:
# NOTE(review): the saved output of this cell shows df1/df2 values that differ
# from the ones defined at the top of the notebook (the execution counts are out
# of order) -- re-run the notebook top to bottom to refresh it.
print(df1, df2, sep='\n')
# Stack the rows of df2 underneath those of df1. Each frame keeps its own index
# labels, and a column missing from one frame is filled with NaN for its rows.
pd.concat([df1, df2], axis='index')

   C1  C2  C3  C4
0  11  22  33  44
1  91  92  93  94
   C1 D1  D2
0  11  a  a1
1  91  b  b1
2  11  c  c1


Unnamed: 0,C1,C2,C3,C4,D1,D2
0,11,22.0,33.0,44.0,,
1,91,92.0,93.0,94.0,,
0,11,,,,a,a1
1,91,,,,b,b1
2,11,,,,c,c1


# Data Read-in

In [21]:
# Load the iris data set from a CSV file; the path is relative to the
# notebook's working directory.
df_iris = pd.read_csv("data/iris.csv")
# info() prints its summary directly (column names, non-null counts, dtypes).
df_iris.info()
# Last expression of the cell, so this summary table is what gets displayed;
# only the four float columns appear in it, the object column does not.
df_iris.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


## Time Data

In [1]:
# Read a CSV file and parse the listed columns as dates.
# Left commented out: data/time.csv and its column names are not shown in this
# notebook -- confirm they exist before enabling. (The earlier draft called
# DataFrame(...) here, which is not imported and does not read files.)
# df = pd.read_csv("data/time.csv", parse_dates=["date_1", "date_2"])

In [7]:
# Number of days in the month that contains the given date
# (February 2018 is not in a leap year, so this is 28).
pd.Period('2018-2-17').days_in_month

28