# Other notes on DataFrame

The index labels identify the rows of a DataFrame, while the column names identify its columns.



In [1]:
import numpy as np
import pandas as pd

In [11]:
# Lay the table out as nested lists: the first row carries the column labels
# and the first element of every later row carries that row's label.
# A NumPy array holds a single dtype, so the integers end up stored as
# strings next to the labels (note the quoted digits in the output).
data = np.array(
    [
        ['', 'Col1', 'Col2', 'Col3'],
        ['Row1', 1, 2, 3],
        ['Row2', 4, 5, 6],
        ['Row3', 7, 8, 9],
    ]
)
data

array([['', 'Col1', 'Col2', 'Col3'],
       ['Row1', '1', '2', '3'],
       ['Row2', '4', '5', '6'],
       ['Row3', '7', '8', '9']], dtype='<U4')

In [12]:
# Split the labelled array into body, row labels and column labels, then
# assemble the frame. The cell values stay strings because `data` is a
# string array.
df = pd.DataFrame(
    data[1:, 1:],         # body: every row after the header, every column after the labels
    index=data[1:, 0],    # row labels: first column, skipping the header row
    columns=data[0, 1:],  # column labels: header row, skipping the empty corner
)
df

Unnamed: 0,Col1,Col2,Col3
Row1,1,2,3
Row2,4,5,6
Row3,7,8,9


In [13]:
# 2x3 integer array holding 1..6, filled row by row.
my_2darr = np.arange(1, 7).reshape(2, 3)
my_2darr

array([[1, 2, 3],
       [4, 5, 6]])

In [14]:
# Integer keys, each mapped to a two-item list of digit strings.
my_dict = {1: ['1', '3'], 2: ['1', '2'], 3: ['2', '4']}
my_dict

{1: ['1', '3'], 2: ['1', '2'], 3: ['2', '4']}

In [29]:
# Single-column frame: column 'A' holds 1..4, rows are labelled 0..3.
df0 = pd.DataFrame({'A': [1, 2, 3, 4]}, index=range(4))
df0

Unnamed: 0,A
0,1
1,2
2,3
3,4


In [32]:
# Two-column frame described column by column; rows are labelled 0..3.
df1 = pd.DataFrame(
    {
        'A': [1, 2, 3, 4],
        'B': [2, 3, 4, 5],
    },
    index=range(4),
)
df1

Unnamed: 0,A,B
0,1,2
1,2,3
2,3,4
3,4,5


In [33]:
# Country -> capital lookup used by the DataFrame and Series examples.
info = {
    "Belgium": "Brussels",
    "India": "New Delhi",
    "United Kingdom": "London",
    "United States": "Washington",
}
info

{'Belgium': 'Brussels',
 'India': 'New Delhi',
 'United Kingdom': 'London',
 'United States': 'Washington'}

In [39]:
# Each (key, value) pair of the dict becomes one row; no labels are given,
# so both the rows and the two columns get default integer labels.
pairs = list(info.items())
df_info = pd.DataFrame(pairs)
df_info

Unnamed: 0,0,1
0,Belgium,Brussels
1,India,New Delhi
2,United Kingdom,London
3,United States,Washington


In [40]:
# No index was supplied when df_info was built, so it carries a default RangeIndex.
df_info.index

RangeIndex(start=0, stop=4, step=1)

### Create Series from dict.

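The keys of the dictionary become the index of the Series. Current pandas versions keep the dictionary's insertion order (older releases sorted the keys); in this example the two orders happen to coincide.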

In [34]:
# Build a Series straight from the dict: keys become the index labels,
# values become the data.
my_series = pd.Series(info)
my_series

Belgium             Brussels
India              New Delhi
United Kingdom        London
United States     Washington
dtype: object

In [38]:
# The index holds the dict keys (the country names).
my_series.index

Index(['Belgium', 'India', 'United Kingdom', 'United States'], dtype='object')

In [54]:
# Three rows of numbers; the last row has NaN in its first and last
# columns, and the whole frame is stored as floats.
df2 = pd.DataFrame(
    np.array([
        [1, 2, 3, 4],
        [4, 5, 6, 7],
        [np.nan, 9, 2, np.nan],
    ]),
    columns=['c1', 'c2', 'c3', 'c4'],
)
df2

Unnamed: 0,c1,c2,c3,c4
0,1.0,2.0,3.0,4.0
1,4.0,5.0,6.0,7.0
2,,9.0,2.0,


In [55]:
# Dimensions as a (rows, columns) tuple.
df2.shape

(3, 4)

In [56]:
# Height: the number of rows (rows containing NaN are counted too).

len(df2)

3

In [58]:
# Number of non-NaN values in column 'c1'. It is smaller than len(df2)
# here because the last row holds NaN in that column.

df2['c1'].count()

2

In [59]:
# Column labels as a plain Python list.
df2.columns.tolist()

['c1', 'c2', 'c3', 'c4']

In [60]:
# Read one cell by integer position: row 0, column 0.
# Both positions go in a single .iloc call. The chained form df2.iloc[0][0]
# first extracts a row Series labelled 'c1'..'c4' and then indexes it with
# the integer 0, which only works through pandas' deprecated positional
# fallback for non-integer indexes.
df2.iloc[0, 0]

1.0

In [61]:
# Read one cell by label: row label 0, column label 'c2'.
# Passing both labels to one .loc call avoids chained indexing, which
# builds an intermediate row Series just to index it again.
df2.loc[0, 'c2']

2.0

In [62]:
# .at reads a single cell by row label and column label.
df2.at[0, 'c3']

3.0

In [64]:
# .iat reads a single cell by integer position.
df2.iat[0, 1] # row position 0, column position 1

2.0

In [65]:
# Same values as df2, but the rows carry explicit string labels
# 'r1'..'r3' instead of the default integer index.
df3 = pd.DataFrame(
    np.array([
        [1, 2, 3, 4],
        [4, 5, 6, 7],
        [np.nan, 9, 2, np.nan],
    ]),
    index=['r1', 'r2', 'r3'],
    columns=['c1', 'c2', 'c3', 'c4'],
)
df3

Unnamed: 0,c1,c2,c3,c4
r1,1.0,2.0,3.0,4.0
r2,4.0,5.0,6.0,7.0
r3,,9.0,2.0,


In [66]:
# First row by integer position; it comes back as a Series named after
# its row label ('r1').
df3.iloc[0]

c1    1.0
c2    2.0
c3    3.0
c4    4.0
Name: r1, dtype: float64

In [67]:
# Read one cell by integer position: row 0, column 1.
# A single .iloc call with both positions replaces the chained
# df3.iloc[0][1], whose second step indexes a Series labelled 'c1'..'c4'
# with an integer and so relies on pandas' deprecated positional fallback.
df3.iloc[0, 1]

2.0

In [69]:
# Row selected by its label; the result is a Series indexed by the column names.
df3.loc['r3']

c1    NaN
c2    9.0
c3    2.0
c4    NaN
Name: r3, dtype: float64

In [71]:
# A lone key passed to .loc is looked up among the row labels, so
# df3.loc['c2'] won't work. Select the column with an explicit
# "all rows" slice instead.

df3.loc[:, 'c2']

r1    2.0
r2    5.0
r3    9.0
Name: c2, dtype: float64

In [73]:
# Read one cell by label: row 'r1', column 'c3'.
# Both labels go in one .loc call rather than chaining two lookups through
# an intermediate row Series.
df3.loc['r1', 'c3']

3.0

In [75]:
# Single cell addressed by row label and column label.
df3.at['r1', 'c3']

3.0

In [76]:
# Show df3 once more before its index is changed.
df3

Unnamed: 0,c1,c2,c3,c4
r1,1.0,2.0,3.0,4.0
r2,4.0,5.0,6.0,7.0
r3,,9.0,2.0,


In [81]:
# Promote column 'c1' to the row index, replacing the 'r1'..'r3' labels.
# Rebinding df3 makes this cell non-idempotent: afterwards 'c1' is no
# longer a column, so run it only once per freshly built df3.
df3 = df3.set_index('c1')

In [82]:
# 'c1' now labels the rows and is no longer a regular column.
df3

Unnamed: 0_level_0,c2,c3,c4
c1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,2.0,3.0,4.0
4.0,5.0,6.0,7.0
,9.0,2.0,


In [83]:
# The index is built from the former 'c1' values, including the NaN.
df3.index

Float64Index([1.0, 4.0, nan], dtype='float64', name='c1')

In [93]:
# Turn the 'c1' index back into an ordinary column and fall back to the
# default integer index; the earlier 'r1'..'r3' labels are not restored.
df3 = df3.reset_index()

In [96]:
# 'c1' is a column again and the rows are numbered from 0.
df3

Unnamed: 0,c1,c2,c3,c4
0,1.0,2.0,3.0,4.0
1,4.0,5.0,6.0,7.0
2,,9.0,2.0,


In [106]:
# Back to a default RangeIndex.
df3.index

RangeIndex(start=0, stop=3, step=1)

In [107]:
# With the default index, the label 0 selects the first row.
df3.loc[0]

c1    1.0
c2    2.0
c3    3.0
c4    4.0
Name: 0, dtype: float64

In [108]:
# The row labels mix integers and a string. The integer labels (2 and 4)
# differ from the row positions (0 and 2), which makes the contrast between
# label-based .loc and position-based .iloc visible.
df = pd.DataFrame(
    np.arange(1, 10).reshape(3, 3),  # values 1..9 laid out as a 3x3 grid
    index=[2, 'A', 4],
    columns=[48, 49, 50],
)
df

Unnamed: 0,48,49,50
2,1,2,3
A,4,5,6
4,7,8,9


In [109]:
# .loc looks up the *label* 2, which here is the first row.
df.loc[2]

48    1
49    2
50    3
Name: 2, dtype: int64

In [110]:
# String labels are looked up the same way.
df.loc['A']

48    4
49    5
50    6
Name: A, dtype: int64

In [111]:
# Label 4 belongs to the third row; .loc never treats 4 as a position.
df.loc[4]

48    7
49    8
50    9
Name: 4, dtype: int64

In [112]:
# .iloc is positional: position 2 is the third row, whose label is 4
# (compare with df.loc[2], which returned the first row).
df.iloc[2]

48    7
49    8
50    9
Name: 4, dtype: int64

# Add row

In [113]:
# Assigning through .loc to a label that does not exist yet appends a new
# row with that label; the list supplies one value per column.
df.loc['B'] = [10,11,12]

In [114]:
# The new row 'B' appears at the bottom.
df

Unnamed: 0,48,49,50
2,1,2,3
A,4,5,6
4,7,8,9
B,10,11,12


In [115]:
# Plain [] selects a column by its label, here the integer 48.
df[48]

2     1
A     4
4     7
B    10
Name: 48, dtype: int64

In [118]:
# Fresh 3x3 frame (values 1..9) with lettered columns and the default integer index.
df = pd.DataFrame(np.arange(1, 10).reshape(3, 3), columns=list('ABC'))

In [119]:
# Display the freshly built frame.
df

Unnamed: 0,A,B,C
0,1,2,3
1,4,5,6
2,7,8,9


In [120]:
# Copy the index values into a new column 'D'.
df['D'] = df.index
df

Unnamed: 0,A,B,C,D
0,1,2,3,0
1,4,5,6,1
2,7,8,9,2


# Add column

In [123]:
# Attach a new column 'E' from a Series that shares df's index. Note that
# the values are the strings '5', '6', '7', not integers.
# NOTE(review): the recorded output also lists a column named 4 that no
# visible cell creates. It is presumably left over from a deleted cell;
# re-run the notebook from a fresh kernel to refresh it.
df.loc[:, 'E'] = pd.Series(['5', '6', '7'], index=df.index)
df

Unnamed: 0,A,B,C,D,4,E
0,1,2,3,0,5,5
1,4,5,6,1,6,6
2,7,8,9,2,7,7


In [124]:
# Adding columns leaves the default RangeIndex untouched.
df.index

RangeIndex(start=0, stop=3, step=1)

In [126]:
# A frame whose float index contains repeated labels: 2.5 and 4.8 each
# occur twice.
df = pd.DataFrame(
    np.array([
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9],
        [40, 50, 60],
        [23, 35, 37],
    ]),
    index=[2.5, 12.6, 4.8, 4.8, 2.5],
    columns=[48, 49, 50],
)
df

Unnamed: 0,48,49,50
2.5,1,2,3
12.6,4,5,6
4.8,7,8,9
4.8,40,50,60
2.5,23,35,37
