In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the
# last one; the cells below rely on this to show several results at once.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import numpy as np
np.__version__   # NumPy version this notebook was run with

import pandas as pd
pd.__version__   # pandas version this notebook was run with

'1.23.1'

'1.4.3'

# creation

In [3]:
# Series: a one-dimensional labelled array; np.nan marks the missing entry
s = pd.Series(data=[1.0, 3.0, 5.0, np.nan, 6.0, 8.0])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [4]:
# DataFrame from a dict: keys become the column labels, rows get default
# integer labels.
df = pd.DataFrame({'X': [1, 2, 3], 'Y': [4, 5, 6]})
df

# The same values from a 2-D array, this time with explicit row labels.
# order='F' fills the array column by column, so X holds 1..3 and Y holds 4..6.
df = pd.DataFrame(
    np.arange(1, 7).reshape(3, 2, order='F'),
    index=['a', 'b', 'c'],
    columns=['X', 'Y'],
)
df

df.index          # row labels
df.columns        # column labels
df.values         # underlying 2-D NumPy array
df['X']           # a single column as a Series (same as df.X)
df['X'].values    # that column as a 1-D NumPy array

Unnamed: 0,X,Y
0,1,4
1,2,5
2,3,6


Unnamed: 0,X,Y
a,1,4
b,2,5
c,3,6


Index(['a', 'b', 'c'], dtype='object')

Index(['X', 'Y'], dtype='object')

array([[1, 4],
       [2, 5],
       [3, 6]])

a    1
b    2
c    3
Name: X, dtype: int64

array([1, 2, 3])

# indexing

In [5]:
# Rebuild the small demo frame (rows a, b, c; columns X, Y)
df = pd.DataFrame(
    np.arange(1, 7).reshape(3, 2, order='F'),
    index=['a', 'b', 'c'],
    columns=['X', 'Y'],
)
df

# Columns by name: a list of names returns a DataFrame
# (df['X'] or df.X would return a Series instead)
df[['X']]

# Rows via .loc
df.loc[['a', 'b']]             # by row label
df.loc[[True, True, False]]    # by boolean indicator, one flag per row
df.loc[df['X'] > 2]            # by a boolean condition on a column

Unnamed: 0,X,Y
a,1,4
b,2,5
c,3,6


Unnamed: 0,X
a,1
b,2
c,3


Unnamed: 0,X,Y
a,1,4
b,2,5


Unnamed: 0,X,Y
a,1,4
b,2,5


Unnamed: 0,X,Y
c,3,6


In [6]:
# Selection by integer position with .iloc
df.iloc[1]        # one row by position -> Series
df.iloc[:, :2]    # the first two columns by position -> DataFrame

X    2
Y    5
Name: b, dtype: int64

Unnamed: 0,X,Y
a,1,4
b,2,5
c,3,6


# combining & splitting

In [7]:
# Stack the frame on top of itself (row-wise). concat keeps the original row
# labels, so a, b and c would each appear twice; reset_index(drop=True)
# replaces them with a fresh 0..n-1 index and discards the old labels.
# Chained rather than inplace=True so df_r is built in a single expression.
df_r = pd.concat([df, df], axis=0).reset_index(drop=True)
df_r

Unnamed: 0,X,Y
0,1,4
1,2,5
2,3,6
3,1,4
4,2,5
5,3,6


In [8]:
# Place the frame next to itself (column-wise). concat keeps the original
# column labels, so X and Y each appear twice; relabel them to be unique.
df_c = pd.concat([df, df], axis='columns')
df_c.columns = ['X1', 'Y1', 'X2', 'Y2']
df_c

Unnamed: 0,X1,Y1,X2,Y2
a,1,4,1,4
b,2,5,2,5
c,3,6,3,6


In [9]:
# Random row subsets. random_state pins the draw so that a fresh
# Restart & Run All shows the same rows every time; the seed value itself
# is arbitrary.
df.sample(n=2, axis=0, random_state=0)         # a fixed number of rows
df.sample(frac=0.5, axis=0, random_state=0)    # a fraction of the rows

Unnamed: 0,X,Y
b,2,5
a,1,4


Unnamed: 0,X,Y
b,2,5
a,1,4


# reshaping

In [10]:
# Wide layout: one row per (Month, Date), one score column per person
df_wide = pd.DataFrame(
    {
        'Month': ['Jan', 'Jan', 'Feb', 'mar'],
        'Date': [15, 31, 28, 31],
        'Person_A': [95, 96, 97, 98],    # scores of person A
        'Person_B': [95, 94, 93, 92],    # scores of person B
    }
)
df_wide

Unnamed: 0,Month,Date,Person_A,Person_B
0,Jan,15,95,95
1,Jan,31,96,94
2,Feb,28,97,93
3,mar,31,98,92


In [11]:
# Wide -> long.
#   id_vars    - columns kept as record identifiers
#   value_vars - columns to unpivot; their names go into the 'variable' column
#   value_name - name of the column that receives the unpivoted values
df_long = df_wide.melt(
    id_vars=['Month', 'Date'],
    value_vars=['Person_A', 'Person_B'],
    value_name='score',
)
df_long

Unnamed: 0,Month,Date,variable,score
0,Jan,15,Person_A,95
1,Jan,31,Person_A,96
2,Feb,28,Person_A,97
3,mar,31,Person_A,98
4,Jan,15,Person_B,95
5,Jan,31,Person_B,94
6,Feb,28,Person_B,93
7,mar,31,Person_B,92


In [12]:
# Long -> wide again.
#   index   - columns that identify a row of the wide frame
#   columns - column whose values become the wide frame's column labels
#   values  - column that fills the cells
df_wide_again = df_long.pivot(
    index=['Month', 'Date'],
    columns=['variable'],
    values='score',
)
df_wide_again    # rows are labelled by a (Month, Date) MultiIndex

Unnamed: 0_level_0,variable,Person_A,Person_B
Month,Date,Unnamed: 2_level_1,Unnamed: 3_level_1
Feb,28,97,93
Jan,15,95,95
Jan,31,96,94
mar,31,98,92


In [13]:
# The row labels of the pivoted frame form a MultiIndex
df_wide_again.index
# View the index levels as ordinary DataFrame columns; index=True (the
# default) keeps the MultiIndex as the row labels of the result
df_wide_again.index.to_frame(index=True)

MultiIndex([('Feb', 28),
            ('Jan', 15),
            ('Jan', 31),
            ('mar', 31)],
           names=['Month', 'Date'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Month,Date
Month,Date,Unnamed: 2_level_1,Unnamed: 3_level_1
Feb,28,Feb,28
Jan,15,Jan,15
Jan,31,Jan,31
mar,31,mar,31


In [14]:
# Flatten the pivoted frame: move the Month/Date index levels back into
# ordinary columns and remove the 'variable' label left on the column axis.
# Written as one chained expression instead of inplace=True plus an
# attribute assignment, so a fresh frame is rebound to the same name rather
# than mutating the object earlier cells displayed.
df_wide_again = (
    df_wide_again
    .reset_index(drop=False)
    .rename_axis(columns=None)
)
df_wide_again

Unnamed: 0,Month,Date,Person_A,Person_B
0,Feb,28,97,93
1,Jan,15,95,95
2,Jan,31,96,94
3,mar,31,98,92
