In [1]:
import pandas as pd

In [2]:
df_ones = pd.DataFrame({'k1': ['A', 'A', 'B', 'B', 'C', 'C'], 
                       'col1': [100, 200, 300, 300, 400, 500],
                       'col2' : ['NY', 'CA', 'WA', 'WA', 'AK', 'NV']})

In [4]:
df_ones

Unnamed: 0,k1,col1,col2
0,A,100,NY
1,A,200,CA
2,B,300,WA
3,B,300,WA
4,C,400,AK
5,C,500,NV


In [5]:
# unique() returns the unique values
df_ones['col2'].unique()

array(['NY', 'CA', 'WA', 'AK', 'NV'], dtype=object)

In [6]:

df_ones['k1'].unique()

array(['A', 'B', 'C'], dtype=object)

In [7]:
# it returns the number of uniques, length of unique list
df_ones['k1'].nunique()

3

In [8]:
df_ones['col2'].nunique()

5

In [9]:
# if you want to know number of counts per category.
df_ones['col2'].value_counts()

WA    2
CA    1
AK    1
NY    1
NV    1
Name: col2, dtype: int64

In [10]:
# drop_duplicate will drop exact duplicate row
df_ones.drop_duplicates()

Unnamed: 0,k1,col1,col2
0,A,100,NY
1,A,200,CA
2,B,300,WA
4,C,400,AK
5,C,500,NV


In [13]:
# new column that is result of math operations
df_ones['NEW'] = df_ones['col1'] * 10

In [14]:
df_ones

Unnamed: 0,k1,col1,col2,NEW
0,A,100,NY,1000
1,A,200,CA,2000
2,B,300,WA,3000
3,B,300,WA,3000
4,C,400,AK,4000
5,C,500,NV,5000


In [15]:
def grab_first_letter(state):
    return state[0]

In [16]:
grab_first_letter('CA')

'C'

In [18]:
df_ones['col2'].apply(grab_first_letter)

0    N
1    C
2    W
3    W
4    A
5    N
Name: col2, dtype: object

In [20]:
df_ones['first letters'] = df_ones['col2'].apply(grab_first_letter)

In [21]:
df_ones

Unnamed: 0,k1,col1,col2,NEW,first letters
0,A,100,NY,1000,N
1,A,200,CA,2000,C
2,B,300,WA,3000,W
3,B,300,WA,3000,W
4,C,400,AK,4000,A
5,C,500,NV,5000,N


In [22]:
def complex_letter(state):
    
    if state[0] == "W":
        return "Washington"
    else:
        return "Error"

In [25]:
df_ones['col2'].apply(complex_letter)

0         Error
1         Error
2    Washington
3    Washington
4         Error
5         Error
Name: col2, dtype: object

In [26]:
df_ones['k1']

0    A
1    A
2    B
3    B
4    C
5    C
Name: k1, dtype: object

In [27]:
my_map = {'A': 1, 'B':2, 'C':3}

In [28]:
df_ones['k1'].map(my_map)

0    1
1    1
2    2
3    2
4    3
5    3
Name: k1, dtype: int64

In [30]:
df_ones['numbers'] = df_ones['k1'].map(my_map)

In [31]:
df_ones

Unnamed: 0,k1,col1,col2,NEW,first letters,numbers
0,A,100,NY,1000,N,1
1,A,200,CA,2000,C,1
2,B,300,WA,3000,W,2
3,B,300,WA,3000,W,2
4,C,400,AK,4000,A,3
5,C,500,NV,5000,N,3


In [32]:
# if you want to know the max
df_ones['col1'].max()

500

In [33]:
# if you want to know the index of max, integer based one
df_ones['col1'].idxmax()

5

In [35]:
df_ones.columns 

Index(['k1', 'col1', 'col2', 'NEW', 'first letters', 'numbers'], dtype='object')

In [37]:
# reassignment of columns
# must have exact length of existing columns
df_ones.columns = ['C1', 'C2', 'C3', 'C4', 'C5', 'C6']

In [39]:
# if you want to sort the dataframe by particular column in alphabetical order
df_ones.sort_values('C3')


Unnamed: 0,C1,C2,C3,C4,C5,C6
4,C,400,AK,4000,A,3
1,A,200,CA,2000,C,1
5,C,500,NV,5000,N,3
0,A,100,NY,1000,N,1
2,B,300,WA,3000,W,2
3,B,300,WA,3000,W,2


In [40]:
features = pd.DataFrame({'A': [100, 200, 300, 400, 500], 
                        'B' : [12, 13, 14, 15, 16]})

predictions = pd.DataFrame({'pred': [0, 1, 1, 0, 1]})

In [42]:
features

Unnamed: 0,A,B
0,100,12
1,200,13
2,300,14
3,400,15
4,500,16


In [41]:
predictions

Unnamed: 0,pred
0,0
1,1
2,1
3,0
4,1


In [43]:
# if you want to concetenate them
pd.concat([features, predictions])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


Unnamed: 0,A,B,pred
0,100.0,12.0,
1,200.0,13.0,
2,300.0,14.0,
3,400.0,15.0,
4,500.0,16.0,
0,,,0.0
1,,,1.0
2,,,1.0
3,,,0.0
4,,,1.0


In [44]:
# if you want to join them along the columns
pd.concat([features, predictions], axis=1)

Unnamed: 0,A,B,pred
0,100,12,0
1,200,13,1
2,300,14,1
3,400,15,0
4,500,16,1


In [45]:
df_ones

Unnamed: 0,C1,C2,C3,C4,C5,C6
0,A,100,NY,1000,N,1
1,A,200,CA,2000,C,1
2,B,300,WA,3000,W,2
3,B,300,WA,3000,W,2
4,C,400,AK,4000,A,3
5,C,500,NV,5000,N,3


In [46]:
# if you want to make [one hot encoding variables] for categorical column
# use get_dummies()
pd.get_dummies(df_ones['C1'])

Unnamed: 0,A,B,C
0,1,0,0
1,1,0,0
2,0,1,0
3,0,1,0
4,0,0,1
5,0,0,1
