In [1]:
import pandas as pd

In [2]:
# Demo data, one tuple per person. Rows 7 and 8 repeat rows 0 and 3, which
# gives the duplicate-handling cells further down something to find.
df2 = pd.DataFrame(
    [
        ('Simon', 180, 85, 'steak', 'm'),
        ('Kate', 165, 65, 'pizza', 'f'),
        ('Francis', 170, 68, 'pasta', 'm'),
        ('Laura', 164, 45, 'pizza', 'f'),
        ('Mary', 163, 43, 'vegetables', 'f'),
        ('Julian', 175, 72, 'steak', 'm'),
        ('Rosie', 166, 46, 'seafood', 'f'),
        ('Simon', 180, 85, 'steak', 'm'),
        ('Laura', 164, 45, 'pizza', 'f'),
    ],
    columns=['Names', 'Height', 'Weight', 'Pref_food', 'Sex'],
)

In [3]:
# Show the whole frame; note that rows 7 and 8 repeat rows 0 and 3.
df2

Unnamed: 0,Names,Height,Weight,Pref_food,Sex
0,Simon,180,85,steak,m
1,Kate,165,65,pizza,f
2,Francis,170,68,pasta,m
3,Laura,164,45,pizza,f
4,Mary,163,43,vegetables,f
5,Julian,175,72,steak,m
6,Rosie,166,46,seafood,f
7,Simon,180,85,steak,m
8,Laura,164,45,pizza,f


### work with dummy variables

In [4]:
# One 0/1 indicator column per category of Sex (Sex_f, Sex_m).
# dtype=int pins the indicators to integers: pandas >= 2.0 defaults to bool,
# which would display True/False instead of the 0/1 values shown below.
df_dummy = pd.get_dummies(df2['Sex'], prefix='Sex', dtype=int)

In [5]:
# One indicator column per category of Sex, named with the 'Sex' prefix.
df_dummy

Unnamed: 0,Sex_f,Sex_m
0,0,1
1,1,0
2,0,1
3,1,0
4,1,0
5,0,1
6,1,0
7,0,1
8,1,0


### join dummies to original dataset

In [6]:
# join() lines the two frames up on their shared row index; the result is
# only displayed here, not stored.
df2.join(df_dummy)

Unnamed: 0,Names,Height,Weight,Pref_food,Sex,Sex_f,Sex_m
0,Simon,180,85,steak,m,0,1
1,Kate,165,65,pizza,f,1,0
2,Francis,170,68,pasta,m,0,1
3,Laura,164,45,pizza,f,1,0
4,Mary,163,43,vegetables,f,1,0
5,Julian,175,72,steak,m,0,1
6,Rosie,166,46,seafood,f,1,0
7,Simon,180,85,steak,m,0,1
8,Laura,164,45,pizza,f,1,0


In [7]:
# Keep the joined result under a new name so df2 stays untouched.
df3 = df2.join(df_dummy)

In [8]:
# Quick check that Sex_f / Sex_m were appended after the original columns.
df3.head(2)

Unnamed: 0,Names,Height,Weight,Pref_food,Sex,Sex_f,Sex_m
0,Simon,180,85,steak,m,0,1
1,Kate,165,65,pizza,f,1,0


### remove the Sex variable

In [9]:
# Build df3 from its inputs and drop the now-redundant Sex column in one
# step. Unlike `del df3['Sex']`, this cell can be re-run without raising
# KeyError because it never depends on df3's previous state.
df3 = df2.join(df_dummy).drop(columns='Sex')

In [10]:
# Confirm the Sex column is gone while Sex_f / Sex_m remain.
df3.head(2)

Unnamed: 0,Names,Height,Weight,Pref_food,Sex_f,Sex_m
0,Simon,180,85,steak,0,1
1,Kate,165,65,pizza,1,0


### identify duplicates

In [11]:
# Boolean Series: True flags a row that repeats an earlier row in full;
# the first occurrence itself stays False.
df3.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7     True
8     True
dtype: bool

### remove duplicates with drop_duplicates()

In [12]:
# Returns a new frame keeping the first occurrence of each repeated row;
# df2 itself is not modified.
df2.drop_duplicates()

Unnamed: 0,Names,Height,Weight,Pref_food,Sex
0,Simon,180,85,steak,m
1,Kate,165,65,pizza,f
2,Francis,170,68,pasta,m
3,Laura,164,45,pizza,f
4,Mary,163,43,vegetables,f
5,Julian,175,72,steak,m
6,Rosie,166,46,seafood,f


### overwrite df3

In [13]:
# NOTE: this rebinds df3 to the de-duplicated df2, so the Sex_f / Sex_m
# columns built earlier are gone and the Sex column is back.
df3 = df2.drop_duplicates()

### we can delete a case / entry (row) using drop()

In [14]:
# Drops the row whose index label is 2 (Francis); returns a new frame,
# so df2 keeps all nine rows.
df2.drop(2)

Unnamed: 0,Names,Height,Weight,Pref_food,Sex
0,Simon,180,85,steak,m
1,Kate,165,65,pizza,f
3,Laura,164,45,pizza,f
4,Mary,163,43,vegetables,f
5,Julian,175,72,steak,m
6,Rosie,166,46,seafood,f
7,Simon,180,85,steak,m
8,Laura,164,45,pizza,f


### stack() and unstack() functions for re-organizing data

In [15]:
# stack() pivots the column labels into an inner index level, producing a
# Series indexed by (row label, column name).
df3.stack()

0  Names             Simon
   Height              180
   Weight               85
   Pref_food         steak
   Sex                   m
1  Names              Kate
   Height              165
   Weight               65
   Pref_food         pizza
   Sex                   f
2  Names           Francis
   Height              170
   Weight               68
   Pref_food         pasta
   Sex                   m
3  Names             Laura
   Height              164
   Weight               45
   Pref_food         pizza
   Sex                   f
4  Names              Mary
   Height              163
   Weight               43
   Pref_food    vegetables
   Sex                   f
5  Names            Julian
   Height              175
   Weight               72
   Pref_food         steak
   Sex                   m
6  Names             Rosie
   Height              166
   Weight               46
   Pref_food       seafood
   Sex                   f
dtype: object

In [16]:
# Keep the stacked (long) Series so it can be unstacked below.
stacked = df3.stack()

In [17]:
# unstack() moves the inner index level (the column names) back out to columns.
unstacked = stacked.unstack()

In [18]:
# The round trip restores the same rows and columns as df3.
# NOTE(review): the stacked Series is object dtype, so Height/Weight presumably
# come back as object rather than int -- check unstacked.dtypes.
unstacked

Unnamed: 0,Names,Height,Weight,Pref_food,Sex
0,Simon,180,85,steak,m
1,Kate,165,65,pizza,f
2,Francis,170,68,pasta,m
3,Laura,164,45,pizza,f
4,Mary,163,43,vegetables,f
5,Julian,175,72,steak,m
6,Rosie,166,46,seafood,f


In [19]:
# melt() with no id_vars reshapes wide -> long: every cell becomes one
# (variable, value) row.
pd.melt(df3)

Unnamed: 0,variable,value
0,Names,Simon
1,Names,Kate
2,Names,Francis
3,Names,Laura
4,Names,Mary
5,Names,Julian
6,Names,Rosie
7,Height,180
8,Height,165
9,Height,170


### Transpose rows and columns

In [21]:
# Reminder of the original (row-per-person) layout before transposing.
df2.head(2)

Unnamed: 0,Names,Height,Weight,Pref_food,Sex
0,Simon,180,85,steak,m
1,Kate,165,65,pizza,f


In [22]:
# .T swaps the axes: column names become the index and row labels become columns.
df2.T

Unnamed: 0,0,1,2,3,4,5,6,7,8
Names,Simon,Kate,Francis,Laura,Mary,Julian,Rosie,Simon,Laura
Height,180,165,170,164,163,175,166,180,164
Weight,85,65,68,45,43,72,46,85,45
Pref_food,steak,pizza,pasta,pizza,vegetables,steak,seafood,steak,pizza
Sex,m,f,m,f,f,m,f,m,f


### Extract random sample of data using sample()

In [23]:
# Draw 2 rows at random, without replacement; no seed is set, so the rows
# shown will differ between runs.
df2.sample(2)

Unnamed: 0,Names,Height,Weight,Pref_food,Sex
5,Julian,175,72,steak,m
1,Kate,165,65,pizza,f


### Consistent sampling

In [24]:
import numpy as np

In [25]:
# Seed NumPy's global RNG, which sample() draws from when no random_state is
# passed, so the next draw is repeatable.
np.random.seed(1)

In [26]:
# First draw after seeding with 1.
df2.sample(2)

Unnamed: 0,Names,Height,Weight,Pref_food,Sex
8,Laura,164,45,pizza,f
2,Francis,170,68,pasta,m


In [27]:
# Re-seed with the same value to reset the RNG to the same starting state.
np.random.seed(1)

In [28]:
# Same seed, same rows as the previous draw (index 8 and 2).
df2.sample(2)

Unnamed: 0,Names,Height,Weight,Pref_food,Sex
8,Laura,164,45,pizza,f
2,Francis,170,68,pasta,m


### use frac to extract a percentage or fraction

In [29]:
# frac is a proportion of the row count: 0.1 of 9 rows comes out as 1 row.
df2.sample(frac=0.1)

Unnamed: 0,Names,Height,Weight,Pref_food,Sex
3,Laura,164,45,pizza,f


In [30]:
# 0.5 of 9 rows comes out as 4 rows here; the repeated Simon rows (0 and 7)
# can both be drawn because df2 still contains its duplicates.
df2.sample(frac=0.5)

Unnamed: 0,Names,Height,Weight,Pref_food,Sex
1,Kate,165,65,pizza,f
7,Simon,180,85,steak,m
0,Simon,180,85,steak,m
3,Laura,164,45,pizza,f


### using nlargest() to get the rows with the largest values in a specified column