In [1]:
import numpy as np
import pandas as pd

### Filtering, Sorting, Reshaping & Combining

In [2]:
dataset = np.arange(48).reshape(6, 8)
dataset

array([[ 0,  1,  2,  3,  4,  5,  6,  7],
       [ 8,  9, 10, 11, 12, 13, 14, 15],
       [16, 17, 18, 19, 20, 21, 22, 23],
       [24, 25, 26, 27, 28, 29, 30, 31],
       [32, 33, 34, 35, 36, 37, 38, 39],
       [40, 41, 42, 43, 44, 45, 46, 47]])

In [3]:
np.extract((dataset < 10), dataset)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [4]:
dataset[dataset < 10]

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [5]:
dataset[(dataset >= 5) & (dataset <= 10)]

array([ 5,  6,  7,  8,  9, 10])

In [7]:
dataset[(dataset <= 5) | (dataset >= 45)]

array([ 0,  1,  2,  3,  4,  5, 45, 46, 47])

In [14]:
np.where(dataset < 15)
# (row), (col) -> meaning [0, 0, 0, 0, 0, 1, 1, 1, 1] -> 0th and 1st row
# [0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6] -> col index 0-7 for 0th row and 
# same col index 0-7 for 1st index

(array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]),
 array([0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6]))

### Sorting

In [16]:
from numpy import random

In [19]:
x = random.randint(100, size=(6, 8))
x

array([[ 5, 73, 58, 59, 46, 52, 86, 89],
       [99, 47, 16, 12, 25,  7, 88, 10],
       [73, 36, 11, 84, 32, 15, 34, 47],
       [43, 12, 74, 75, 97, 81, 53, 76],
       [58, 63, 76, 59, 35, 54, 26, 35],
       [18, 10, 25, 80, 82,  5, 48, 68]])

In [21]:
np.sort(x, axis=0) # Sort through column

array([[ 5, 10, 11, 12, 25,  5, 26, 10],
       [18, 12, 16, 59, 32,  7, 34, 35],
       [43, 36, 25, 59, 35, 15, 48, 47],
       [58, 47, 58, 75, 46, 52, 53, 68],
       [73, 63, 74, 80, 82, 54, 86, 76],
       [99, 73, 76, 84, 97, 81, 88, 89]])

In [23]:
np.sort(x, axis=1) # Sort through row

array([[ 5, 46, 52, 58, 59, 73, 86, 89],
       [ 7, 10, 12, 16, 25, 47, 88, 99],
       [11, 15, 32, 34, 36, 47, 73, 84],
       [12, 43, 53, 74, 75, 76, 81, 97],
       [26, 35, 35, 54, 58, 59, 63, 76],
       [ 5, 10, 18, 25, 48, 68, 80, 82]])

In [25]:
np.sort(x)

array([[ 5, 46, 52, 58, 59, 73, 86, 89],
       [ 7, 10, 12, 16, 25, 47, 88, 99],
       [11, 15, 32, 34, 36, 47, 73, 84],
       [12, 43, 53, 74, 75, 76, 81, 97],
       [26, 35, 35, 54, 58, 59, 63, 76],
       [ 5, 10, 18, 25, 48, 68, 80, 82]])

In [29]:
np.sort(x, axis=None).reshape(6, 8) # sort whole and reshape for desired output

array([[ 5,  5,  7, 10, 10, 11, 12, 12],
       [15, 16, 18, 25, 25, 26, 32, 34],
       [35, 35, 36, 43, 46, 47, 47, 48],
       [52, 53, 54, 58, 58, 59, 59, 63],
       [68, 73, 73, 74, 75, 76, 76, 80],
       [81, 82, 84, 86, 88, 89, 97, 99]])

In [30]:
x.shape

(6, 8)

In [32]:
x.reshape(1, x.size)

array([[ 5, 73, 58, 59, 46, 52, 86, 89, 99, 47, 16, 12, 25,  7, 88, 10,
        73, 36, 11, 84, 32, 15, 34, 47, 43, 12, 74, 75, 97, 81, 53, 76,
        58, 63, 76, 59, 35, 54, 26, 35, 18, 10, 25, 80, 82,  5, 48, 68]])

In [34]:
y = x.reshape(1, -1)
y

array([[ 5, 73, 58, 59, 46, 52, 86, 89, 99, 47, 16, 12, 25,  7, 88, 10,
        73, 36, 11, 84, 32, 15, 34, 47, 43, 12, 74, 75, 97, 81, 53, 76,
        58, 63, 76, 59, 35, 54, 26, 35, 18, 10, 25, 80, 82,  5, 48, 68]])

In [36]:
z = np.sort(y)
z

array([[ 5,  5,  7, 10, 10, 11, 12, 12, 15, 16, 18, 25, 25, 26, 32, 34,
        35, 35, 36, 43, 46, 47, 47, 48, 52, 53, 54, 58, 58, 59, 59, 63,
        68, 73, 73, 74, 75, 76, 76, 80, 81, 82, 84, 86, 88, 89, 97, 99]])

In [37]:
z.reshape(x.shape)

array([[ 5,  5,  7, 10, 10, 11, 12, 12],
       [15, 16, 18, 25, 25, 26, 32, 34],
       [35, 35, 36, 43, 46, 47, 47, 48],
       [52, 53, 54, 58, 58, 59, 59, 63],
       [68, 73, 73, 74, 75, 76, 76, 80],
       [81, 82, 84, 86, 88, 89, 97, 99]])

In [38]:
np.sort(x.reshape(1, -1)).reshape(x.shape)

array([[ 5,  5,  7, 10, 10, 11, 12, 12],
       [15, 16, 18, 25, 25, 26, 32, 34],
       [35, 35, 36, 43, 46, 47, 47, 48],
       [52, 53, 54, 58, 58, 59, 59, 63],
       [68, 73, 73, 74, 75, 76, 76, 80],
       [81, 82, 84, 86, 88, 89, 97, 99]])

### Combining

In [41]:
ds1 = np.arange(1, 10).reshape(3, 3)
ds1

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [42]:
ds2 = np.arange(10, 19).reshape(3, 3)
ds2

array([[10, 11, 12],
       [13, 14, 15],
       [16, 17, 18]])

In [44]:
np.vstack([ds1, ds2]) # Combining vertically

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12],
       [13, 14, 15],
       [16, 17, 18]])

In [45]:
np.hstack([ds1, ds2]) # Combining horizontally

array([[ 1,  2,  3, 10, 11, 12],
       [ 4,  5,  6, 13, 14, 15],
       [ 7,  8,  9, 16, 17, 18]])

In [47]:
np.concatenate([ds1, ds2], axis=1)

array([[ 1,  2,  3, 10, 11, 12],
       [ 4,  5,  6, 13, 14, 15],
       [ 7,  8,  9, 16, 17, 18]])

In [48]:
np.concatenate([ds1, ds2], axis=0)

array([[ 1,  2,  3],
       [ 4,  5,  6],
       [ 7,  8,  9],
       [10, 11, 12],
       [13, 14, 15],
       [16, 17, 18]])

In [49]:
np.stack([ds1, ds2])

array([[[ 1,  2,  3],
        [ 4,  5,  6],
        [ 7,  8,  9]],

       [[10, 11, 12],
        [13, 14, 15],
        [16, 17, 18]]])

In [50]:
np.stack([ds1, ds2], axis=1)

array([[[ 1,  2,  3],
        [10, 11, 12]],

       [[ 4,  5,  6],
        [13, 14, 15]],

       [[ 7,  8,  9],
        [16, 17, 18]]])

## Pandas

### Exploring Pandas DataFrame

In [56]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'

df = pd.read_csv(url, header=None)

In [57]:
df.head()

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [58]:
df

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [62]:
column_names = ['sl', 'sw', 'pl', 'pw', 'label']
df = pd.read_csv(url, names=column_names)
df

Unnamed: 0,sl,sw,pl,pw,label
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [63]:
df.head()

Unnamed: 0,sl,sw,pl,pw,label
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [64]:
df.tail()

Unnamed: 0,sl,sw,pl,pw,label
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica
149,5.9,3.0,5.1,1.8,Iris-virginica


In [65]:
df.head(7)

Unnamed: 0,sl,sw,pl,pw,label
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
6,4.6,3.4,1.4,0.3,Iris-setosa


In [66]:
df.sample() # gives a random row

Unnamed: 0,sl,sw,pl,pw,label
88,5.6,3.0,4.1,1.3,Iris-versicolor


In [67]:
df.sample(10)

Unnamed: 0,sl,sw,pl,pw,label
18,5.7,3.8,1.7,0.3,Iris-setosa
119,6.0,2.2,5.0,1.5,Iris-virginica
6,4.6,3.4,1.4,0.3,Iris-setosa
124,6.7,3.3,5.7,2.1,Iris-virginica
0,5.1,3.5,1.4,0.2,Iris-setosa
32,5.2,4.1,1.5,0.1,Iris-setosa
96,5.7,2.9,4.2,1.3,Iris-versicolor
31,5.4,3.4,1.5,0.4,Iris-setosa
111,6.4,2.7,5.3,1.9,Iris-virginica
135,7.7,3.0,6.1,2.3,Iris-virginica


In [68]:
df.shape

(150, 5)

In [69]:
df.size

750

In [70]:
df.describe()

Unnamed: 0,sl,sw,pl,pw
count,150.0,150.0,150.0,150.0
mean,5.843333,3.054,3.758667,1.198667
std,0.828066,0.433594,1.76442,0.763161
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


### Titanic Dataset

In [71]:
df = pd.read_csv('titanic.csv')

In [72]:
df

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.00,female,1,1
1,"Allison, Miss Helen Loraine",1st,2.00,female,0,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.00,male,0,0
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.00,female,0,1
4,"Allison, Master Hudson Trevor",1st,0.92,male,1,0
...,...,...,...,...,...,...
1308,"Zakarian, Mr Artun",3rd,27.00,male,0,0
1309,"Zakarian, Mr Maprieder",3rd,26.00,male,0,0
1310,"Zenni, Mr Philip",3rd,22.00,male,0,0
1311,"Lievens, Mr Rene",3rd,24.00,male,0,0


In [73]:
df.describe()

Unnamed: 0,Age,Survived,SexCode
count,756.0,1313.0,1313.0
mean,30.397989,0.342727,0.351866
std,14.259049,0.474802,0.477734
min,0.17,0.0,0.0
25%,21.0,0.0,0.0
50%,28.0,0.0,0.0
75%,39.0,1.0,1.0
max,71.0,1.0,1.0


In [75]:
df.isnull()

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
...,...,...,...,...,...,...
1308,False,False,False,False,False,False
1309,False,False,False,False,False,False
1310,False,False,False,False,False,False
1311,False,False,False,False,False,False


In [76]:
df.isnull().head(50)

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
5,False,False,False,False,False,False
6,False,False,False,False,False,False
7,False,False,False,False,False,False
8,False,False,False,False,False,False
9,False,False,False,False,False,False


In [77]:
df.sample(10).isnull()

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
571,False,False,False,False,False,False
1117,False,False,True,False,False,False
21,False,False,False,False,False,False
372,False,False,False,False,False,False
839,False,False,False,False,False,False
325,False,False,True,False,False,False
241,False,False,False,False,False,False
518,False,False,False,False,False,False
170,False,False,False,False,False,False
704,False,False,True,False,False,False


In [78]:
df.isnull().sum()

Name          0
PClass        0
Age         557
Sex           0
Survived      0
SexCode       0
dtype: int64

In [79]:
df[df['Age'].isnull()]

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
12,"Aubert, Mrs Leontine Pauline",1st,,female,1,1
13,"Barkworth, Mr Algernon H",1st,,male,1,0
14,"Baumann, Mr John D",1st,,male,0,0
29,"Borebank, Mr John James",1st,,male,0,0
32,"Bradley, Mr George",1st,,male,1,0
...,...,...,...,...,...,...
1300,"Wiseman, Mr Phillippe",3rd,,male,0,0
1302,"Yalsevac, Mr Ivan",3rd,,male,1,0
1305,"Youssef, Mr Gerios",3rd,,male,0,0
1306,"Zabour, Miss Hileni",3rd,,female,0,1


In [81]:
df_clean = df.dropna()

In [82]:
df_clean.isnull().sum()

Name        0
PClass      0
Age         0
Sex         0
Survived    0
SexCode     0
dtype: int64

In [83]:
df_clean.replace('female', 'Woman')

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.00,Woman,1,1
1,"Allison, Miss Helen Loraine",1st,2.00,Woman,0,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.00,male,0,0
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.00,Woman,0,1
4,"Allison, Master Hudson Trevor",1st,0.92,male,1,0
...,...,...,...,...,...,...
1308,"Zakarian, Mr Artun",3rd,27.00,male,0,0
1309,"Zakarian, Mr Maprieder",3rd,26.00,male,0,0
1310,"Zenni, Mr Philip",3rd,22.00,male,0,0
1311,"Lievens, Mr Rene",3rd,24.00,male,0,0


In [84]:
df_clean.replace(['female', 'male'], ['Woman', 'Man'], inplace=True)
df_clean

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean.replace(['female', 'male'], ['Woman', 'Man'], inplace=True)


Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.00,Woman,1,1
1,"Allison, Miss Helen Loraine",1st,2.00,Woman,0,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.00,Man,0,0
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.00,Woman,0,1
4,"Allison, Master Hudson Trevor",1st,0.92,Man,1,0
...,...,...,...,...,...,...
1308,"Zakarian, Mr Artun",3rd,27.00,Man,0,0
1309,"Zakarian, Mr Maprieder",3rd,26.00,Man,0,0
1310,"Zenni, Mr Philip",3rd,22.00,Man,0,0
1311,"Lievens, Mr Rene",3rd,24.00,Man,0,0


In [85]:
df_clean = df.dropna()

In [87]:
df1 = df_clean.replace(['female', 'male'], ['Woman', 'Man'])
df1

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.00,Woman,1,1
1,"Allison, Miss Helen Loraine",1st,2.00,Woman,0,1
2,"Allison, Mr Hudson Joshua Creighton",1st,30.00,Man,0,0
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.00,Woman,0,1
4,"Allison, Master Hudson Trevor",1st,0.92,Man,1,0
...,...,...,...,...,...,...
1308,"Zakarian, Mr Artun",3rd,27.00,Man,0,0
1309,"Zakarian, Mr Maprieder",3rd,26.00,Man,0,0
1310,"Zenni, Mr Philip",3rd,22.00,Man,0,0
1311,"Lievens, Mr Rene",3rd,24.00,Man,0,0


In [88]:
df1['SexCode'] = df_clean['SexCode'].replace([0, 1], ['Zero', 'One'])
df1

Unnamed: 0,Name,PClass,Age,Sex,Survived,SexCode
0,"Allen, Miss Elisabeth Walton",1st,29.00,Woman,1,One
1,"Allison, Miss Helen Loraine",1st,2.00,Woman,0,One
2,"Allison, Mr Hudson Joshua Creighton",1st,30.00,Man,0,Zero
3,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.00,Woman,0,One
4,"Allison, Master Hudson Trevor",1st,0.92,Man,1,Zero
...,...,...,...,...,...,...
1308,"Zakarian, Mr Artun",3rd,27.00,Man,0,Zero
1309,"Zakarian, Mr Maprieder",3rd,26.00,Man,0,Zero
1310,"Zenni, Mr Philip",3rd,22.00,Man,0,Zero
1311,"Lievens, Mr Rene",3rd,24.00,Man,0,Zero


In [89]:
df1['Age'].mean()

30.397989417989418

In [90]:
df1.Age

0       29.00
1        2.00
2       30.00
3       25.00
4        0.92
        ...  
1308    27.00
1309    26.00
1310    22.00
1311    24.00
1312    29.00
Name: Age, Length: 756, dtype: float64

In [92]:
df.iloc[:, 2]

0       29.00
1        2.00
2       30.00
3       25.00
4        0.92
        ...  
1308    27.00
1309    26.00
1310    22.00
1311    24.00
1312    29.00
Name: Age, Length: 1313, dtype: float64