In [32]:
import warnings
## NOTE(review): this silences every warning for the whole session, including
## the NumPy RuntimeWarnings that the 0/0 division and log(0) cells below would raise.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

# Numpy

In [33]:
## A plain Python list of 0..9, used as the raw material for the arrays below
l1 = [*range(10)]
(l1, type(l1))

([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], list)

In [34]:
## Convert the Python list into a NumPy array and show the value with its type
l1_arr = np.array(l1)
l1_arr, type(l1_arr)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9]), numpy.ndarray)

In [35]:
## `+` on lists concatenates them; `+` on arrays adds element by element
l2 = [*range(0, 20, 2)]
arr_add = l1_arr + np.array(l2)
l_add = l1 + l2

print('Adding two lists: ', l_add)
print('Adding two arrays: ', arr_add)

Adding two lists:  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18]
Adding two arrays:  [ 0  3  6  9 12 15 18 21 24 27]


### Mathematical Operations on arrays

In [36]:
## Multiplying two arrays
## `*` on ndarrays is element-wise (arr1[i] * arr2[i]), not a dot product
arr_mul = np.array(l1) * np.array(l2)
print('Multiplying arr1 with arr2: ', arr_mul)

Multiplying arr1 with arr2:  [  0   2   8  18  32  50  72  98 128 162]


In [37]:
## Dividing array 2 by array 1
## Both lists start with 0, so the first quotient is 0/0 -> nan. That result is
## expected here, so the floating-point warning is silenced locally for this
## one operation instead of relying on a session-wide warnings filter.
with np.errstate(divide='ignore', invalid='ignore'):
    arr_div = np.array(l2) / np.array(l1)
print('Dividing arr2 with arr1: ', arr_div)

Dividing arr2 with arr1:  [nan  2.  2.  2.  2.  2.  2.  2.  2.  2.]


In [38]:
## Sine of an array
## np.sin is applied to every element; the inputs are interpreted as radians
print('sin(arr1): ', np.sin(np.array(l1)))

sin(arr1):  [ 0.          0.84147098  0.90929743  0.14112001 -0.7568025  -0.95892427
 -0.2794155   0.6569866   0.98935825  0.41211849]


In [39]:
## Logarithm
## l1 contains 0 and log(0) is -inf. That value is expected in this demo, so the
## divide-by-zero floating-point warning is silenced locally for these calls only.
with np.errstate(divide='ignore'):
    print('Natural log: ', np.log(np.array(l1)))
    print('Base-10 log: ', np.log10(np.array(l1)))
    print('Base-2 log: ', np.log2(np.array(l1)))

Natural log:  [      -inf 0.         0.69314718 1.09861229 1.38629436 1.60943791
 1.79175947 1.94591015 2.07944154 2.19722458]
Base-10 log:  [      -inf 0.         0.30103    0.47712125 0.60205999 0.69897
 0.77815125 0.84509804 0.90308999 0.95424251]
Base-2 log:  [      -inf 0.         1.         1.5849625  2.         2.32192809
 2.5849625  2.80735492 3.         3.169925  ]


### Generation of arrays
- np.zeros
- np.ones
- np.arange
- np.linspace

In [43]:
## np.zeros / np.ones take a length; np.arange takes (start, stop, step) with
## the stop value excluded; np.linspace takes (start, stop, count) with stop included.
print('A series of zeros: ', np.zeros(10))
print('A series of ones: ', np.ones(10))
print('A series of numbers: ', np.arange(10, 20))
print('A series of numbers with a step of 2: ',  np.arange(10, 20, 2))
## The label states the step that is actually used (0.2). For non-integer
## steps np.linspace is usually the safer choice, because rounding can change
## the number of elements np.arange returns.
print('A series of numbers with a step of 0.2: ', np.arange(10, 12, 0.2))
print('A series of numbers with every 5th number from 20 in reverse order: ', np.arange(20, -1, -5))
print('10 linearly spaced numbers between 1 and 4: ', np.linspace(1, 4, 10))

A series of zeros:  [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
A series of ones:  [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
A series of numbers:  [10 11 12 13 14 15 16 17 18 19]
A series of numbers with a step of 2:  [10 12 14 16 18]
A series of numbers with a step of 0.1:  [10.  10.2 10.4 10.6 10.8 11.  11.2 11.4 11.6 11.8]
A series of numbers with every 5th number from 20 in reverse order:  [20 15 10  5  0]
10 linearly spaced numbers between 1 and 4:  [1.         1.33333333 1.66666667 2.         2.33333333 2.66666667
 3.         3.33333333 3.66666667 4.        ]


### Multi-dimensional Arrays

In [44]:
## A nested list (one inner list per row) becomes a 2-D array
nd_arr = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
])
nd_arr

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [48]:
## A list of tuples works as well; because one entry is a float (1.2),
## every element of the resulting array is stored as a float
nd_tuple = np.array([(1.2, 2, 3), (3, 5, 7)])
nd_tuple

array([[1.2, 2. , 3. ],
       [3. , 5. , 7. ]])

In [52]:
## Dimension, shape, size, data type of 2D array
## ndim = number of axes, size = total element count,
## shape = length along each axis, dtype = element type
print('Dimension: ', nd_arr.ndim)
print('Size: ', nd_arr.size)
print('Shape: ', nd_arr.shape)
print('DType: ', nd_arr.dtype)

Dimension:  2
Size:  9
Shape:  (3, 3)
DType:  int32


### Zeros, Ones, Random & Identity Matrices

In [55]:
## np.ones takes the shape as a tuple: 3 rows, 4 columns
print('Matrix of ones: \n', np.ones((3, 4)))

Matrix of ones: 
 [[1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]]


In [56]:
## Scaling an all-ones matrix by 5 gives a constant matrix of 5s
print('Matrix of 5s: \n', 5 * np.ones((3, 4)))

Matrix of 5s: 
 [[5. 5. 5. 5.]
 [5. 5. 5. 5.]
 [5. 5. 5. 5.]]


In [57]:
## np.eye(n) builds the n x n identity matrix (ones on the diagonal)
print('Identity matrix of dimension 2: \n', np.eye(2))

Identity matrix of dimension 2: 
 [[1. 0.]
 [0. 1.]]


In [58]:
## Same call with a larger size: a 6 x 6 identity matrix
print('Identity matrix of dimension 6: \n', np.eye(6))

Identity matrix of dimension 6: 
 [[1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1.]]


In [59]:
## Integers are drawn from [low, high) = [1, 20), so 20 itself never appears.
## NOTE(review): no seed is set in the visible cells, so the values differ on every run.
print('Random matrix of shape (3, 4): \n', np.random.randint(low=1, high=20, size=(3, 4)))

Random matrix of shape (3, 4): 
 [[12  6  9  3]
 [19  5  5  9]
 [ 6  5  7 17]]


### Reshaping, Ravel, Min, Max, Sorting

In [60]:
## 30 random integers in [1, 200), then the same data viewed with two other shapes
a = np.random.randint(low=1, high=200, size=30)
b = a.reshape((2, 3, 5))
c = a.reshape((6, 5))

print('Shape of a: ', a.shape)
print('Shape of b: ', b.shape)
print('Shape of c: ', c.shape)

Shape of a:  (30,)
Shape of b:  (2, 3, 5)
Shape of c:  (6, 5)


In [65]:
## The flat, 1-D array of 30 values
print('a: \n', a)

a: 
 [ 17  71 115  87  22 135 112  70 168  29 129  13 181 178 111  41 152  53
 169  31 142  48 118  20  10 197  70 176 140  76]


In [66]:
## The same 30 values viewed as 2 blocks of 3 rows x 5 columns
print('b: \n', b)

b: 
 [[[ 17  71 115  87  22]
  [135 112  70 168  29]
  [129  13 181 178 111]]

 [[ 41 152  53 169  31]
  [142  48 118  20  10]
  [197  70 176 140  76]]]


In [67]:
## The same 30 values viewed as 6 rows x 5 columns
print('c: \n', c)

c: 
 [[ 17  71 115  87  22]
 [135 112  70 168  29]
 [129  13 181 178 111]
 [ 41 152  53 169  31]
 [142  48 118  20  10]
 [197  70 176 140  76]]


In [69]:
## Flattening the 3-D array back to 1-D; the values come out in the same order as in `a`
b_flat = b.ravel()
print(b_flat)

[ 17  71 115  87  22 135 112  70 168  29 129  13 181 178 111  41 152  53
 169  31 142  48 118  20  10 197  70 176 140  76]


### Indexing & Slicing

In [70]:
## Integers 1..19 (the stop value 20 is excluded)
arr = np.arange(1, 20)
arr

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19])

In [74]:
## Indexing is zero-based and a slice start:stop:step excludes `stop`.
## Each label describes the indices that are actually selected.
print('Element at index 5: ', arr[5])
print('Elements at indices 3 and 4 (slice 3:5): ', arr[3:5])
print('First 4 elements (indices 0 to 3): ', arr[:4])
print('Elements in reverse: ', arr[::-1])
## Walks backwards from the last element in steps of 2 and stops before index 6
print('Every 2nd element from the last, down to index 7: ', arr[-1:6:-2])
## A list of indices picks exactly those positions
print('Elements at indices 3, 6, 9: ', arr[[3, 6, 9]])

5th element:  6
Elements from 3rd to 5th index:  [4 5]
Elements up to 4th index are: [1 2 3 4]
Elements in reverse:  [19 18 17 16 15 14 13 12 11 10  9  8  7  6  5  4  3  2  1]
3 elements from last backwards:  [19 17 15 13 11  9]
Elements from 3rd, 6th, 9th index:  [ 4  7 10]


In [80]:
## 30 random integers in [1, 100) arranged as a 6 x 5 matrix
## NOTE(review): unseeded, so the displayed values are not reproducible
mat = np.random.randint(1, 100, 30).reshape(6, 5)
mat

array([[45, 49, 79, 90, 86],
       [45, 75, 47, 42, 36],
       [18, 73, 44,  6, 68],
       [73, 23, 86, 30, 50],
       [35, 77, 21,  1, 93],
       [69, 97, 46, 92, 58]])

In [86]:
## 2-D indexing is mat[row, column], zero-based on both axes.
## A single [row, col] lookup replaces the chained mat[row][col], which first
## builds a temporary row and then indexes into it.
print('Element at row index 1, column index 3: ', mat[1, 3])
print('Entire row at index 2: ', mat[2])
print('Entire column at index 3: ', mat[:, 3])
print('Matrix with row indices 1, 2 and column 3, 4: \n', mat[1:3, 3:5])

Element in 1st row, 3rd col:  42
Entire row at index 2:  [18 73 44  6 68]
Entire 3rd column:  [90 42  6 30  1 92]
Matrix with row indices 1, 2 and column 3, 4: 
 [[42 36]
 [ 6 68]]


### Conditional Subsetting

In [87]:
## Rebinds `mat` to a fresh 4 x 5 matrix of random integers in [1, 10)
mat = np.random.randint(1, 10, 20).reshape(4, 5)
mat

array([[9, 1, 4, 5, 5],
       [8, 6, 6, 6, 2],
       [2, 6, 1, 5, 6],
       [8, 7, 2, 3, 8]])

In [90]:
## `mat > 5` is a boolean mask with the same shape as `mat`;
## indexing with it returns the matching elements as a flat 1-D array
print('Elements greater than 5: ', mat[mat>5])
print(mat>5)

Elements greater than 5:  [9 8 6 6 6 6 6 8 7 8]
[[ True False False False False]
 [ True  True  True  True False]
 [False  True False False  True]
 [ True  True False False  True]]


In [91]:
## True/False act as 1/0 in arithmetic, so entries that are not > 5 become 0
## while the matrix keeps its shape
mat*(mat>5)

array([[9, 0, 0, 0, 0],
       [8, 6, 6, 6, 0],
       [0, 6, 0, 0, 6],
       [8, 7, 0, 0, 8]])

### Array operations

In [95]:
## Two 3 x 3 operand matrices: mat1 holds values in [1, 10), mat2 in [11, 20)
mat1 = np.random.randint(low=1, high=10, size=9).reshape((3, 3))
mat2 = np.random.randint(low=11, high=20, size=9).reshape((3, 3))

In [112]:
## Display both operand matrices, separated by a blank line
print('Matrix 1: \n', mat1)
print()
print('Matrix 2: \n', mat2)

Matrix 1: 
 [[3 1 7]
 [3 9 1]
 [9 8 5]]

Matrix 2: 
 [[12 16 18]
 [17 18 19]
 [17 11 18]]


In [113]:
## Element-wise sum of the two matrices
print('Addition: \n', mat1 + mat2)

Addition: 
 [[15 17 25]
 [20 27 20]
 [26 19 23]]


In [114]:
## `*` is the element-wise product, not the matrix product
print('Multiplication: \n', mat1 * mat2)

Multiplication: 
 [[ 36  16 126]
 [ 51 162  19]
 [153  88  90]]


In [115]:
## Element-wise true division; the result is float even though both inputs hold integers
print('Division: \n', mat1 / mat2)

Division: 
 [[0.25       0.0625     0.38888889]
 [0.17647059 0.5        0.05263158]
 [0.52941176 0.72727273 0.27777778]]


In [116]:
## Scalars scale every element, so 3*mat1 - 2*mat2 is computed element-wise.
## The label ends with a newline so the matrix rows line up in the output.
print('Linear combination: \n', 3*mat1 - 2*mat2)

Linear combination:  [[-15 -29 -15]
 [-25  -9 -35]
 [ -7   2 -21]]


In [117]:
## The scalar is added to every element of the matrix
print('Addition of scalar: \n', 200 + mat1)

Addition of scalar: 
 [[203 201 207]
 [203 209 201]
 [209 208 205]]


In [118]:
## `**` cubes each element individually; this is not the matrix power mat1 @ mat1 @ mat1
print('Cube of a matrix: \n', mat1 ** 3)

Cube of a matrix: 
 [[ 27   1 343]
 [ 27 729   1]
 [729 512 125]]


In [120]:
## The built-in pow() works element-wise on the array; exponent 0.5 gives square roots
print('Square root using pow: \n', pow(mat1, 0.5))

Square root using pow: 
 [[1.73205081 1.         2.64575131]
 [1.73205081 3.         1.        ]
 [3.         2.82842712 2.23606798]]


# Pandas

### Series

In [123]:
## Raw inputs for the Series examples: labels, values, an array of the values,
## and a dict pairing each label with its value
labels = ['a', 'b', 'c']
data = [10, 20, 30]
arr = np.array(data)
d = {label: value for label, value in zip(labels, data)}
d

{'a': 10, 'b': 20, 'c': 30}

In [124]:
## With no index given, pandas assigns the default integer index 0, 1, 2
s1 = pd.Series(data=data)
s1

0    10
1    20
2    30
dtype: int64

In [125]:
## Use the labels as the index instead of the default integers
s2 = pd.Series(data=data, index=labels)
s2

a    10
b    20
c    30
dtype: int64

In [126]:
## From a dict: the keys become the index and the values become the data
s3 = pd.Series(d)
s3

a    10
b    20
c    30
dtype: int64

### DataFrames

In [132]:
## Each dict key becomes a column; every value is a scalar, so it is repeated
## for each entry of the given index
df = pd.DataFrame(d, index=['x', 'y', 'z'])
df.head()

Unnamed: 0,a,b,c
x,10,20,30
y,10,20,30
z,10,20,30


In [133]:
## Load the housing CSV; the path is relative to the notebook's working directory.
## This rebinds `df`, replacing the small demo frame built from the dict.
df = pd.read_csv('data/boston_housing.csv')
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


### Quick-checking DataFrames
- df.head()
- df.tail()
- df.sample()
- df.info()
- df.describe()

In [134]:
## First rows of the frame (5 by default)
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [135]:
## Last rows of the frame (5 by default)
df.tail()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.12,76.7,2.2875,1,273,21.0,396.9,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.9,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0
505,0.04741,0.0,11.93,0,0.573,6.03,80.8,2.505,1,273,21.0,396.9,7.88,11.9


In [136]:
## One randomly chosen row; no random_state is passed, so the row changes from run to run
df.sample()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
152,1.12658,0.0,19.58,1,0.871,5.012,88.0,1.6102,5,403,14.7,343.28,12.12,15.3


In [137]:
## Index range, per-column non-null counts and dtypes, and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
CRIM       506 non-null float64
ZN         506 non-null float64
INDUS      506 non-null float64
CHAS       506 non-null int64
NOX        506 non-null float64
RM         506 non-null float64
AGE        506 non-null float64
DIS        506 non-null float64
RAD        506 non-null int64
TAX        506 non-null int64
PTRATIO    506 non-null float64
B          506 non-null float64
LSTAT      506 non-null float64
PRICE      506 non-null float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB


In [138]:
## Summary statistics per column: count, mean, std, min, quartiles, max
df.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.593761,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.596783,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.647422,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [140]:
df.describe().T  ## transposed summary: one row per column, one column per statistic

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
CRIM,506.0,3.593761,8.596783,0.00632,0.082045,0.25651,3.647422,88.9762
ZN,506.0,11.363636,23.322453,0.0,0.0,0.0,12.5,100.0
INDUS,506.0,11.136779,6.860353,0.46,5.19,9.69,18.1,27.74
CHAS,506.0,0.06917,0.253994,0.0,0.0,0.0,0.0,1.0
NOX,506.0,0.554695,0.115878,0.385,0.449,0.538,0.624,0.871
RM,506.0,6.284634,0.702617,3.561,5.8855,6.2085,6.6235,8.78
AGE,506.0,68.574901,28.148861,2.9,45.025,77.5,94.075,100.0
DIS,506.0,3.795043,2.10571,1.1296,2.100175,3.20745,5.188425,12.1265
RAD,506.0,9.549407,8.707259,1.0,4.0,5.0,24.0,24.0
TAX,506.0,408.237154,168.537116,187.0,279.0,330.0,666.0,711.0


### Basic Descriptive Stats on DataFrames
- df.mean()
- df.std()
- df.var()
- df.min(), df.max()

In [141]:
## Mean of every column, returned as a Series indexed by column name
df.mean()

CRIM         3.593761
ZN          11.363636
INDUS       11.136779
CHAS         0.069170
NOX          0.554695
RM           6.284634
AGE         68.574901
DIS          3.795043
RAD          9.549407
TAX        408.237154
PTRATIO     18.455534
B          356.674032
LSTAT       12.653063
PRICE       22.532806
dtype: float64

In [142]:
## Standard deviation of every column (pandas uses the sample formula, ddof=1, by default)
df.std()

CRIM         8.596783
ZN          23.322453
INDUS        6.860353
CHAS         0.253994
NOX          0.115878
RM           0.702617
AGE         28.148861
DIS          2.105710
RAD          8.707259
TAX        168.537116
PTRATIO      2.164946
B           91.294864
LSTAT        7.141062
PRICE        9.197104
dtype: float64

In [143]:
## Variance of every column (the square of the standard deviation shown by df.std())
df.var()

CRIM          73.904671
ZN           543.936814
INDUS         47.064442
CHAS           0.064513
NOX            0.013428
RM             0.493671
AGE          792.358399
DIS            4.434015
RAD           75.816366
TAX        28404.759488
PTRATIO        4.686989
B           8334.752263
LSTAT         50.994760
PRICE         84.586724
dtype: float64

In [144]:
## Smallest value in every column
df.min()

CRIM         0.00632
ZN           0.00000
INDUS        0.46000
CHAS         0.00000
NOX          0.38500
RM           3.56100
AGE          2.90000
DIS          1.12960
RAD          1.00000
TAX        187.00000
PTRATIO     12.60000
B            0.32000
LSTAT        1.73000
PRICE        5.00000
dtype: float64

In [147]:
## Largest value in every column
df.max()

CRIM        88.9762
ZN         100.0000
INDUS       27.7400
CHAS         1.0000
NOX          0.8710
RM           8.7800
AGE        100.0000
DIS         12.1265
RAD         24.0000
TAX        711.0000
PTRATIO     22.0000
B          396.9000
LSTAT       37.9700
PRICE       50.0000
dtype: float64

### Indexing, Slicing

In [151]:
## Selecting a single column by name returns a Series
print('The CRIM column: \n', df['CRIM'])

The CRIM column: 
 0      0.00632
1      0.02731
2      0.02729
3      0.03237
4      0.06905
        ...   
501    0.06263
502    0.04527
503    0.06076
504    0.10959
505    0.04741
Name: CRIM, Length: 506, dtype: float64


In [159]:
## loc selects rows by index label; the labels here are the RangeIndex integers 200 and 500
print('Label-based loc method used for selecting rows: \n', df.loc[[200, 500]])

Label-based loc method used for selecting rows: 
         CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD  TAX  \
200  0.01778  95.0   1.47     0  0.403  7.135  13.9  7.6534    3  402   
500  0.22438   0.0   9.69     0  0.585  6.027  79.7  2.4982    6  391   

     PTRATIO      B  LSTAT  PRICE  
200     17.0  384.3   4.45   32.9  
500     19.2  396.9  14.33   16.8  


In [157]:
## iloc selects rows by integer position; with the default RangeIndex, positions
## and labels coincide, so this returns the same rows as the loc call
print('Index position based iloc: \n', df.iloc[[200, 500]])

Index position based iloc: 
         CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD  TAX  \
200  0.01778  95.0   1.47     0  0.403  7.135  13.9  7.6534    3  402   
500  0.22438   0.0   9.69     0  0.585  6.027  79.7  2.4982    6  391   

     PTRATIO      B  LSTAT  PRICE  
200     17.0  384.3   4.45   32.9  
500     19.2  396.9  14.33   16.8  


### Conditional subsetting

In [160]:
## Comparing a column with a scalar yields a boolean Series (one True/False per row)
df['TAX'] > 200

0      True
1      True
2      True
3      True
4      True
       ... 
501    True
502    True
503    True
504    True
505    True
Name: TAX, Length: 506, dtype: bool

In [165]:
## Boolean-mask filtering: keep only the rows whose TAX exceeds 500
print('Subset where TAX > 500')
df.loc[df['TAX'].gt(500)]

Subset where TAX > 500


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
356,8.98296,0.0,18.10,1,0.770,6.212,97.4,2.1222,24,666,20.2,377.73,17.60,17.8
357,3.84970,0.0,18.10,1,0.770,6.395,91.0,2.5052,24,666,20.2,391.34,13.27,21.7
358,5.20177,0.0,18.10,1,0.770,6.127,83.4,2.7227,24,666,20.2,395.43,11.48,22.7
359,4.26131,0.0,18.10,0,0.770,6.112,81.3,2.5091,24,666,20.2,390.74,12.67,22.6
360,4.54192,0.0,18.10,0,0.770,6.398,88.0,2.5182,24,666,20.2,374.56,7.79,25.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
488,0.15086,0.0,27.74,0,0.609,5.454,92.7,1.8209,4,711,20.1,395.09,18.06,15.2
489,0.18337,0.0,27.74,0,0.609,5.414,98.3,1.7554,4,711,20.1,344.05,23.97,7.0
490,0.20746,0.0,27.74,0,0.609,5.093,98.0,1.8226,4,711,20.1,318.43,29.68,8.1
491,0.10574,0.0,27.74,0,0.609,5.983,98.8,1.8681,4,711,20.1,390.11,18.07,13.6


In [166]:
## Two conditions combined with `&`: both must hold for a row to be kept
print('Subset where TAX > 500 & AGE > 50')
df.loc[df['TAX'].gt(500) & df['AGE'].gt(50)]

Subset where TAX > 500 & AGE > 50


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
356,8.98296,0.0,18.10,1,0.770,6.212,97.4,2.1222,24,666,20.2,377.73,17.60,17.8
357,3.84970,0.0,18.10,1,0.770,6.395,91.0,2.5052,24,666,20.2,391.34,13.27,21.7
358,5.20177,0.0,18.10,1,0.770,6.127,83.4,2.7227,24,666,20.2,395.43,11.48,22.7
359,4.26131,0.0,18.10,0,0.770,6.112,81.3,2.5091,24,666,20.2,390.74,12.67,22.6
360,4.54192,0.0,18.10,0,0.770,6.398,88.0,2.5182,24,666,20.2,374.56,7.79,25.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
488,0.15086,0.0,27.74,0,0.609,5.454,92.7,1.8209,4,711,20.1,395.09,18.06,15.2
489,0.18337,0.0,27.74,0,0.609,5.414,98.3,1.7554,4,711,20.1,344.05,23.97,7.0
490,0.20746,0.0,27.74,0,0.609,5.093,98.0,1.8226,4,711,20.1,318.43,29.68,8.1
491,0.10574,0.0,27.74,0,0.609,5.983,98.8,1.8681,4,711,20.1,390.11,18.07,13.6


### Operations on specific columns

In [172]:
## Mean and standard deviation of a single column, displayed together as a tuple
df['TAX'].mean(), df['TAX'].std()

(408.2371541501976, 168.53711605495903)

In [197]:
## Range = largest value minus smallest value of the column
print('Range of TAX in the dataset:')
df['TAX'].max() - df['TAX'].min()

Range of TAX in the dataset:


524

In [198]:
## np.percentile(x, 95) returns one number: the value below which 95% of the
## TAX observations fall. It is a threshold, not a list of the top items.
print('95th percentile of TAX: ', np.percentile(df['TAX'], 95))

Top-5 percentile items in TAX:  666.0


### Creating a new column

In [199]:
## Assigning to a new key adds the column to `df` in place; each value is TAX * 20
df['new_column'] = df['TAX'] * 20
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE,new_column
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0,5920
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6,4840
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7,4840
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4,4440
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2,4440


In [200]:
## Returns a new frame sorted ascending by new_column; the result is not
## assigned, so `df` itself keeps its original row order
df.sort_values(by='new_column')

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE,new_column
353,0.01709,90.0,2.02,0,0.410,6.728,36.1,12.1265,5,187,17.0,384.46,4.50,30.1,3740
123,0.15038,0.0,25.65,0,0.581,5.856,97.0,1.9444,2,188,19.1,370.31,25.41,17.3,3760
122,0.09299,0.0,25.65,0,0.581,5.961,92.9,2.0869,2,188,19.1,378.09,17.93,20.5,3760
126,0.38735,0.0,25.65,0,0.581,5.613,95.6,1.7572,2,188,19.1,359.29,27.26,15.7,3760
125,0.16902,0.0,25.65,0,0.581,5.986,88.4,1.9929,2,188,19.1,385.02,14.81,21.4,3760
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
492,0.11132,0.0,27.74,0,0.609,5.983,83.5,2.1099,4,711,20.1,396.90,13.35,20.1,14220
491,0.10574,0.0,27.74,0,0.609,5.983,98.8,1.8681,4,711,20.1,390.11,18.07,13.6,14220
490,0.20746,0.0,27.74,0,0.609,5.093,98.0,1.8226,4,711,20.1,318.43,29.68,8.1,14220
489,0.18337,0.0,27.74,0,0.609,5.414,98.3,1.7554,4,711,20.1,344.05,23.97,7.0,14220
