# NumPy | Pandas | Matplotlib | Seaborn

### Auto-managed closed tools
Qlik | Tableau | Looker | Zoho Analytics

### Programming Languages
Python | R | Julia

## Data Extraction

SQL

Scraping

File Formats

    CSV
    JSON
    XML
Consulting APIs

Buying Data

Distributed Databases

## Data Cleaning

Missing values and empty data

Data imputation

Incorrect types

Incorrect or invalid values

Outliers and irrelevant data

Statistical sanitization

## Data Wrangling

Hierarchical Data

Handling Categorical data

Reshaping and transforming structures

Indexing data for quick access

Merging, combining and joining data

## Analysis

Exploration

Building Statistical models

Visualization and representations

Correlation vs Causation analysis

Hypothesis testing

Statistical analysis

Reporting

## Action

Building Machine Learning models

Feature Engineering

Moving ML into production

Building ETL pipelines

Live dashboard and reporting

Decision making and real-life tests

## Python Ecosystem

### Libraries

pandas | matplotlib | numpy | seaborn | statsmodels | scipy | scikit-learn 

# NumPy

In [1]:
import sys
import numpy as np

## Basic NumPy arrays

In [2]:
np.array([1, 2, 3, 4])

array([1, 2, 3, 4])

In [4]:
a = np.array([1, 2, 3, 4])
a

array([1, 2, 3, 4])

In [5]:
b = np.array([5, 6, 7, 8])
b

array([5, 6, 7, 8])

In [6]:
a + b

array([ 6,  8, 10, 12])

In [7]:
a * b 

array([ 5, 12, 21, 32])

In [8]:
a - b 

array([-4, -4, -4, -4])

In [9]:
a / b 

array([0.2       , 0.33333333, 0.42857143, 0.5       ])

In [10]:
a[0], a[1] 

(1, 2)

In [14]:
a[0:2]

array([1, 2])

In [12]:
b[1:3] 

array([6, 7])

In [13]:
a[::2]

array([1, 3])

In [15]:
b

array([5, 6, 7, 8])

In [16]:
b[0], b[1], b[-1]

(5, 6, 8)

In [17]:
b[[0, 1, -1]]

array([5, 6, 8])

## Speed - Arrays vs Lists

In [99]:
lista = list(range(1000000))
arr = np.arange(1000000)

In [100]:
%time for _ in range(10): arr2 = arr * 2

CPU times: user 11.3 ms, sys: 4.77 ms, total: 16.1 ms
Wall time: 15 ms


In [101]:
%time for _ in range(10): lista2 = [x * 2 for x in lista]

CPU times: user 402 ms, sys: 56.8 ms, total: 459 ms
Wall time: 458 ms


## Array Types

In [18]:
a

array([1, 2, 3, 4])

In [19]:
a.dtype

dtype('int64')

In [21]:
c = np.array([0.1, 0.2, 0.3, 0.4])

In [22]:
c

array([0.1, 0.2, 0.3, 0.4])

In [23]:
c.dtype

dtype('float64')

In [24]:
# Request a floating-point array explicitly. The builtin `float` maps to
# float64; the old `np.float` alias no longer exists in current NumPy.
np.array([1, 2, 3, 4], dtype=float)

array([1., 2., 3., 4.])

In [25]:
d = np.array(['a', 'b', 'c', 'd'])

In [26]:
d

array(['a', 'b', 'c', 'd'], dtype='<U1')

In [27]:
d.dtype

dtype('<U1')

In [29]:
e = np.array([{'a' : 1}, sys])

In [30]:
e.dtype

dtype('O')

## Dimensions and Shapes

In [31]:
A = np.array([
    [1, 2, 3],
    [4, 5, 6]
])

In [32]:
A

array([[1, 2, 3],
       [4, 5, 6]])

In [33]:
A.shape

(2, 3)

In [34]:
A.ndim

2

In [35]:
A.size

6

In [36]:
B = np.array([
    [
        [1, 2, 3],
        [4, 5, 6],
    ],
    [
        [7, 8, 9],
        [10, 11, 12]
    ]
])

In [37]:
B

array([[[ 1,  2,  3],
        [ 4,  5,  6]],

       [[ 7,  8,  9],
        [10, 11, 12]]])

In [38]:
B.shape

(2, 2, 3)

In [39]:
B.ndim

3

In [40]:
B.size

12

## Indexing and Slicing of Matrices

In [54]:
A = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9]
])

In [55]:
A

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [56]:
A[1]

array([4, 5, 6])

In [57]:
A[1][1]

5

In [58]:
A[1, 0]

4

In [59]:
A[:, :2]

array([[1, 2],
       [4, 5],
       [7, 8]])

In [60]:
A[:2, 2:]

array([[3],
       [6]])

In [61]:
A[:2, :2]

array([[1, 2],
       [4, 5]])

In [62]:
A[1] = np.array([10, 10, 10])

In [63]:
A

array([[ 1,  2,  3],
       [10, 10, 10],
       [ 7,  8,  9]])

In [64]:
A[2] = 99

In [65]:
A

array([[ 1,  2,  3],
       [10, 10, 10],
       [99, 99, 99]])

## Summary Statistics

In [66]:
a = np.array([1, 2, 3, 4])

In [67]:
a.sum()

10

In [68]:
a.mean()

2.5

In [69]:
a.std()

1.118033988749895

In [70]:
a.var()

1.25

In [71]:
A = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9]
])

In [72]:
A.sum()

45

In [73]:
A.mean()

5.0

In [74]:
A.std()

2.581988897471611

In [75]:
A.var()

6.666666666666667

In [76]:
A.sum(axis=1)

array([ 6, 15, 24])

In [80]:
A.sum(axis=0)

array([12, 15, 18])

In [77]:
A.mean(axis=1)

array([2., 5., 8.])

In [81]:
A.mean(axis=0)

array([4., 5., 6.])

In [78]:
A.std(axis=1)

array([0.81649658, 0.81649658, 0.81649658])

In [79]:
A.var(axis=1)

array([0.66666667, 0.66666667, 0.66666667])

## Broadcasting and Vectorized Operations

In [82]:
a = np.arange(4)

In [83]:
a

array([0, 1, 2, 3])

In [84]:
a + 10

array([10, 11, 12, 13])

In [85]:
a * 10

array([ 0, 10, 20, 30])

In [88]:
a += 100

In [89]:
a

array([100, 101, 102, 103])

## Boolean Arrays

In [102]:
a = np.arange(4)

In [103]:
a

array([0, 1, 2, 3])

In [104]:
a[[0, -1]]

array([0, 3])

In [105]:
a[[True, False, False, True]]

array([0, 3])

In [106]:
a >= 2

array([False, False,  True,  True])

In [108]:
a[a >= 2]

array([2, 3])

In [109]:
a.mean()

1.5

In [110]:
a[a > a.mean()]

array([2, 3])

In [111]:
a[~(a > a.mean())]

array([0, 1])

In [112]:
a[(a == 0) | (a == 1)]

array([0, 1])

In [113]:
a[(a <= 2) & (a % 2 == 0)]

array([0, 2])

In [114]:
A = np.random.randint(100, size=(3, 3))

In [115]:
A

array([[75, 82, 88],
       [37, 90, 68],
       [99, 65, 42]])

In [116]:
A[np.array([
    [True, False, True],
    [False, True, False],
    [True, False, True]
])]

array([75, 88, 90, 99, 42])

In [118]:
A > 50

array([[ True,  True,  True],
       [False,  True,  True],
       [ True,  True, False]])

## Linear Algebra

In [119]:
A = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9]
])

In [120]:
A

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [121]:
B = np.array([
    [10, 11, 12],
    [13, 14, 15],
    [16, 17, 18]
])

In [122]:
A.dot(B)

array([[ 84,  90,  96],
       [201, 216, 231],
       [318, 342, 366]])

In [123]:
A @ B

array([[ 84,  90,  96],
       [201, 216, 231],
       [318, 342, 366]])

In [124]:
B.T

array([[10, 13, 16],
       [11, 14, 17],
       [12, 15, 18]])

In [125]:
A

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [126]:
B.T @ A

array([[174, 213, 252],
       [186, 228, 270],
       [198, 243, 288]])

## Useful NumPy Functions

### random

In [127]:
np.random.random(size=2)

array([0.20680412, 0.83912217])

In [128]:
np.random.normal(size=2)

array([-1.89686269,  0.56548912])

In [129]:
np.random.rand(2, 4)

array([[0.16145687, 0.58532728, 0.63904779, 0.51947092],
       [0.29336215, 0.03163814, 0.59360999, 0.70137197]])

### arange

In [130]:
np.arange(10)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [131]:
np.arange(5, 10)

array([5, 6, 7, 8, 9])

In [132]:
np.arange(0, 1, .1)

array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])

### reshape

In [133]:
np.arange(10).reshape(2, 5)

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9]])

In [134]:
np.arange(10).reshape(5, 2)

array([[0, 1],
       [2, 3],
       [4, 5],
       [6, 7],
       [8, 9]])

### linspace

In [135]:
np.linspace(0, 1, 5)

array([0.  , 0.25, 0.5 , 0.75, 1.  ])

In [136]:
np.linspace(0, 1, 20)

array([0.        , 0.05263158, 0.10526316, 0.15789474, 0.21052632,
       0.26315789, 0.31578947, 0.36842105, 0.42105263, 0.47368421,
       0.52631579, 0.57894737, 0.63157895, 0.68421053, 0.73684211,
       0.78947368, 0.84210526, 0.89473684, 0.94736842, 1.        ])

In [137]:
np.linspace(0, 1, 20, False)

array([0.  , 0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 ,
       0.55, 0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95])

### zeros, ones, empty

In [138]:
np.zeros(5)

array([0., 0., 0., 0., 0.])

In [140]:
np.zeros((3, 3))

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [141]:
# Integer zeros: pass the builtin `int` as dtype. The old `np.int` alias
# no longer exists in current NumPy.
np.zeros((3, 3), dtype=int)

array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 0]])

In [142]:
np.ones(5)

array([1., 1., 1., 1., 1.])

In [143]:
np.ones((3, 3))

array([[1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.]])

In [144]:
# Integer ones: pass the builtin `int` as dtype. The old `np.int` alias
# no longer exists in current NumPy.
np.ones((3, 3), dtype=int)

array([[1, 1, 1],
       [1, 1, 1],
       [1, 1, 1]])

In [145]:
np.empty(5)

array([1., 1., 1., 1., 1.])

In [146]:
np.empty((2, 2))

array([[0.25, 0.5 ],
       [0.75, 1.  ]])

### identity and eye

In [147]:
np.identity(3)

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [148]:
np.eye(3, 3)

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [149]:
np.eye(8, 4)

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [150]:
np.eye(8, 4, k=1)

array([[0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [152]:
np.eye(8, 4, k=-3)

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 0.]])

In [155]:
'Hello World!'[6]

'W'

# Pandas

In [1]:
import pandas as pd
import numpy as np

## Pandas Series

In [3]:
g7_pop = pd.Series([35.467, 63.951, 80.940, 60.665, 127.061, 64.511, 318.523])
g7_pop

0     35.467
1     63.951
2     80.940
3     60.665
4    127.061
5     64.511
6    318.523
dtype: float64

In [4]:
g7_pop.name = 'G7 Population in millions'

In [5]:
g7_pop

0     35.467
1     63.951
2     80.940
3     60.665
4    127.061
5     64.511
6    318.523
Name: G7 Population in millions, dtype: float64

In [6]:
g7_pop.dtype

dtype('float64')

In [7]:
g7_pop.values 

array([ 35.467,  63.951,  80.94 ,  60.665, 127.061,  64.511, 318.523])

In [8]:
g7_pop[0]

35.467

In [9]:
g7_pop.index

RangeIndex(start=0, stop=7, step=1)

In [10]:
g7_pop.index = ['Canada',
                'France',
                'Germany',
                'Italy',
                'Japan',
                'United Kingdom',
                'United States'] 

In [11]:
g7_pop.index

Index(['Canada', 'France', 'Germany', 'Italy', 'Japan', 'United Kingdom',
       'United States'],
      dtype='object')

In [12]:
g7_pop

Canada             35.467
France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: G7 Population in millions, dtype: float64

In [14]:
pd.Series({
    'Canada' : 35.467,
    'France' : 63.951,
    'Germany' : 80.940,
    'Italy' : 60.665,
    'Japan' : 127.061,
    'United Kingdom' : 64.511,
    'United States' : 318.523
}, name = 'G7 Population in millions')

Canada             35.467
France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: G7 Population in millions, dtype: float64

### Indexing

In [15]:
g7_pop['Canada']

35.467

In [16]:
g7_pop['Japan']

127.061

In [17]:
g7_pop.iloc[0]

35.467

In [18]:
g7_pop.iloc[-1]

318.523

In [19]:
g7_pop[['Italy', 'France']]

Italy     60.665
France    63.951
Name: G7 Population in millions, dtype: float64

In [21]:
# Select by position (third element and second-to-last) with .iloc.
# Plain [] with integers on a string-labelled Series depends on a positional
# fallback that newer pandas versions deprecate.
g7_pop.iloc[[2, -2]]

Germany           80.940
United Kingdom    64.511
Name: G7 Population in millions, dtype: float64

### Conditional Selection (boolean arrays)

In [22]:
g7_pop

Canada             35.467
France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: G7 Population in millions, dtype: float64

In [23]:
g7_pop > 70

Canada            False
France            False
Germany            True
Italy             False
Japan              True
United Kingdom    False
United States      True
Name: G7 Population in millions, dtype: bool

In [25]:
g7_pop[g7_pop > 70]

Germany           80.940
Japan            127.061
United States    318.523
Name: G7 Population in millions, dtype: float64

In [28]:
gpop_mean = g7_pop.mean()
gpop_mean

107.30257142857144

In [29]:
g7_pop[g7_pop > gpop_mean]

Japan            127.061
United States    318.523
Name: G7 Population in millions, dtype: float64

In [31]:
gpop_std = g7_pop.std()
gpop_std

97.24996987121581

In [32]:
# Combine two boolean masks with `|`: values above (mean - std/2) or above
# (mean + std/2). Both thresholds are scalars compared against the Series.
g7_pop[
    (g7_pop > gpop_mean - gpop_std / 2)
    | (g7_pop > gpop_mean + gpop_std / 2)
]

France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: G7 Population in millions, dtype: float64

### Operations and Methods

In [35]:
g7_pop *= 1_000_000

In [36]:
g7_pop

Canada             35467000.0
France             63951000.0
Germany            80940000.0
Italy              60665000.0
Japan             127061000.0
United Kingdom     64511000.0
United States     318523000.0
Name: G7 Population in millions, dtype: float64

In [38]:
gpop_mean = g7_pop.mean()
gpop_mean

107302571.42857143

In [39]:
np.log(g7_pop)

Canada            17.384113
France            17.973628
Germany           18.209219
Italy             17.920877
Japan             18.660178
United Kingdom    17.982346
United States     19.579205
Name: G7 Population in millions, dtype: float64

In [41]:
g7_pop['France' : 'Italy'].mean()

68518666.66666667

### Modifying Series

In [45]:
g7_pop['Canada'] = 40.5
g7_pop

Canada                   40.5
France             63951000.0
Germany            80940000.0
Italy              60665000.0
Japan             127061000.0
United Kingdom     64511000.0
United States     318523000.0
Name: G7 Population in millions, dtype: float64

In [46]:
g7_pop.iloc[-1] = 500
g7_pop

Canada                   40.5
France             63951000.0
Germany            80940000.0
Italy              60665000.0
Japan             127061000.0
United Kingdom     64511000.0
United States           500.0
Name: G7 Population in millions, dtype: float64

In [47]:
g7_pop[g7_pop < 70] = 99.99
g7_pop

Canada            9.999000e+01
France            6.395100e+07
Germany           8.094000e+07
Italy             6.066500e+07
Japan             1.270610e+08
United Kingdom    6.451100e+07
United States     5.000000e+02
Name: G7 Population in millions, dtype: float64

## Pandas DataFrame

In [48]:
# G7 indicators written as one record per country.
# Record layout: Population, GDP, Surface Area, HDI, Continent.
# No index is passed, so rows get the default integer index.
df = pd.DataFrame(
    [
        (35.467, 1785387, 9984670, 0.913, 'America'),
        (63.951, 2833687, 640679, 0.888, 'Europe'),
        (80.94, 3874437, 357114, 0.916, 'Europe'),
        (60.665, 2167744, 301336, 0.873, 'Europe'),
        (127.061, 4602367, 377930, 0.891, 'Asia'),
        (64.511, 2950039, 242495, 0.907, 'Europe'),
        (318.523, 17348075, 9525067, 0.915, 'America'),
    ],
    columns=['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'],
)

In [49]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
0,35.467,1785387,9984670,0.913,America
1,63.951,2833687,640679,0.888,Europe
2,80.94,3874437,357114,0.916,Europe
3,60.665,2167744,301336,0.873,Europe
4,127.061,4602367,377930,0.891,Asia
5,64.511,2950039,242495,0.907,Europe
6,318.523,17348075,9525067,0.915,America


In [50]:
df.index = [
    'Canada',
    'France',
    'Germany',
    'Italy',
    'Japan',
    'United Kingdom',
    'United States',
]

In [51]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [52]:
df.columns

Index(['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'], dtype='object')

In [53]:
df.index

Index(['Canada', 'France', 'Germany', 'Italy', 'Japan', 'United Kingdom',
       'United States'],
      dtype='object')

In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7 entries, Canada to United States
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Population    7 non-null      float64
 1   GDP           7 non-null      int64  
 2   Surface Area  7 non-null      int64  
 3   HDI           7 non-null      float64
 4   Continent     7 non-null      object 
dtypes: float64(2), int64(2), object(1)
memory usage: 336.0+ bytes


In [55]:
df.size

35

In [56]:
df.shape

(7, 5)

In [57]:
df.describe()

Unnamed: 0,Population,GDP,Surface Area,HDI
count,7.0,7.0,7.0,7.0
mean,107.302571,5080248.0,3061327.0,0.900429
std,97.24997,5494020.0,4576187.0,0.016592
min,35.467,1785387.0,242495.0,0.873
25%,62.308,2500716.0,329225.0,0.8895
50%,64.511,2950039.0,377930.0,0.907
75%,104.0005,4238402.0,5082873.0,0.914
max,318.523,17348080.0,9984670.0,0.916


In [58]:
df.dtypes

Population      float64
GDP               int64
Surface Area      int64
HDI             float64
Continent        object
dtype: object

In [59]:
df.dtypes.value_counts()

float64    2
int64      2
object     1
dtype: int64

### Indexing, Selection and Slicing

In [60]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [62]:
df.loc['Canada']

Population       35.467
GDP             1785387
Surface Area    9984670
HDI               0.913
Continent       America
Name: Canada, dtype: object

In [63]:
df.iloc[-1]

Population       318.523
GDP             17348075
Surface Area     9525067
HDI                0.915
Continent        America
Name: United States, dtype: object

In [64]:
df['Population']

Canada             35.467
France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: Population, dtype: float64

In [65]:
df['Population'].to_frame()

Unnamed: 0,Population
Canada,35.467
France,63.951
Germany,80.94
Italy,60.665
Japan,127.061
United Kingdom,64.511
United States,318.523


In [66]:
df[['Population', 'GDP']]

Unnamed: 0,Population,GDP
Canada,35.467,1785387
France,63.951,2833687
Germany,80.94,3874437
Italy,60.665,2167744
Japan,127.061,4602367
United Kingdom,64.511,2950039
United States,318.523,17348075


In [67]:
df[1:3]

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe


In [68]:
df.loc['France' : 'Italy']

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe


In [69]:
df.loc['France' : 'Italy', 'Population']

France     63.951
Germany    80.940
Italy      60.665
Name: Population, dtype: float64

In [70]:
df.loc['France': 'Italy', ['Population', 'GDP']]

Unnamed: 0,Population,GDP
France,63.951,2833687
Germany,80.94,3874437
Italy,60.665,2167744


In [71]:
df.iloc[1:3, 3]

France     0.888
Germany    0.916
Name: HDI, dtype: float64

In [73]:
df.iloc[[0, 1, -1]]

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
United States,318.523,17348075,9525067,0.915,America


### Conditional Selection (boolean arrays)

In [74]:
df['Population'] > 70

Canada            False
France            False
Germany            True
Italy             False
Japan              True
United Kingdom    False
United States      True
Name: Population, dtype: bool

In [75]:
df.loc[df['Population'] > 70]

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Germany,80.94,3874437,357114,0.916,Europe
Japan,127.061,4602367,377930,0.891,Asia
United States,318.523,17348075,9525067,0.915,America


In [76]:
df.loc[df['Population'] > 70, 'Population']

Germany           80.940
Japan            127.061
United States    318.523
Name: Population, dtype: float64

In [77]:
df.loc[df['Population'] > 70, ['Population', 'GDP']]

Unnamed: 0,Population,GDP
Germany,80.94,3874437
Japan,127.061,4602367
United States,318.523,17348075


### Dropping 

In [78]:
df.drop('Canada')

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [79]:
df.drop(['Canada', 'Japan'])

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [80]:
df.drop(columns=['Population', 'HDI'])

Unnamed: 0,GDP,Surface Area,Continent
Canada,1785387,9984670,America
France,2833687,640679,Europe
Germany,3874437,357114,Europe
Italy,2167744,301336,Europe
Japan,4602367,377930,Asia
United Kingdom,2950039,242495,Europe
United States,17348075,9525067,America


In [81]:
df.drop(['Italy', 'Canada'], axis=0)

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [82]:
df.drop(['Population', 'HDI'], axis=1)

Unnamed: 0,GDP,Surface Area,Continent
Canada,1785387,9984670,America
France,2833687,640679,Europe
Germany,3874437,357114,Europe
Italy,2167744,301336,Europe
Japan,4602367,377930,Asia
United Kingdom,2950039,242495,Europe
United States,17348075,9525067,America


In [83]:
df.drop(['Population', 'HDI'], axis=1)

Unnamed: 0,GDP,Surface Area,Continent
Canada,1785387,9984670,America
France,2833687,640679,Europe
Germany,3874437,357114,Europe
Italy,2167744,301336,Europe
Japan,4602367,377930,Asia
United Kingdom,2950039,242495,Europe
United States,17348075,9525067,America


In [84]:
df.drop(['Population', 'HDI'], axis='columns')

Unnamed: 0,GDP,Surface Area,Continent
Canada,1785387,9984670,America
France,2833687,640679,Europe
Germany,3874437,357114,Europe
Italy,2167744,301336,Europe
Japan,4602367,377930,Asia
United Kingdom,2950039,242495,Europe
United States,17348075,9525067,America


In [85]:
df.drop(['Canada', 'Germany'], axis='rows')

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
France,63.951,2833687,640679,0.888,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


### Operations

In [87]:
df[['Population', 'GDP']]

Unnamed: 0,Population,GDP
Canada,35.467,1785387
France,63.951,2833687
Germany,80.94,3874437
Italy,60.665,2167744
Japan,127.061,4602367
United Kingdom,64.511,2950039
United States,318.523,17348075


In [88]:
df[['Population', 'GDP']] / 100

Unnamed: 0,Population,GDP
Canada,0.35467,17853.87
France,0.63951,28336.87
Germany,0.8094,38744.37
Italy,0.60665,21677.44
Japan,1.27061,46023.67
United Kingdom,0.64511,29500.39
United States,3.18523,173480.75


In [89]:
crisis = pd.Series([-1_000_000, -0.3], index=['GDP', 'HDI'])
crisis

GDP   -1000000.0
HDI         -0.3
dtype: float64

In [90]:
df[['GDP', 'HDI']]

Unnamed: 0,GDP,HDI
Canada,1785387,0.913
France,2833687,0.888
Germany,3874437,0.916
Italy,2167744,0.873
Japan,4602367,0.891
United Kingdom,2950039,0.907
United States,17348075,0.915


In [92]:
df[['GDP', 'HDI']] + crisis

Unnamed: 0,GDP,HDI
Canada,785387.0,0.613
France,1833687.0,0.588
Germany,2874437.0,0.616
Italy,1167744.0,0.573
Japan,3602367.0,0.591
United Kingdom,1950039.0,0.607
United States,16348075.0,0.615


### Modifying Values

In [93]:
langs = pd.Series(
    ['French', 'German', 'Italian'],
    index=['France', 'Germany', 'Italy'],
    name='Language'
)

In [94]:
langs

France      French
Germany     German
Italy      Italian
Name: Language, dtype: object

In [95]:
df['Language'] = langs

In [96]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
Canada,35.467,1785387,9984670,0.913,America,
France,63.951,2833687,640679,0.888,Europe,French
Germany,80.94,3874437,357114,0.916,Europe,German
Italy,60.665,2167744,301336,0.873,Europe,Italian
Japan,127.061,4602367,377930,0.891,Asia,
United Kingdom,64.511,2950039,242495,0.907,Europe,
United States,318.523,17348075,9525067,0.915,America,


In [97]:
df['Language'] = 'English'

In [98]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
Canada,35.467,1785387,9984670,0.913,America,English
France,63.951,2833687,640679,0.888,Europe,English
Germany,80.94,3874437,357114,0.916,Europe,English
Italy,60.665,2167744,301336,0.873,Europe,English
Japan,127.061,4602367,377930,0.891,Asia,English
United Kingdom,64.511,2950039,242495,0.907,Europe,English
United States,318.523,17348075,9525067,0.915,America,English


In [99]:
df.rename(
    columns={
        'HDI': 'Human Development Index',
        'Anual Popcorn Consumption': 'APC'
    }, index={
        'United States': 'USA',
        'United Kingdom': 'UK',
        'Argentina': 'AR'
    })

Unnamed: 0,Population,GDP,Surface Area,Human Development Index,Continent,Language
Canada,35.467,1785387,9984670,0.913,America,English
France,63.951,2833687,640679,0.888,Europe,English
Germany,80.94,3874437,357114,0.916,Europe,English
Italy,60.665,2167744,301336,0.873,Europe,English
Japan,127.061,4602367,377930,0.891,Asia,English
UK,64.511,2950039,242495,0.907,Europe,English
USA,318.523,17348075,9525067,0.915,America,English


In [100]:
df.rename(index=str.upper)

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
CANADA,35.467,1785387,9984670,0.913,America,English
FRANCE,63.951,2833687,640679,0.888,Europe,English
GERMANY,80.94,3874437,357114,0.916,Europe,English
ITALY,60.665,2167744,301336,0.873,Europe,English
JAPAN,127.061,4602367,377930,0.891,Asia,English
UNITED KINGDOM,64.511,2950039,242495,0.907,Europe,English
UNITED STATES,318.523,17348075,9525067,0.915,America,English


In [101]:
df.rename(index=lambda x: x.lower())

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent,Language
canada,35.467,1785387,9984670,0.913,America,English
france,63.951,2833687,640679,0.888,Europe,English
germany,80.94,3874437,357114,0.916,Europe,English
italy,60.665,2167744,301336,0.873,Europe,English
japan,127.061,4602367,377930,0.891,Asia,English
united kingdom,64.511,2950039,242495,0.907,Europe,English
united states,318.523,17348075,9525067,0.915,America,English


### Statistical Info

In [106]:
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
Canada,35.467,1785387,9984670,0.913,America
France,63.951,2833687,640679,0.888,Europe
Germany,80.94,3874437,357114,0.916,Europe
Italy,60.665,2167744,301336,0.873,Europe
Japan,127.061,4602367,377930,0.891,Asia
United Kingdom,64.511,2950039,242495,0.907,Europe
United States,318.523,17348075,9525067,0.915,America


In [107]:
population = df['Population']

In [108]:
population.min(), population.max()

(35.467, 318.523)

In [109]:
population.sum()

751.118

In [110]:
population.sum() / len(population)

107.30257142857144

In [111]:
population.mean()

107.30257142857144

In [113]:
population.std()

97.24996987121581

In [114]:
population.median()

64.511

In [115]:
population.describe()

count      7.000000
mean     107.302571
std       97.249970
min       35.467000
25%       62.308000
50%       64.511000
75%      104.000500
max      318.523000
Name: Population, dtype: float64

In [116]:
population.quantile(.25)

62.308

In [117]:
population.quantile([.2, .4, .6, .8, 1])

0.2     61.3222
0.4     64.1750
0.6     74.3684
0.8    117.8368
1.0    318.5230
Name: Population, dtype: float64