In [7]:
import numpy as np
import pandas as pd

# 1. Series

## 1.1 Create Series

In [16]:
population = [35.467, 63.951, 80.940, 60.665, 127.061, 64.511, 318.523]
countries = ["Canada", "France", "Germany", "Italy", "Japan", "United Kingdom", "United States" ]

In [14]:
g7 = pd.Series(population)
g7

0     35.467
1     63.951
2     80.940
3     60.665
4    127.061
5     64.511
6    318.523
dtype: float64

In [17]:
g7.name = "G7 Population in millions"
g7

0     35.467
1     63.951
2     80.940
3     60.665
4    127.061
5     64.511
6    318.523
Name: G7 Population in millions, dtype: float64

In [19]:
g7.values

array([ 35.467,  63.951,  80.94 ,  60.665, 127.061,  64.511, 318.523])

## 1.2 Indexing

In [20]:
g7.index

RangeIndex(start=0, stop=7, step=1)

# 2. Data Frame

## 2.1 Create & overview

In [32]:
g7_dict = {
    "Population": [35.467, 63.951, 80.94, 60.665, 127.061, 64.511, 318.523],
    "GDP": [
        1785387,
        2833687,
        3874437,
        2167744,
        4602367,
        2950039,
        17348075
    ],
    "HDI": [
        0.9123,
        0.888,
        0.916,
        0.873,
        0.891,
        0.907,
        0.915
    ],
    "Continent": [
        "America",
        "Europe",
        "Europe",
        "Europe",
        "Asia",
        "Europe",
        "America"
    ],
    "Surface Area": [
        9984670,
        640479,
        357114,
        301336,
        377930,
        242495,
        9525067
    ],
}

In [33]:
df = pd.DataFrame(g7_dict)

In [36]:
df

Unnamed: 0,Population,GDP,HDI,Continent,Surface Area
0,35.467,1785387,0.9123,America,9984670
1,63.951,2833687,0.888,Europe,640479
2,80.94,3874437,0.916,Europe,357114
3,60.665,2167744,0.873,Europe,301336
4,127.061,4602367,0.891,Asia,377930
5,64.511,2950039,0.907,Europe,242495
6,318.523,17348075,0.915,America,9525067


In [34]:
df.size

35

In [38]:
df.shape

(7, 5)

In [39]:
df.shape[0]

7

In [35]:
df.info() # Check the info (returns aggregated summary)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Population    7 non-null      float64
 1   GDP           7 non-null      int64  
 2   HDI           7 non-null      float64
 3   Continent     7 non-null      object 
 4   Surface Area  7 non-null      int64  
dtypes: float64(2), int64(2), object(1)
memory usage: 408.0+ bytes


In [37]:
df.describe() # 

Unnamed: 0,Population,GDP,HDI,Surface Area
count,7.0,7.0,7.0,7.0
mean,107.302571,5080248.0,0.900329,3061299.0
std,97.24997,5494020.0,0.016505,4576204.0
min,35.467,1785387.0,0.873,242495.0
25%,62.308,2500716.0,0.8895,329225.0
50%,64.511,2950039.0,0.907,377930.0
75%,104.0005,4238402.0,0.91365,5082773.0
max,318.523,17348080.0,0.916,9984670.0


In [42]:
df.describe(include="object")

Unnamed: 0,Continent
count,7
unique,3
top,Europe
freq,4


In [54]:
df.index = countries

In [55]:
df

Unnamed: 0,Population,GDP,HDI,Continent,Surface Area
Canada,35.467,1785387,0.9123,America,9984670
France,63.951,2833687,0.888,Europe,640479
Germany,80.94,3874437,0.916,Europe,357114
Italy,60.665,2167744,0.873,Europe,301336
Japan,127.061,4602367,0.891,Asia,377930
United Kingdom,64.511,2950039,0.907,Europe,242495
United States,318.523,17348075,0.915,America,9525067


In [44]:
df.describe(percentiles=[.2, .4, .6, .8])

Unnamed: 0,Population,GDP,HDI,Surface Area
count,7.0,7.0,7.0,7.0
mean,107.302571,5080248.0,0.900329,3061299.0
std,97.24997,5494020.0,0.016505,4576204.0
min,35.467,1785387.0,0.873,242495.0
20%,61.3222,2300933.0,0.8886,312491.6
40%,64.175,2880228.0,0.8974,365440.4
50%,64.511,2950039.0,0.907,377930.0
60%,74.3684,3504678.0,0.91018,535459.4
80%,117.8368,4456781.0,0.91446,7748149.0
max,318.523,17348080.0,0.916,9984670.0


# 2.2 Indexing, Selecting and Slicing

In [45]:
df

Unnamed: 0,Population,GDP,HDI,Continent,Surface Area
0,35.467,1785387,0.9123,America,9984670
1,63.951,2833687,0.888,Europe,640479
2,80.94,3874437,0.916,Europe,357114
3,60.665,2167744,0.873,Europe,301336
4,127.061,4602367,0.891,Asia,377930
5,64.511,2950039,0.907,Europe,242495
6,318.523,17348075,0.915,America,9525067


In [46]:
df["GDP"]

0     1785387
1     2833687
2     3874437
3     2167744
4     4602367
5     2950039
6    17348075
Name: GDP, dtype: int64

In [56]:
df.loc["Canada", "GDP"]

np.int64(1785387)

In [57]:
df.loc["France"] ["GDP"]

np.int64(2833687)

In [51]:
df["Population"]

0     35.467
1     63.951
2     80.940
3     60.665
4    127.061
5     64.511
6    318.523
Name: Population, dtype: float64

In [60]:
# using .column_name
df.Population

Canada             35.467
France             63.951
Germany            80.940
Italy              60.665
Japan             127.061
United Kingdom     64.511
United States     318.523
Name: Population, dtype: float64

In [62]:

print(df.Population.Canada)

35.467


In [64]:
# Change the data in the dataframe
df.loc["Canada", "Population"] = 35.2
df

Unnamed: 0,Population,GDP,HDI,Continent,Surface Area
Canada,35.2,1785387,0.9123,America,9984670
France,63.951,2833687,0.888,Europe,640479
Germany,80.94,3874437,0.916,Europe,357114
Italy,60.665,2167744,0.873,Europe,301336
Japan,127.061,4602367,0.891,Asia,377930
United Kingdom,64.511,2950039,0.907,Europe,242495
United States,318.523,17348075,0.915,America,9525067


## 2.3 Drop Values and Colums

In [None]:
# Removes the row or column
df.drop("Canada")

In [None]:
# Removes Canada and Japan
df.drop(["Canada", "Japan"])

In [None]:
# Removes columns GDP and Surface Area
df.drop(columns=["GDP", "Surface Area"])

In [None]:
# Removes the axis=0 (Row 0) index Italy
df.drop("Italy", axis=0)

## 2.4 Operations

In [65]:
# Divide all the values in all of the rows by 100
df[["Population", "GDP"]] / 100

Unnamed: 0,Population,GDP
Canada,0.352,17853.87
France,0.63951,28336.87
Germany,0.8094,38744.37
Italy,0.60665,21677.44
Japan,1.27061,46023.67
United Kingdom,0.64511,29500.39
United States,3.18523,173480.75


In [66]:
crisis = pd.Series([-500000, -0.3], index=["GDP", "HDI"])
crisis

GDP   -500000.0
HDI        -0.3
dtype: float64

In [67]:
df[["GDP", "HDI"]]

Unnamed: 0,GDP,HDI
Canada,1785387,0.9123
France,2833687,0.888
Germany,3874437,0.916
Italy,2167744,0.873
Japan,4602367,0.891
United Kingdom,2950039,0.907
United States,17348075,0.915


In [68]:
df[["GDP", "HDI"]] + crisis

Unnamed: 0,GDP,HDI
Canada,1285387.0,0.6123
France,2333687.0,0.588
Germany,3374437.0,0.616
Italy,1667744.0,0.573
Japan,4102367.0,0.591
United Kingdom,2450039.0,0.607
United States,16848075.0,0.615


## 2.5 Modyfying DataFrames

In [69]:
langs = pd.Series(["French", "German", "Italian"], index=["France", "Germany", "Italy"], name="Language")
langs

France      French
Germany     German
Italy      Italian
Name: Language, dtype: object

In [73]:
# Add column Languages using data from langs
df["Language"] = langs
df

Unnamed: 0,Population,GDP,HDI,Continent,Surface Area,Language
Canada,35.2,1785387,0.9123,America,9984670,
France,63.951,2833687,0.888,Europe,640479,French
Germany,80.94,3874437,0.916,Europe,357114,German
Italy,60.665,2167744,0.873,Europe,301336,Italian
Japan,127.061,4602367,0.891,Asia,377930,
United Kingdom,64.511,2950039,0.907,Europe,242495,
United States,318.523,17348075,0.915,America,9525067,


In [72]:
# Change all of the languages to English
df.Language = "English"
df

Unnamed: 0,Population,GDP,HDI,Continent,Surface Area,Language
Canada,35.2,1785387,0.9123,America,9984670,English
France,63.951,2833687,0.888,Europe,640479,English
Germany,80.94,3874437,0.916,Europe,357114,English
Italy,60.665,2167744,0.873,Europe,301336,English
Japan,127.061,4602367,0.891,Asia,377930,English
United Kingdom,64.511,2950039,0.907,Europe,242495,English
United States,318.523,17348075,0.915,America,9525067,English


Rename columns


In [75]:
df.rename(
    index={
        "United States": "USA",
        "United Kingdom": "UK"
}, columns={
        "GDP": "Gross Domestic Product",
        "HDI": "Human Development Index"
    }, inplace=True)

In [76]:
df

Unnamed: 0,Population,Gross Domestic Product,Human Development Index,Continent,Surface Area,Language
Canada,35.2,1785387,0.9123,America,9984670,
France,63.951,2833687,0.888,Europe,640479,French
Germany,80.94,3874437,0.916,Europe,357114,German
Italy,60.665,2167744,0.873,Europe,301336,Italian
Japan,127.061,4602367,0.891,Asia,377930,
UK,64.511,2950039,0.907,Europe,242495,
USA,318.523,17348075,0.915,America,9525067,


In [77]:
df.rename(index=str.upper)

Unnamed: 0,Population,Gross Domestic Product,Human Development Index,Continent,Surface Area,Language
CANADA,35.2,1785387,0.9123,America,9984670,
FRANCE,63.951,2833687,0.888,Europe,640479,French
GERMANY,80.94,3874437,0.916,Europe,357114,German
ITALY,60.665,2167744,0.873,Europe,301336,Italian
JAPAN,127.061,4602367,0.891,Asia,377930,
UK,64.511,2950039,0.907,Europe,242495,
USA,318.523,17348075,0.915,America,9525067,


In [78]:
# replace spaces with underscores
# lambda x:
df.rename(columns=lambda x: x.replace(" ", "_"))

Unnamed: 0,Population,Gross_Domestic_Product,Human_Development_Index,Continent,Surface_Area,Language
Canada,35.2,1785387,0.9123,America,9984670,
France,63.951,2833687,0.888,Europe,640479,French
Germany,80.94,3874437,0.916,Europe,357114,German
Italy,60.665,2167744,0.873,Europe,301336,Italian
Japan,127.061,4602367,0.891,Asia,377930,
UK,64.511,2950039,0.907,Europe,242495,
USA,318.523,17348075,0.915,America,9525067,


Rename axis

In [79]:
df.rename_axis(index="Countries", columns="Parameters")

Parameters,Population,Gross Domestic Product,Human Development Index,Continent,Surface Area,Language
Countries,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Canada,35.2,1785387,0.9123,America,9984670,
France,63.951,2833687,0.888,Europe,640479,French
Germany,80.94,3874437,0.916,Europe,357114,German
Italy,60.665,2167744,0.873,Europe,301336,Italian
Japan,127.061,4602367,0.891,Asia,377930,
UK,64.511,2950039,0.907,Europe,242495,
USA,318.523,17348075,0.915,America,9525067,


Add new value

In [82]:
df.loc["China"] = pd.Series({"Population": 1400,
                             "Continent": "Asia",
                            "Language": "Mandarin"})
df

Unnamed: 0,Population,Gross Domestic Product,Human Development Index,Continent,Surface Area,Language
Canada,35.2,1785387.0,0.9123,America,9984670.0,
France,63.951,2833687.0,0.888,Europe,640479.0,French
Germany,80.94,3874437.0,0.916,Europe,357114.0,German
Italy,60.665,2167744.0,0.873,Europe,301336.0,Italian
Japan,127.061,4602367.0,0.891,Asia,377930.0,
UK,64.511,2950039.0,0.907,Europe,242495.0,
USA,318.523,17348075.0,0.915,America,9525067.0,
China,1400.0,,,Asia,,Mandarin


# 2.6 Index changeing

In [83]:
df.reset_index()

Unnamed: 0,index,Population,Gross Domestic Product,Human Development Index,Continent,Surface Area,Language
0,Canada,35.2,1785387.0,0.9123,America,9984670.0,
1,France,63.951,2833687.0,0.888,Europe,640479.0,French
2,Germany,80.94,3874437.0,0.916,Europe,357114.0,German
3,Italy,60.665,2167744.0,0.873,Europe,301336.0,Italian
4,Japan,127.061,4602367.0,0.891,Asia,377930.0,
5,UK,64.511,2950039.0,0.907,Europe,242495.0,
6,USA,318.523,17348075.0,0.915,America,9525067.0,
7,China,1400.0,,,Asia,,Mandarin


In [109]:
df.set_index("Population")

Unnamed: 0_level_0,Gross Domestic Product,Human Development Index,Continent,Surface Area,Language,GDP per Capita
Population,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
35.2,1785387.0,0.9123,America,9984670.0,,50721.221591
63.951,2833687.0,0.888,Europe,640479.0,French,44310.284437
80.94,3874437.0,0.916,Europe,357114.0,German,47868.013343
60.665,2167744.0,0.873,Europe,301336.0,Italian,35733.025633
127.061,4602367.0,0.891,Asia,377930.0,,36221.712406
64.511,2950039.0,0.907,Europe,242495.0,,45729.239975
318.523,17348075.0,0.915,America,9525067.0,,54464.12033
1400.0,,,Asia,,Mandarin,


Create a new column

In [86]:
df["GDP per Capita"] = df["Gross Domestic Product"] / df["Population"]
df

Unnamed: 0,Population,Gross Domestic Product,Human Development Index,Continent,Surface Area,Language,GDP per Capita
Canada,35.2,1785387.0,0.9123,America,9984670.0,,50721.221591
France,63.951,2833687.0,0.888,Europe,640479.0,French,44310.284437
Germany,80.94,3874437.0,0.916,Europe,357114.0,German,47868.013343
Italy,60.665,2167744.0,0.873,Europe,301336.0,Italian,35733.025633
Japan,127.061,4602367.0,0.891,Asia,377930.0,,36221.712406
UK,64.511,2950039.0,0.907,Europe,242495.0,,45729.239975
USA,318.523,17348075.0,0.915,America,9525067.0,,54464.12033
China,1400.0,,,Asia,,Mandarin,


Insert column in a specific position

In [None]:
df.insert(2, "GDP per Capita"

In [88]:
new_idx = df.columns.get_loc("Gross Domestic Product") +1
df

Unnamed: 0,Population,Gross Domestic Product,Human Development Index,Continent,Surface Area,Language,GDP per Capita
Canada,35.2,1785387.0,0.9123,America,9984670.0,,50721.221591
France,63.951,2833687.0,0.888,Europe,640479.0,French,44310.284437
Germany,80.94,3874437.0,0.916,Europe,357114.0,German,47868.013343
Italy,60.665,2167744.0,0.873,Europe,301336.0,Italian,35733.025633
Japan,127.061,4602367.0,0.891,Asia,377930.0,,36221.712406
UK,64.511,2950039.0,0.907,Europe,242495.0,,45729.239975
USA,318.523,17348075.0,0.915,America,9525067.0,,54464.12033
China,1400.0,,,Asia,,Mandarin,


# 2.7 Statistic methods

In [100]:
round(df.min(numeric_only=True), 3)

Population                      35.200
Gross Domestic Product     1785387.000
Human Development Index          0.873
Surface Area                242495.000
GDP per Capita               35733.026
dtype: float64

In [98]:
df.max(numeric_only=True)

Population                 1.400000e+03
Gross Domestic Product     1.734808e+07
Human Development Index    9.160000e-01
Surface Area               9.984670e+06
GDP per Capita             5.446412e+04
dtype: float64

In [91]:
df.sum(numeric_only=True)

Population                 2.150851e+03
Gross Domestic Product     3.556174e+07
Human Development Index    6.302300e+00
Surface Area               2.142909e+07
GDP per Capita             3.150476e+05
dtype: float64

In [92]:
df.mean(numeric_only=True)

Population                 2.688564e+02
Gross Domestic Product     5.080248e+06
Human Development Index    9.003286e-01
Surface Area               3.061299e+06
GDP per Capita             4.500680e+04
dtype: float64

In [93]:
df.median(numeric_only=True)

Population                 7.272550e+01
Gross Domestic Product     2.950039e+06
Human Development Index    9.070000e-01
Surface Area               3.779300e+05
GDP per Capita             4.572924e+04
dtype: float64

In [94]:
df.median

<bound method DataFrame.median of          Population  Gross Domestic Product  Human Development Index  \
Canada       35.200               1785387.0                   0.9123   
France       63.951               2833687.0                   0.8880   
Germany      80.940               3874437.0                   0.9160   
Italy        60.665               2167744.0                   0.8730   
Japan       127.061               4602367.0                   0.8910   
UK           64.511               2950039.0                   0.9070   
USA         318.523              17348075.0                   0.9150   
China      1400.000                     NaN                      NaN   

        Continent  Surface Area  Language  GDP per Capita  
Canada    America     9984670.0       NaN    50721.221591  
France     Europe      640479.0    French    44310.284437  
Germany    Europe      357114.0    German    47868.013343  
Italy      Europe      301336.0   Italian    35733.025633  
Japan        Asia

In [107]:
round(df.std(numeric_only=True), 3)

Population                     465.841
Gross Domestic Product     5494020.158
Human Development Index          0.017
Surface Area               4576204.207
GDP per Capita                7005.590
dtype: float64

In [103]:
df.quantile(.25)

TypeError: unsupported operand type(s) for -: 'str' and 'str'

In [104]:
df.quantile([.2, .4, .6, .8])

TypeError: unsupported operand type(s) for -: 'str' and 'str'

In [105]:
df.mode

<bound method DataFrame.mode of          Population  Gross Domestic Product  Human Development Index  \
Canada       35.200               1785387.0                   0.9123   
France       63.951               2833687.0                   0.8880   
Germany      80.940               3874437.0                   0.9160   
Italy        60.665               2167744.0                   0.8730   
Japan       127.061               4602367.0                   0.8910   
UK           64.511               2950039.0                   0.9070   
USA         318.523              17348075.0                   0.9150   
China      1400.000                     NaN                      NaN   

        Continent  Surface Area  Language  GDP per Capita  
Canada    America     9984670.0       NaN    50721.221591  
France     Europe      640479.0    French    44310.284437  
Germany    Europe      357114.0    German    47868.013343  
Italy      Europe      301336.0   Italian    35733.025633  
Japan        Asia  

## 2.8 Corr (Corrolation = dependancy)

In [110]:
df.corr() 

ValueError: could not convert string to float: 'America'