In [4]:
import pandas as pd
import numpy as np

## Series

In [23]:
s1 = pd.Series([1, 2, -3, 4, -5])

print(f"Series: \n{s1}")
print(f"Values: {s1.values}")
print(f"Index: {s1.index}")

Series: 
0    1
1    2
2   -3
3    4
4   -5
dtype: int64
Values: [ 1  2 -3  4 -5]
Index: RangeIndex(start=0, stop=5, step=1)


In [22]:
s1=pd.Series([1,2,-5,0], index=['a','b','c','d'])
print(f"Before: \n{s1}")
s1['a']=1000
print(f"After: \n{s1}")

Before: 
a    1
b    2
c   -5
d    0
dtype: int64
After: 
a    1000
b       2
c      -5
d       0
dtype: int64


### comparison

In [5]:
s1=pd.Series([1,2,-5,0], index=['a','b','c','d'])
s1[s1>0] # readonly

a    1
b    2
dtype: int64

### Algebra

In [9]:
# readonly
s1=pd.Series([1,2,-5,0], index=['a','b','c','d'])
print(f"s1:\n{s1}\n")
print(f"s1 * 2:\n{s1*2}\n")
print(f"s1.isnull():\n{s1.isnull()}")

s1:
a    1
b    2
c   -5
d    0
dtype: int64

s1 * 2:
a     2
b     4
c   -10
d     0
dtype: int64

s1.isnull():
a    False
b    False
c    False
d    False
dtype: bool


### dataframe

In [12]:
dados={
    'estado': ['SP', 'MG', 'PR', 'SP','MG', 'PR'],
    'ano':[2019,2019,2019,2020,2020,2020],
    'pop': [45.9, 21.2,16.9,46.6,21.4,17.3]
}
df=pd.DataFrame(dados)
print(f"--- df (Original DataFrame) ---\n{df}\n")

# 1. View first 2 rows
print(f"--- df.head(2) (First 2 rows) ---\n{df.head(2)}\n")

# 2. View last 2 rows
# NOTE: The correct function is 'tail(n)', not 'tails(n)'
print(f"--- df.tail(2) (Last 2 rows) ---\n{df.tail(2)}\n")

# 3. View 2 random rows
print(f"--- df.sample(2) (2 random rows) ---\n{df.sample(2)}\n")

# 4. Descriptive statistics
print(f"--- df.describe() (Descriptive Statistics) ---\n{df.describe()}\n")

# 5. Sort by 'ano' (ascending by default)
print(f"--- df.sort_values(by='ano') (Sort by 'ano' ASC) ---\n{df.sort_values(by='ano')}\n")

# 6. Sort by 'ano' descending
print(f"--- df.sort_values(by='ano', ascending=False) (Sort by 'ano' DESC) ---\n{df.sort_values(by='ano', ascending=False)}\n")

# 7. Sort by 'pop' descending
print(f"--- df.sort_values(by='pop', ascending=False) (Sort by 'pop' DESC) ---\n{df.sort_values(by='pop', ascending=False)}\n")

# 8. Sort by 'estado', then 'ano' (ascending)
print(f"--- df.sort_values(by=['estado','ano']) (Sort by 'estado' then 'ano' ASC) ---\n{df.sort_values(by=['estado','ano'])}\n")

# 9. Sort by 'estado', then 'ano' (descending)
print(f"--- df.sort_values(by=['estado','ano'], ascending=False) (Sort by 'estado' then 'ano' DESC) ---\n{df.sort_values(by=['estado','ano'], ascending=False)}\n")

# 10. Group by 'estado' and calculate the mean of numeric columns
print(f"--- df.groupby('estado').mean() (Mean by 'estado') ---\n{df.groupby('estado').mean()}\n")

# 11. Group by 'estado' and calculate the sum of numeric columns
print(f"--- df.groupby('estado').sum() (Sum by 'estado') ---\n{df.groupby('estado').sum()}\n")

# 12. Group by 'estado' and find the minimum 'ano' for each group (using agg)
print(f"--- df.groupby('estado').agg({{'ano':'min'}}) (Min 'ano' by 'estado') ---\n{df.groupby('estado').agg({'ano':'min'})}\n")

--- df (Original DataFrame) ---
  estado    ano   pop
0     SP   2019  45.9
1     MG  20119  21.2
2     PR  20019  16.9
3     SP   2020  46.6
4     MG   2020  21.4
5     PR   2020  17.3

--- df.head(2) (First 2 rows) ---
  estado    ano   pop
0     SP   2019  45.9
1     MG  20119  21.2

--- df.tail(2) (Last 2 rows) ---
  estado   ano   pop
4     MG  2020  21.4
5     PR  2020  17.3

--- df.sample(2) (2 random rows) ---
  estado    ano   pop
1     MG  20119  21.2
2     PR  20019  16.9

--- df.describe() (Descriptive Statistics) ---
                ano        pop
count      6.000000   6.000000
mean    8036.166667  28.216667
std     9320.646274  14.096725
min     2019.000000  16.900000
25%     2020.000000  18.275000
50%     2020.000000  21.300000
75%    15519.250000  39.775000
max    20119.000000  46.600000

--- df.sort_values(by='ano') (Sort by 'ano' ASC) ---
  estado    ano   pop
0     SP   2019  45.9
3     SP   2020  46.6
5     PR   2020  17.3
4     MG   2020  21.4
2     PR  20019  16.9

### New DF from the previous one (see the columns)

In [14]:
dados={
    'estado': ['SP', 'MG', 'PR', 'SP','MG', 'PR'],
    'ano':[2019,2019,2019,2020,2020,2020],
    'pop': [45.9, 21.2,16.9,46.6,21.4,17.3]
}
df=pd.DataFrame(dados)
df2=pd.DataFrame(dados,columns=['ano','estado','pop'])
print(f"--- df (Original DataFrame) ---\n{df}\n")
print(f"--- df2 (Copy of df) ---\n{df2}\n")

--- df (Original DataFrame) ---
  estado    ano   pop
0     SP   2019  45.9
1     MG  20119  21.2
2     PR  20019  16.9
3     SP   2020  46.6
4     MG   2020  21.4
5     PR   2020  17.3

--- df (Copy DataFrame) ---
     ano estado   pop
0   2019     SP  45.9
1  20119     MG  21.2
2  20019     PR  16.9
3   2020     SP  46.6
4   2020     MG  21.4
5   2020     PR  17.3



### Assign Values

In [19]:
dados={
    'estado': ['SP', 'MG', 'PR', 'SP','MG', 'PR'],
    'ano':[2019,2019,2019,2020,2020,2020],
    'pop': [45.9, 21.2,16.9,46.6,21.4,17.3]
}
df=pd.DataFrame(dados,columns=['ano','estado','pop'])
df['estimativa']=50
print(f"--- df (DataFrame) ---\n{df}\n")
df['estimativa']=np.arange(6)
print(f"--- df (DataFrame) ---\n{df}\n")
df3=df['ano']
print(f"--- df3=df['ano'] (DataFrame) ---\n{df3}\n")
df['Não Paraná']=df.estado!='PR'
print(f"--- df['Não Paraná']=df.estado!='PR' (DataFrame) ---\n{df}\n")

--- df (DataFrame) ---
     ano estado   pop  estimativa
0   2019     SP  45.9          50
1  20119     MG  21.2          50
2  20019     PR  16.9          50
3   2020     SP  46.6          50
4   2020     MG  21.4          50
5   2020     PR  17.3          50

--- df (DataFrame) ---
     ano estado   pop  estimativa
0   2019     SP  45.9           0
1  20119     MG  21.2           1
2  20019     PR  16.9           2
3   2020     SP  46.6           3
4   2020     MG  21.4           4
5   2020     PR  17.3           5

--- df3=df2['ano'] (DataFrame) ---
0     2019
1    20119
2    20019
3     2020
4     2020
5     2020
Name: ano, dtype: int64

--- df2['Não Paraná']=df2.estado!='PR' (DataFrame) ---
     ano estado   pop  estimativa  Não Paraná
0   2019     SP  45.9           0        True
1  20119     MG  21.2           1        True
2  20019     PR  16.9           2       False
3   2020     SP  46.6           3        True
4   2020     MG  21.4           4        True
5   2020     PR  17

In [None]:
df2['estimativa']=np.arange(6)

### delete a column

In [38]:
dados={
    'estado': ['SP', 'MG', 'PR', 'SP','MG', 'PR'],
    'ano':[2019,2019,2019,2020,2020,2020],
    'pop': [45.9, 21.2,16.9,46.6,21.4,17.3]
}
df=pd.DataFrame(dados,columns=['ano','estado','pop'])
df['Not PR']=df.estado!='PR'
print(f"--- df (Original DataFrame) ---\n{df}\n")
del df['Not PR']
print(f"--- df (delete columm Not PR) ---\n{df}\n")

--- df (Original DataFrame) ---
     ano estado   pop  Not PR
0   2019     SP  45.9    True
1  20119     MG  21.2    True
2  20019     PR  16.9   False
3   2020     SP  46.6    True
4   2020     MG  21.4    True
5   2020     PR  17.3   False

--- df (delete columm Not PR) ---
     ano estado   pop
0   2019     SP  45.9
1  20119     MG  21.2
2  20019     PR  16.9
3   2020     SP  46.6
4   2020     MG  21.4
5   2020     PR  17.3



### extend dataframe

In [45]:
dados={
    'estado': ['SP', 'MG', 'PR', 'SP','MG', 'PR'],
    'ano':[2019,2019,2019,2020,2020,2020],
    'pop': [45.9, 21.2,16.9,46.6,21.4,17.3]
}
df=pd.DataFrame(dados,columns=['ano','estado','pop'])

print(f"--- df (Original DataFrame) ---\n{df}\n")
# 1. Get the dimensions (rows, columns)
print(f"--- df.shape (Shape/Dimensions) ---\n{df.shape}\n")
# 2. Get the number of rows
print(f"--- df.shape[0] (Number of Rows) ---\n{df.shape[0]}\n")
# 3. Get the row index (labels)
print(f"--- df.index (Row Index) ---\n{df.index}\n")
# 4. Get the column labels
print(f"--- df.columns (Column Names) ---\n{df.columns}\n")
# 5. Count non-missing values per column
print(f"--- df.count() (Non-null Counts) ---\n{df.count()}")

--- df (Original DataFrame) ---
     ano estado   pop
0   2019     SP  45.9
1  20119     MG  21.2
2  20019     PR  16.9
3   2020     SP  46.6
4   2020     MG  21.4
5   2020     PR  17.3

--- df.shape (Shape/Dimensions) ---
(6, 3)

--- df.shape[0] (Number of Rows) ---
6

--- df.index (Row Index) ---
RangeIndex(start=0, stop=6, step=1)

--- df.columns (Column Names) ---
Index(['ano', 'estado', 'pop'], dtype='object')

--- df.count() (Non-null Counts) ---
ano       6
estado    6
pop       6
dtype: int64


### update column name

In [55]:
dados={
    'estado': ['SP', 'MG', 'PR', 'SP','MG', 'PR'],
    'ano':[2019,2019,2019,2020,2020,2020],
    'pop': [45.9, 21.2,16.9,46.6,21.4,17.3]
}
df=pd.DataFrame(dados,columns=['ano','estado','pop'])
df['estimativa']=50

print(f"--- df (Original DataFrame) ---\n{df}\n")
df.columns=['Ano','estado','Populacao','estimativa']
print(f"--- df (renamed columns of DataFrame) ---\n{df}\n")

--- df (Original DataFrame) ---
     ano estado   pop  estimativa
0   2019     SP  45.9          50
1  20119     MG  21.2          50
2  20019     PR  16.9          50
3   2020     SP  46.6          50
4   2020     MG  21.4          50
5   2020     PR  17.3          50

--- df (renamed columns of DataFrame) ---
     Ano estado  Populacao  estimativa
0   2019     SP       45.9          50
1  20119     MG       21.2          50
2  20019     PR       16.9          50
3   2020     SP       46.6          50
4   2020     MG       21.4          50
5   2020     PR       17.3          50



### Analyze the DataFrame

In [44]:
dados={
    'estado': ['SP', 'MG', 'PR', 'SP','MG', 'PR'],
    'ano':[2019,2019,2019,2020,2020,2020],
    'pop': [45.9, 21.2,16.9,46.6,21.4,17.3]
}
df=pd.DataFrame(dados,columns=['ano','estado','pop'])
print(f"--- df describe ---\n{df2.describe(include='all')}\n")

--- df describe ---
                 ano estado        pop
count       6.000000      6   6.000000
unique           NaN      3        NaN
top              NaN     SP        NaN
freq             NaN      2        NaN
mean     8036.166667    NaN  28.216667
std      9320.646274    NaN  14.096725
min      2019.000000    NaN  16.900000
25%      2020.000000    NaN  18.275000
50%      2020.000000    NaN  21.300000
75%     15519.250000    NaN  39.775000
max     20119.000000    NaN  46.600000



### Update Values and Filter Data

In [74]:
dados={
    'estado': ['SP', 'MG', 'PR', 'SP','MG', 'PR'],
    'ano':[2019,2019,2019,2020,2020,2020],
    'pop': [45.9, 21.2,16.9,46.6,21.4,17.3]
}
df=pd.DataFrame(dados,columns=['ano','estado','pop'])
print(f"--- df (Original DataFrame) ---\n{df}\n")
df['ano']=df['ano']+2
print(f"--- df (increase year) ---\n{df}\n")
print(f"--- df (ano > 2021) ---\n{df[df.ano > 2021]}\n")
print(f"--- df (ano > 2024) ---\n{df[df.ano > 2024]}\n")

--- df (Original DataFrame) ---
    ano estado   pop
0  2019     SP  45.9
1  2019     MG  21.2
2  2019     PR  16.9
3  2020     SP  46.6
4  2020     MG  21.4
5  2020     PR  17.3

--- df (increase year) ---
    ano estado   pop
0  2021     SP  45.9
1  2021     MG  21.2
2  2021     PR  16.9
3  2022     SP  46.6
4  2022     MG  21.4
5  2022     PR  17.3

--- df (ano > 2021) ---
    ano estado   pop
3  2022     SP  46.6
4  2022     MG  21.4
5  2022     PR  17.3

--- df (ano > 2024) ---
Empty DataFrame
Columns: [ano, estado, pop]
Index: []

--- df (Original DataFrame) ---
    ano estado   pop
0  2021     SP  45.9
1  2021     MG  21.2
2  2021     PR  16.9
3  2022     SP  46.6
4  2022     MG  21.4
5  2022     PR  17.3



### Drop Columns

In [77]:
dados={
    'estado': ['SP', 'MG', 'PR', 'SP','MG', 'PR'],
    'ano':[2019,2019,2019,2020,2020,2020],
    'pop': [45.9, 21.2,16.9,46.6,21.4,17.3]
}
df=pd.DataFrame(dados,columns=['ano','estado','pop'])
print(f"--- df (Original DataFrame) ---\n{df}\n")
df.drop('ano', axis='columns') # works: df=df.drop('ano', axis='columns')
print(f"--- df (after drop won't work) ---\n{df}\n")
df.drop('ano', axis='columns', inplace=True)
print(f"--- df (after drop works) ---\n{df}\n")

--- df (Original DataFrame) ---
    ano estado   pop
0  2019     SP  45.9
1  2019     MG  21.2
2  2019     PR  16.9
3  2020     SP  46.6
4  2020     MG  21.4
5  2020     PR  17.3

--- df (after drop won't work) ---
    ano estado   pop
0  2019     SP  45.9
1  2019     MG  21.2
2  2019     PR  16.9
3  2020     SP  46.6
4  2020     MG  21.4
5  2020     PR  17.3

--- df (after drop works) ---
  estado   pop
0     SP  45.9
1     MG  21.2
2     PR  16.9
3     SP  46.6
4     MG  21.4
5     PR  17.3



### Drop Rows

In [81]:
dados={
    'estado': ['SP', 'MG', 'PR', 'SP','MG', 'PR'],
    'ano':[2019,2019,2019,2020,2020,2020],
    'pop': [45.9, 21.2,16.9,46.6,21.4,17.3]
}
df=pd.DataFrame(dados,columns=['ano','estado','pop'])
print(f"--- df (Original DataFrame) ---\n{df}\n")
df.drop([0,1]) # df=df.drop([0,1])
print(f"--- df (drop first line won't work) ---\n{df}\n")
df.drop([0,1], inplace=True)
print(f"--- df (drop first line works) ---\n{df}\n")

--- df (Original DataFrame) ---
    ano estado   pop
0  2019     SP  45.9
1  2019     MG  21.2
2  2019     PR  16.9
3  2020     SP  46.6
4  2020     MG  21.4
5  2020     PR  17.3

--- df (drop first line won't work) ---
    ano estado   pop
0  2019     SP  45.9
1  2019     MG  21.2
2  2019     PR  16.9
3  2020     SP  46.6
4  2020     MG  21.4
5  2020     PR  17.3

--- df (drop first line works) ---
    ano estado   pop
2  2019     PR  16.9
3  2020     SP  46.6
4  2020     MG  21.4
5  2020     PR  17.3



### Display Row Data

In [85]:
dados={
    'estado': ['SP', 'MG', 'PR', 'SP','MG', 'PR'],
    'ano':[2019,2019,2019,2020,2020,2020],
    'pop': [45.9, 21.2,16.9,46.6,21.4,17.3]
}
df=pd.DataFrame(dados,columns=['ano','estado','pop'])
print(f"--- df (Original DataFrame) ---\n{df}\n")
print(f"--- df.iloc[0] (First Row of df) ---\n{df.iloc[0]}\n")
print(f"--- df.iloc[1:3] (Rows 1 and 2 of df) ---\n{df.iloc[1:3]}\n")
print(f"--- df2.iloc[1:3,[1,2]] (Rows 1-2, Cols 1-2 of df2) ---\n{df2.iloc[1:3,[1,2]]}")


print("\n--- Descriptions---")
print(f"df.iloc[0]: Selects the **first row** (position 0) as a Series.")
print(f"df.iloc[1:3]: Selects **rows 1 and 2** (positions 1 through 3, exclusive).")
print(f"df.iloc[1:3,[1,2]]: Selects **rows 1 and 2** and **columns 1 ('estado') and 2 ('pop')** of df.")

--- df (Original DataFrame) ---
    ano estado   pop
0  2019     SP  45.9
1  2019     MG  21.2
2  2019     PR  16.9
3  2020     SP  46.6
4  2020     MG  21.4
5  2020     PR  17.3

--- df.iloc[0] (First Row of df) ---
ano       2019
estado      SP
pop       45.9
Name: 0, dtype: object

--- df.iloc[1:3] (Rows 1 and 2 of df) ---
    ano estado   pop
1  2019     MG  21.2
2  2019     PR  16.9

--- df2.iloc[1:3,[1,2]] (Rows 1-2, Cols 1-2 of df2) ---
  estado   pop
4     MG  21.4
5     PR  17.3

--- Descriptions---
df.iloc[0]: Selects the **first row** (position 0) as a Series.
df.iloc[1:3]: Selects **rows 1 and 2** (positions 1 through 3, exclusive).
df.iloc[1:3,[1,2]]: Selects **rows 1 and 2** and **columns 1 ('estado') and 2 ('pop')** of df.
