- What is a dataframe (pandas)
- create df
- vectorized operations
- access rows, columns, and cells. (.loc, .iloc, .at)
- change values
- sort values

In [145]:
import pandas as pd

In [146]:
from typing import List
data_list: List[int] = [1,2,3,4,5]

s1 = pd.Series(data_list)
s1


0    1
1    2
2    3
3    4
4    5
dtype: int64

In [147]:
s1 = pd.Series(data=data_list, index=["a", "b", "c", "d", "e"])
s1

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [148]:
s1["a"]

np.int64(1)

In [149]:
s1 = s1 + 23
s1

a    24
b    25
c    26
d    27
e    28
dtype: int64

In [150]:
type(s1)

pandas.core.series.Series

In [151]:
# df = a collection of series objects

# dictionary creation
data = {
    "name": ["Nate", "Rebecca", "Edwin", "Preston"],
    "age": [39, 40, 11, 7],
    "year": ["Senior", "Junior", "Sophmore", "Freshman"],
}
students_df = pd.DataFrame(data=data)
students_df = students_df.set_index("name")
students_df

Unnamed: 0_level_0,age,year
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Nate,39,Senior
Rebecca,40,Junior
Edwin,11,Sophmore
Preston,7,Freshman


In [152]:
students_df = students_df.reset_index()
students_df

Unnamed: 0,name,age,year
0,Nate,39,Senior
1,Rebecca,40,Junior
2,Edwin,11,Sophmore
3,Preston,7,Freshman


In [153]:
# Row based creation

import random
names = ["Messi", "Ronaldo", "Neymar", "Pulisc", "Mbappe"]
goals_per_game = [random.random() for _ in names]
games = [round(random.random() *1000) for _ in names]
games

cols = ["name", "goals/game", "games"]
stats = [[names[i], goals_per_game[i], games[i]] for i in range(5)]
stats_df = pd.DataFrame(data=stats, columns=cols)
stats_df

Unnamed: 0,name,goals/game,games
0,Messi,0.216637,56
1,Ronaldo,0.258052,152
2,Neymar,0.612574,166
3,Pulisc,0.10027,29
4,Mbappe,0.801333,506


In [154]:
stats_df["new_col"] = 0
stats_df

Unnamed: 0,name,goals/game,games,new_col
0,Messi,0.216637,56,0
1,Ronaldo,0.258052,152,0
2,Neymar,0.612574,166,0
3,Pulisc,0.10027,29,0
4,Mbappe,0.801333,506,0


In [155]:
stats_df = stats_df.drop(columns=["new_col"])
stats_df

Unnamed: 0,name,goals/game,games
0,Messi,0.216637,56
1,Ronaldo,0.258052,152
2,Neymar,0.612574,166
3,Pulisc,0.10027,29
4,Mbappe,0.801333,506


In [156]:
stats_df["goals"] = stats_df["goals/game"] * stats_df["games"]
stats_df

Unnamed: 0,name,goals/game,games,goals
0,Messi,0.216637,56,12.131681
1,Ronaldo,0.258052,152,39.22394
2,Neymar,0.612574,166,101.687269
3,Pulisc,0.10027,29,2.907821
4,Mbappe,0.801333,506,405.47432


In [157]:
# stats_df = stats_df.rename(columns={"goals/game": "goals_p_game"})
stats_df.rename(columns={"goals/game": "goals_p_game"}, inplace=True)

stats_df

Unnamed: 0,name,goals_p_game,games,goals
0,Messi,0.216637,56,12.131681
1,Ronaldo,0.258052,152,39.22394
2,Neymar,0.612574,166,101.687269
3,Pulisc,0.10027,29,2.907821
4,Mbappe,0.801333,506,405.47432


In [158]:
stats_df["name"]

0      Messi
1    Ronaldo
2     Neymar
3     Pulisc
4     Mbappe
Name: name, dtype: object

In [159]:
stats_df["name"] = stats_df["name"].str.lower()
stats_df


Unnamed: 0,name,goals_p_game,games,goals
0,messi,0.216637,56,12.131681
1,ronaldo,0.258052,152,39.22394
2,neymar,0.612574,166,101.687269
3,pulisc,0.10027,29,2.907821
4,mbappe,0.801333,506,405.47432


In [160]:
stats_df[["name", "games"]]

Unnamed: 0,name,games
0,messi,56
1,ronaldo,152
2,neymar,166
3,pulisc,29
4,mbappe,506


In [161]:
# filtering with .loc

stats_df.loc[0,"name"]

'messi'

In [162]:
stats_df.loc[0]

name                messi
goals_p_game     0.216637
games                  56
goals           12.131681
Name: 0, dtype: object

In [163]:
stats_df.loc[:, "name"]

0      messi
1    ronaldo
2     neymar
3     pulisc
4     mbappe
Name: name, dtype: object

In [164]:
stats_df.loc[1:2, "name"] # in df, second iten after colon is inclusive

1    ronaldo
2     neymar
Name: name, dtype: object

In [165]:
l = [1,2,3,4,5]
l[1:2] # in a list the second item is exclusive

[2]

In [166]:
stats_df.loc[:, "name" : "goals"]

Unnamed: 0,name,goals_p_game,games,goals
0,messi,0.216637,56,12.131681
1,ronaldo,0.258052,152,39.22394
2,neymar,0.612574,166,101.687269
3,pulisc,0.10027,29,2.907821
4,mbappe,0.801333,506,405.47432


In [167]:
# boolean logic with .loc

stats_df.loc[stats_df["goals_p_game"] > 0.5]

Unnamed: 0,name,goals_p_game,games,goals
2,neymar,0.612574,166,101.687269
4,mbappe,0.801333,506,405.47432


In [168]:
stats_df.loc[stats_df["goals_p_game"] > 0.5, "name"]

2    neymar
4    mbappe
Name: name, dtype: object

In [169]:
stats_df.loc[stats_df["name"] == "messi", "name"] = "ronaldo"
stats_df.loc[stats_df["goals_p_game"] > 0.5, "name"] = "messi"
stats_df

Unnamed: 0,name,goals_p_game,games,goals
0,ronaldo,0.216637,56,12.131681
1,ronaldo,0.258052,152,39.22394
2,messi,0.612574,166,101.687269
3,pulisc,0.10027,29,2.907821
4,messi,0.801333,506,405.47432


In [170]:
# .iloc, .at, .iat