## Ví dụ về Series trong pandas

In [2]:
import pandas as pd

In [3]:
# Build a Series from a plain list; the default index is the positions 0..n-1.
fruit_names = ['apple', 'orange', 'banana', 'pear', 'strawberry']
fruits = pd.Series(fruit_names)
fruits

0         apple
1        orange
2        banana
3          pear
4    strawberry
dtype: object

In [4]:
# Iterating over a Series yields its values, not its index labels.
for fruit in fruits:
    print(fruit)

apple
orange
banana
pear
strawberry


In [5]:
# Same values, but with explicit string labels instead of the default integer index.
fruits = pd.Series(['apple', 'orange', 'banana','pear','strawberry'], index=['A','B','C','D','E'])
fruits


A         apple
B        orange
C        banana
D          pear
E    strawberry
dtype: object

Khởi tạo Series trong pandas bằng dict thay cho list.

In [6]:
# A dict can seed a Series directly: keys become the index labels, values the data.
fruits_dict = dict(A='apple', B='orange', C='banana', D='pear', E='strawberry')
fruits = pd.Series(fruits_dict)
fruits

A         apple
B        orange
C        banana
D          pear
E    strawberry
dtype: object

### Thực hành về Series

In [7]:
# Weights keyed by name; the dict's insertion order becomes the index order.
weights_by_name = {
    "Hoa": 38, "Nam": 48, "Hải": 42, "Linh": 39, "Nhân": 41,
    "Phương": 38, "Thu": 40, "Vũ": 40, "Bảo": 100, "Long": 41,
}
weights = pd.Series(weights_by_name)
weights

Hoa        38
Nam        48
Hải        42
Linh       39
Nhân       41
Phương     38
Thu        40
Vũ         40
Bảo       100
Long       41
dtype: int64

In [8]:
# Central-tendency summaries of the weights Series, printed one after another.
mean_weight = weights.mean()
median_weight = weights.median()
mode_weights = weights.mode()  # a Series: several values can tie for most frequent
print(mean_weight)
print(median_weight)
print(mode_weights)

46.7
40.5
0    38
1    40
2    41
dtype: int64


### Thực hành với DataFrame

In [9]:
# Column-oriented construction: each dict key becomes a column, each list its values.
animals_dict = dict(
    dong_vat=["Sao la", "Khỉ đuôi ngắn", "Voi châu Á"],
    tuoi_tho=[25, 25, 70],
    loai=["có vú", "linh trưởng", "có vú"],
)
animals = pd.DataFrame(data=animals_dict)

animals

Unnamed: 0,dong_vat,tuoi_tho,loai
0,Sao la,25,có vú
1,Khỉ đuôi ngắn,25,linh trưởng
2,Voi châu Á,70,có vú


In [10]:
# Print a concise summary: index range, per-column non-null counts and dtypes, memory usage.
animals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   dong_vat  3 non-null      object
 1   tuoi_tho  3 non-null      int64 
 2   loai      3 non-null      object
dtypes: int64(1), object(2)
memory usage: 204.0+ bytes


In [11]:
# First 2 rows of the frame.
animals.head(2)

Unnamed: 0,dong_vat,tuoi_tho,loai
0,Sao la,25,có vú
1,Khỉ đuôi ngắn,25,linh trưởng


In [12]:
# Last 2 rows of the frame.
animals.tail(2)

Unnamed: 0,dong_vat,tuoi_tho,loai
1,Khỉ đuôi ngắn,25,linh trưởng
2,Voi châu Á,70,có vú


In [13]:
# Summary statistics (count, mean, std, quartiles); here only the numeric column tuoi_tho is summarised.
animals.describe()

Unnamed: 0,tuoi_tho
count,3.0
mean,40.0
std,25.980762
min,25.0
25%,25.0
50%,25.0
75%,47.5
max,70.0


In [14]:
# Data type of each column, returned as a Series indexed by column name.
animals.dtypes

dong_vat    object
tuoi_tho     int64
loai        object
dtype: object

### Đọc và ghi dữ liệu

In [15]:
# Load zoo.csv, using the file's first column as the row index instead of a data column.
zoo_df = pd.read_csv("zoo.csv", index_col=0)
zoo_df

Unnamed: 0_level_0,animal,sound
row_number,Unnamed: 1_level_1,Unnamed: 2_level_1
1,dog,woof
2,cat,meow
3,bird,tweet
4,mouse,squeek
5,cow,moo
6,cow,moo
7,frog,croak
8,bird,tweet
9,elephant,toot
10,duck,quack


In [16]:
# Preview the first 5 rows.
zoo_df.head(5)

Unnamed: 0_level_0,animal,sound
row_number,Unnamed: 1_level_1,Unnamed: 2_level_1
1,dog,woof
2,cat,meow
3,bird,tweet
4,mouse,squeek
5,cow,moo


In [17]:
# Clear the index's name so the frame is displayed without the extra header row for it.
zoo_df = zoo_df.rename_axis(index=None)
zoo_df

Unnamed: 0,animal,sound
1,dog,woof
2,cat,meow
3,bird,tweet
4,mouse,squeek
5,cow,moo
6,cow,moo
7,frog,croak
8,bird,tweet
9,elephant,toot
10,duck,quack


In [18]:
# Dimensions of the frame as a (rows, columns) tuple.
zoo_df.shape

(14, 2)

In [19]:
# Per-column non-null counts and dtypes; a count below the number of entries means missing values.
zoo_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14 entries, 1 to 14
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   animal  14 non-null     object
 1   sound   13 non-null     object
dtypes: object(2)
memory usage: 336.0+ bytes


In [20]:
# Save the frame as a CSV file (relative path, so it lands in the current working directory).
zoo_df.to_csv("zoo_new.csv")

In [21]:
# Save the same frame as an Excel workbook.
# NOTE(review): pandas needs an Excel writer engine (e.g. openpyxl) installed for .xlsx output — confirm the environment has one.
zoo_df.to_excel("zoo_new.xlsx")

### Truy cập dữ liệu

In [22]:
# Re-display the animals frame before selecting from it.
animals

Unnamed: 0,dong_vat,tuoi_tho,loai
0,Sao la,25,có vú
1,Khỉ đuôi ngắn,25,linh trưởng
2,Voi châu Á,70,có vú


In [23]:
# Bracket access with a single label returns that column as a Series.
animals["dong_vat"]

0           Sao la
1    Khỉ đuôi ngắn
2       Voi châu Á
Name: dong_vat, dtype: object

In [24]:
# A list of labels selects several columns and returns a DataFrame.
animals[["dong_vat", "tuoi_tho"]]

Unnamed: 0,dong_vat,tuoi_tho
0,Sao la,25
1,Khỉ đuôi ngắn,25
2,Voi châu Á,70


In [25]:
# Attribute-style access to the tuoi_tho column; returns the column as a Series.
animals.tuoi_tho

0    25
1    25
2    70
Name: tuoi_tho, dtype: int64

Ta có thể biến cột động vật thành chỉ số để truy cập vào các phần tử bằng lệnh `DataFrame.set_index()`.\
Tham số `inplace=True` có tác dụng cập nhật vào thẳng DataFrame.

In [26]:
# Promote the dong_vat column to the row index, modifying animals in place.
animals.set_index("dong_vat", inplace=True)
# Equivalent without inplace (rebinds the name to the returned frame):
# animals = animals.set_index("dong_vat")
animals

Unnamed: 0_level_0,tuoi_tho,loai
dong_vat,Unnamed: 1_level_1,Unnamed: 2_level_1
Sao la,25,có vú
Khỉ đuôi ngắn,25,linh trưởng
Voi châu Á,70,có vú


Lệnh `DataFrame.reset_index()` hoàn tác thao tác `set_index` phía trên.

In [27]:
# Move the index back into a regular dong_vat column and restore the default integer index, in place.
animals.reset_index(inplace=True)
animals

Unnamed: 0,dong_vat,tuoi_tho,loai
0,Sao la,25,có vú
1,Khỉ đuôi ngắn,25,linh trưởng
2,Voi châu Á,70,có vú


### Truy cập dữ liệu theo hàng

In [28]:
# A slice inside [] selects rows: here the first two rows (positions 0 and 1).
animals[:2]

Unnamed: 0,dong_vat,tuoi_tho,loai
0,Sao la,25,có vú
1,Khỉ đuôi ngắn,25,linh trưởng


## Lọc dữ liệu

Lấy dữ liệu 2 hàng đầu tiên, ở cột thứ 2 và cột thứ 3 của DataFrame `animals`.

In [29]:
# Position-based selection: rows 0-1 and the columns at positions 1-2 (slice end bounds are exclusive).
animals.iloc[:2, 1:3]

Unnamed: 0,tuoi_tho,loai
0,25,có vú
1,25,linh trưởng


In [30]:
# Label-based selection: rows where tuoi_tho > 50, columns from tuoi_tho through loai (label slices include both ends).
animals.loc[animals.tuoi_tho>50, "tuoi_tho": "loai"]

Unnamed: 0,tuoi_tho,loai
2,70,có vú


In [31]:
# Show the full frame again; the selections above returned new objects and left it unchanged.
animals

Unnamed: 0,dong_vat,tuoi_tho,loai
0,Sao la,25,có vú
1,Khỉ đuôi ngắn,25,linh trưởng
2,Voi châu Á,70,có vú


### Lọc boolean

In [32]:
# Boolean mask: keep only the rows whose tuoi_tho is below 30.
animals[animals["tuoi_tho"]<30]

Unnamed: 0,dong_vat,tuoi_tho,loai
0,Sao la,25,có vú
1,Khỉ đuôi ngắn,25,linh trưởng


In [33]:
# Index the frame by animal name so that .loc can select rows by name.
# NOTE(review): set_index(..., inplace=True) mutates animals; running this cell a second time
# will presumably fail because dong_vat is no longer a column — confirm, or call reset_index() first.
animals.set_index("dong_vat", inplace=True)
# Rows by a list of labels, columns by an inclusive label slice.
animals.loc[["Sao la", "Khỉ đuôi ngắn"], "tuoi_tho":"loai"]

Unnamed: 0_level_0,tuoi_tho,loai
dong_vat,Unnamed: 1_level_1,Unnamed: 2_level_1
Sao la,25,có vú
Khỉ đuôi ngắn,25,linh trưởng


In [34]:
# Load the billionaires dataset and inspect its columns, dtypes and non-null counts.
df = pd.read_csv("Billionaires.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2640 entries, 0 to 2639
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Rank                 2640 non-null   int64  
 1   Name                 2640 non-null   object 
 2   Net Worth            2640 non-null   object 
 3   Age                  2576 non-null   float64
 4   Country | Territory  2640 non-null   object 
 5   Source               2640 non-null   object 
 6   Industry             2640 non-null   object 
dtypes: float64(1), int64(1), object(5)
memory usage: 144.5+ KB


In [35]:
# Preview the first 10 rows; at this point Net Worth is still text such as "$211 B".
df.head(10)

Unnamed: 0,Rank,Name,Net Worth,Age,Country | Territory,Source,Industry
0,1,Bernard Arnault & family,$211 B,74.0,France,LVMH,Fashion & Retail
1,2,Elon Musk,$180 B,51.0,United States,"Tesla, SpaceX",Automotive
2,3,Jeff Bezos,$114 B,59.0,United States,Amazon,Technology
3,4,Larry Ellison,$107 B,78.0,United States,Oracle,Technology
4,5,Warren Buffett,$106 B,92.0,United States,Berkshire Hathaway,Finance & Investments
5,6,Bill Gates,$104 B,67.0,United States,Microsoft,Technology
6,7,Michael Bloomberg,$94.5 B,81.0,United States,Bloomberg LP,Media & Entertainment
7,8,Carlos Slim Helu & family,$93 B,83.0,Mexico,Telecom,Telecom
8,9,Mukesh Ambani,$83.4 B,65.0,India,Diversified,Diversified
9,10,Steve Ballmer,$80.7 B,67.0,United States,Microsoft,Technology


In [36]:
# Check the column types: Net Worth is object (text) here, i.e. not yet numeric.
df.dtypes

Rank                     int64
Name                    object
Net Worth               object
Age                    float64
Country | Territory     object
Source                  object
Industry                object
dtype: object

In [37]:
def clean_networth(networth):
    """Strip the currency decoration from one "Net Worth" cell.

    Raw cells look like ``"$211 B"``: a dollar sign, the amount, a space and
    a ``B`` suffix. Only the amount is kept, with no surrounding whitespace,
    so the result can be handed straight to ``pd.to_numeric``.

    Parameters
    ----------
    networth : str or number
        A raw cell such as ``"$94.5 B"``, or a value that is already numeric
        (for instance when the conversion cell is executed a second time).

    Returns
    -------
    str or number
        The bare amount as text (``"94.5"``) for string input; any non-string
        input is returned unchanged.
    """
    if not isinstance(networth, str):
        # Already converted (or missing): there is nothing to strip, and
        # slicing a float would raise TypeError.
        return networth
    # Remove the markers by what they are rather than by position, so stray
    # whitespace in the cell cannot shift the slice onto a digit.
    return networth.strip().lstrip("$").rstrip("B").strip()
# Clean every cell with clean_networth, then parse the remaining text as numbers.
net_worth_text = df["Net Worth"].apply(clean_networth)
df["Net Worth"] = pd.to_numeric(net_worth_text)

In [38]:
# Confirm the conversion: Net Worth should now be float64.
df.dtypes

Rank                     int64
Name                    object
Net Worth              float64
Age                    float64
Country | Territory     object
Source                  object
Industry                object
dtype: object

In [41]:
# Exercise: make sure Age is numeric. The earlier dtypes listing already shows Age as
# float64, so on this dataset the call leaves the dtype as it was.
df["Age"] = pd.to_numeric(df["Age"])

In [42]:
# Re-check the column types after the Age conversion.
df.dtypes

Rank                     int64
Name                    object
Net Worth              float64
Age                    float64
Country | Territory     object
Source                  object
Industry                object
dtype: object

In [46]:
# Total net worth per industry: group first, then keep only the Net Worth column.
df.groupby("Industry")[["Net Worth"]].sum()

Unnamed: 0_level_0,Net Worth
Industry,Unnamed: 1_level_1
Automotive,525.3
Construction & Engineering,118.5
Diversified,905.2
Energy,453.5
Fashion & Retail,1698.8
Finance & Investments,1605.1
Food & Beverage,957.2
Gambling & Casinos,120.5
Healthcare,643.2
Logistics,239.5


In [51]:
# Total net worth per industry, largest first.
df_industry = (
    df.groupby("Industry")[["Net Worth"]]
    .sum()
    .sort_values(by='Net Worth', ascending=False)
)
df_industry

Unnamed: 0_level_0,Net Worth
Industry,Unnamed: 1_level_1
Technology,1871.0
Fashion & Retail,1698.8
Finance & Investments,1605.1
Manufacturing,1025.9
Food & Beverage,957.2
Diversified,905.2
Real Estate,657.4
Healthcare,643.2
Automotive,525.3
Energy,453.5


In [52]:
# Preview the first 10 rows again; Net Worth now holds plain numbers instead of "$… B" text.
df.head(10)

Unnamed: 0,Rank,Name,Net Worth,Age,Country | Territory,Source,Industry
0,1,Bernard Arnault & family,211.0,74.0,France,LVMH,Fashion & Retail
1,2,Elon Musk,180.0,51.0,United States,"Tesla, SpaceX",Automotive
2,3,Jeff Bezos,114.0,59.0,United States,Amazon,Technology
3,4,Larry Ellison,107.0,78.0,United States,Oracle,Technology
4,5,Warren Buffett,106.0,92.0,United States,Berkshire Hathaway,Finance & Investments
5,6,Bill Gates,104.0,67.0,United States,Microsoft,Technology
6,7,Michael Bloomberg,94.5,81.0,United States,Bloomberg LP,Media & Entertainment
7,8,Carlos Slim Helu & family,93.0,83.0,Mexico,Telecom,Telecom
8,9,Mukesh Ambani,83.4,65.0,India,Diversified,Diversified
9,10,Steve Ballmer,80.7,67.0,United States,Microsoft,Technology


In [54]:
import pycountry_convert as pc

def get_continent(country_name):
    """Map a country name to the name of its continent.

    The lookup goes country name -> ISO alpha-2 code -> continent code ->
    continent name, using ``pycountry_convert`` (imported as ``pc``).

    Parameters
    ----------
    country_name : str
        Country name to look up.

    Returns
    -------
    str
        The continent name, or ``"Unknown"`` when any lookup step fails.
        A name that fails is also printed so it can be corrected by hand.
    """
    try:
        country_code = pc.country_name_to_country_alpha2(country_name)
        continent_code = pc.country_alpha2_to_continent_code(country_code)
        return pc.convert_continent_code_to_continent_name(continent_code)
    except Exception:
        # Deliberately best-effort: one unrecognised name must not abort the
        # whole column-wise apply. Catch Exception rather than using a bare
        # except so KeyboardInterrupt / SystemExit still propagate.
        print(country_name)
        return "Unknown"
# Derive each row's continent from its country; names that cannot be resolved come back as "Unknown".
df["Continent"] = df["Country | Territory"].map(get_continent)
df.head()

Eswatini (Swaziland)


Unnamed: 0,Rank,Name,Net Worth,Age,Country | Territory,Source,Industry,Continent
0,1,Bernard Arnault & family,211.0,74.0,France,LVMH,Fashion & Retail,Europe
1,2,Elon Musk,180.0,51.0,United States,"Tesla, SpaceX",Automotive,North America
2,3,Jeff Bezos,114.0,59.0,United States,Amazon,Technology,North America
3,4,Larry Ellison,107.0,78.0,United States,Oracle,Technology,North America
4,5,Warren Buffett,106.0,92.0,United States,Berkshire Hathaway,Finance & Investments,North America


In [56]:
# Average net worth per continent.
df.groupby("Continent")[["Net Worth"]].mean()

Unnamed: 0_level_0,Net Worth
Continent,Unnamed: 1_level_1
Africa,4.421053
Asia,3.547064
Europe,4.853279
North America,6.025276
Oceania,3.924
South America,3.375
Unknown,6.5


In [58]:
# Find the rows whose country could not be mapped to a continent.
unknown_mask = df["Continent"] == "Unknown"
df[unknown_mask]

Unnamed: 0,Rank,Name,Net Worth,Age,Country | Territory,Source,Industry,Continent
391,390,Nathan Kirsh,6.5,91.0,Eswatini (Swaziland),"Retail, real estate",Fashion & Retail,Unknown


In [59]:
# Exercise: rename the one unresolved country to a spelling the lookup accepts,
# recompute the continents, then recompute the per-continent average.
country_fixes = {"Eswatini (Swaziland)": "Swaziland"}
df["Country | Territory"] = df["Country | Territory"].replace(country_fixes)
df["Continent"] = df["Country | Territory"].apply(get_continent)
df.groupby("Continent")[["Net Worth"]].mean()

Unnamed: 0_level_0,Net Worth
Continent,Unnamed: 1_level_1
Africa,4.525
Asia,3.547064
Europe,4.853279
North America,6.025276
Oceania,3.924
South America,3.375


In [None]:
# NOTE(review): this unexecuted cell repeats the preceding cell line for line. Re-running it
# should be harmless (the old country spelling is already gone), but it adds nothing new —
# consider removing it.
df["Country | Territory"] = df["Country | Territory"].replace({"Eswatini (Swaziland)":"Swaziland"})
df["Continent"] = df["Country | Territory"].apply(get_continent)
df[["Net Worth", "Continent"]].groupby("Continent").mean()

In [60]:
# Average net worth per continent, displayed from highest to lowest.
# Only the displayed copy is sorted; df_continent itself keeps the groupby order.
df_continent = df.groupby("Continent")[["Net Worth"]].mean()
df_continent.sort_values("Net Worth", ascending=False)

Unnamed: 0_level_0,Net Worth
Continent,Unnamed: 1_level_1
North America,6.025276
Europe,4.853279
Africa,4.525
Oceania,3.924
Asia,3.547064
South America,3.375


In [62]:
# Load the hockey players dataset and inspect its columns.
# This rebinds df, so the billionaires frame loaded earlier is no longer reachable under that name.
df = pd.read_csv("hockey_players_data.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22 entries, 0 to 21
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   number        22 non-null     int64  
 1   name          22 non-null     object 
 2   position      22 non-null     object 
 3   games_played  22 non-null     int64  
 4   goals         12 non-null     float64
 5   assists       22 non-null     int64  
 6   PIM           22 non-null     int64  
 7   birthdate     22 non-null     object 
 8   age           22 non-null     int64  
 9   birthplace    20 non-null     object 
 10  weight        11 non-null     float64
 11  height        21 non-null     float64
dtypes: float64(3), int64(5), object(4)
memory usage: 2.2+ KB


In [63]:
# Display the frame to inspect the raw rows, including where values are missing.
df

Unnamed: 0,number,name,position,games_played,goals,assists,PIM,birthdate,age,birthplace,weight,height
0,13,Mikyla Grant-Mentis,F,6,5.0,4,4,7/15/98,22,"Brampton, ONT",,5.08
1,44,Lindsay Eastwood,D,6,4.0,5,10,1/14/97,23,,,5.08
2,2,Taylor Woods,F,6,4.0,4,4,4/12/94,26,"Thunder Bay,ONT",160.0,5.04
3,19,Brooke Boquist,F,6,2.0,2,2,7/27/96,24,"Burford,ONT",,5.03
4,67,Emma Woods,F,6,,4,0,9/8/96,24,"Markham,ONT",155.0,5.04
5,21,Breanne Wilson-Bennett,F,6,2.0,4,2,3/5/96,24,"Goose Bay, Newfoundland",,5.03
6,27,Amy Curlew,F,6,,2,4,3/4/97,23,"Rouyn-Noranda, PQ",,5.07
7,27,Sarah-Eve Coutu-Godbout,F,6,2.0,4,2,6/16/97,23,,,
8,18,Taytum Clairmont,F,6,1.0,3,21,11/28/93,26,"Royal Oak, MI",177.0,5.11
9,18,Taytum Clairmont,F,6,1.0,3,21,11/28/93,26,"Royal Oak, MI",177.0,5.11


In [64]:
# Count the missing values in each column.
df.isnull().sum()

number           0
name             0
position         0
games_played     0
goals           10
assists          0
PIM              0
birthdate        0
age              0
birthplace       2
weight          11
height           1
dtype: int64

In [65]:
# Show only the rows that have no missing value in any column.
# The result is not assigned, so df itself is left unchanged.
df.dropna()

Unnamed: 0,number,name,position,games_played,goals,assists,PIM,birthdate,age,birthplace,weight,height
2,2,Taylor Woods,F,6,4.0,4,4,4/12/94,26,"Thunder Bay,ONT",160.0,5.04
8,18,Taytum Clairmont,F,6,1.0,3,21,11/28/93,26,"Royal Oak, MI",177.0,5.11
9,18,Taytum Clairmont,F,6,1.0,3,21,11/28/93,26,"Royal Oak, MI",177.0,5.11
13,4,Sarah Steele,D,6,1.0,4,4,9/11/94,25,"Belleville, ONT",145.0,5.06
14,23,Megan Quinn,D,6,1.0,0,0,3/9/95,22,"Burlington, ONT",165.0,5.05


In [66]:
# Drop the birthdate, birthplace and weight columns.
# Rebinding df (instead of inplace=True) together with errors="ignore" keeps this cell
# re-runnable: a second run finds the columns already gone and simply does nothing,
# where the in-place form would raise KeyError.
df = df.drop(columns=["birthdate", "birthplace", "weight"], errors="ignore")
df

Unnamed: 0,number,name,position,games_played,goals,assists,PIM,age,height
0,13,Mikyla Grant-Mentis,F,6,5.0,4,4,22,5.08
1,44,Lindsay Eastwood,D,6,4.0,5,10,23,5.08
2,2,Taylor Woods,F,6,4.0,4,4,26,5.04
3,19,Brooke Boquist,F,6,2.0,2,2,24,5.03
4,67,Emma Woods,F,6,,4,0,24,5.04
5,21,Breanne Wilson-Bennett,F,6,2.0,4,2,24,5.03
6,27,Amy Curlew,F,6,,2,4,23,5.07
7,27,Sarah-Eve Coutu-Godbout,F,6,2.0,4,2,23,
8,18,Taytum Clairmont,F,6,1.0,3,21,26,5.11
9,18,Taytum Clairmont,F,6,1.0,3,21,26,5.11


In [67]:
# Treat a missing goal count as zero goals, then check the last rows.
df["goals"] = df["goals"].fillna(0)
df.tail()

Unnamed: 0,number,name,position,games_played,goals,assists,PIM,age,height
17,91,Jenna McParland,F,6,0.0,0,0,33,5.08
18,91,Henna McParland,M,5,0.0,0,0,33,5.08
19,27,Kristen Barbara,D,6,0.0,0,0,26,5.1
20,29,Julie Allen,F,6,0.0,0,0,26,5.07
21,34,Elaine Chuli,G,6,0.0,0,0,24,5.11


In [68]:
# Column types after cleaning; goals remains float64 even though its gaps were filled with 0.
df.dtypes

number            int64
name             object
position         object
games_played      int64
goals           float64
assists           int64
PIM               int64
age               int64
height          float64
dtype: object