In [1]:
import pandas as pd
import numpy as np

titanic = pd.read_csv("titanic_data.csv")
titanic


Unnamed: 0,PassengerId,Name,PClass,Age,Sex,Survived
0,1,"Allen, Miss Elisabeth Walton",1st,29.00,female,1
1,2,"Allison, Miss Helen Loraine",1st,2.00,female,0
2,3,"Allison, Mr Hudson Joshua Creighton",1st,30.00,male,0
3,4,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.00,female,0
4,5,"Allison, Master Hudson Trevor",1st,0.92,male,1
...,...,...,...,...,...,...
1308,1309,"Zakarian, Mr Artun",3rd,27.00,male,0
1309,1310,"Zakarian, Mr Maprieder",3rd,26.00,male,0
1310,1311,"Zenni, Mr Philip",3rd,22.00,male,0
1311,1312,"Lievens, Mr Rene",3rd,24.00,male,0


## ---檢查PassengerId欄位是否是唯一值---

In [2]:

# 檢查PassengerId欄位是否是唯一值
np.unique(titanic["PassengerId"].values).size


1313

In [3]:
print("---指定DataFrame物件的索引欄位---")
# 指定DataFrame物件的索引欄位
titanic.set_index(["PassengerId"], inplace=True)



---指定DataFrame物件的索引欄位---


In [4]:
titanic.head()

Unnamed: 0_level_0,Name,PClass,Age,Sex,Survived
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,"Allen, Miss Elisabeth Walton",1st,29.0,female,1
2,"Allison, Miss Helen Loraine",1st,2.0,female,0
3,"Allison, Mr Hudson Joshua Creighton",1st,30.0,male,0
4,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1st,25.0,female,0
5,"Allison, Master Hudson Trevor",1st,0.92,male,1


## ---新增SexCode欄位---

In [5]:


# 新增SexCode欄位
titanic["SexCode"] = np.where(titanic["Sex"]=="female", 1, 0)
print(titanic.head())



                                                      Name PClass    Age  \
PassengerId                                                                
1                             Allen, Miss Elisabeth Walton    1st  29.00   
2                              Allison, Miss Helen Loraine    1st   2.00   
3                      Allison, Mr Hudson Joshua Creighton    1st  30.00   
4            Allison, Mrs Hudson JC (Bessie Waldo Daniels)    1st  25.00   
5                            Allison, Master Hudson Trevor    1st   0.92   

                Sex  Survived  SexCode  
PassengerId                             
1            female         1        1  
2            female         0        1  
3              male         0        0  
4            female         0        1  
5              male         1        0  


In [6]:
print("---PCass欄位轉換成數值資料---")
# PCass欄位轉換成數值資料
class_mapping = {"1st": 1,
                 "2nd": 2,
                 "3rd": 3}
titanic["PClass"] = titanic["PClass"].map(class_mapping)
titanic.head()

---PCass欄位轉換成數值資料---


Unnamed: 0_level_0,Name,PClass,Age,Sex,Survived,SexCode
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,"Allen, Miss Elisabeth Walton",1.0,29.0,female,1,1
2,"Allison, Miss Helen Loraine",1.0,2.0,female,0,1
3,"Allison, Mr Hudson Joshua Creighton",1.0,30.0,male,0,0
4,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1.0,25.0,female,0,1
5,"Allison, Master Hudson Trevor",1.0,0.92,male,1,0


## ---檢查Age欄位的遺漏值有多少---

In [7]:
titanic.isnull().sum()
sum(titanic["Age"].isnull())

557

## ---補值成平均值---

In [8]:
# 補值成平均值
avg_age = titanic["Age"].mean()
titanic["Age"].fillna(avg_age, inplace=True)
sum(titanic["Age"].isnull())

0

## ---顯示性別人數和計算平均年齡---

In [9]:
# 顯示性別人數和計算平均年齡
print("性別人數:")
titanic["Sex"].groupby(titanic["Sex"]).size()
titanic.groupby("Sex")["Age"].mean()


性別人數:


Sex
female    29.773637
male      30.736945
Name: Age, dtype: float64

## ---處理姓名欄位---


In [10]:
# 處理姓名欄位
import re
patt = re.compile(r"\,\s(\S+\s)")  # 編譯一個正則表達式模式，用於匹配逗號後跟著一個空格，然後捕獲一個或多個非空白字符，再跟著一個空格
titles = []  # 創建一個空列表，用於存儲提取的稱號
for index, row in titanic.iterrows():  # 遍歷“titanic”數據集中的每一行
    m = re.search(patt, row["Name"])  # 在當前行的“Name”列中搜索模式的匹配項
    if m is None:  # 如果沒有找到匹配項
        title = "Mrs" if row["SexCode"] == 1 else "Mr"  # 如果“SexCode”列為1（表示女性），則將稱號設置為“Mrs”，否則設置為“Mr”
    else:  # 如果找到了匹配項
        title = m.group(0)  # 將匹配的字符串賦值給變量“title”
        title = re.sub(r",", "", title).strip()  # 使用re.sub()函數刪除“title”字符串中的逗號，並去除任何前後的空格
        if title[0] != "M":  # 如果“title”字符串的第一個字符不是“M”
            title = "Mrs" if row["SexCode"] == 1 else "Mr"  # 如果“SexCode”列為1（表示女性），則將稱號設置為“Mrs”，否則設置為“Mr”
        else:  # 如果“title”字符串的第一個字符是“M”
            if title[0] == "M" and title[1] == "a":  # 檢查第二個字符是否為“a”
                title = "Mrs" if row["SexCode"] == 1 else "Mr"  # 如果“SexCode”列為1（表示女性），則將稱號設置為“Mrs”，否則設置為“Mr”
    titles.append(title)  # 將“title”添加到“titles”列表中
titanic["Title"] = titles  # 將“titles”列表賦值給“titanic”數據集中的一個新列“Title”

print("Title類別:")  # 打印稱號類別的標題
print(np.unique(titles).shape[0], np.unique(titles))  # 打印唯一稱號的數量和稱號本身



Title類別:
5 ['Miss' 'Mlle' 'Mr' 'Mrs' 'Ms']


## ---修正類別錯誤顯示Title人數---

In [13]:
# 修正類別錯誤
titanic["Title"] = titanic["Title"].replace("Mlle","Miss")
titanic["Title"] = titanic["Title"].replace("Ms","Miss")  
titanic.to_csv("titanic_pre.csv", encoding="utf8")
titanic

Unnamed: 0_level_0,Name,PClass,Age,Sex,Survived,SexCode,Title
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,"Allen, Miss Elisabeth Walton",1.0,29.00,female,1,1,Miss
2,"Allison, Miss Helen Loraine",1.0,2.00,female,0,1,Miss
3,"Allison, Mr Hudson Joshua Creighton",1.0,30.00,male,0,0,Mr
4,"Allison, Mrs Hudson JC (Bessie Waldo Daniels)",1.0,25.00,female,0,1,Mrs
5,"Allison, Master Hudson Trevor",1.0,0.92,male,1,0,Mr
...,...,...,...,...,...,...,...
1309,"Zakarian, Mr Artun",3.0,27.00,male,0,0,Mr
1310,"Zakarian, Mr Maprieder",3.0,26.00,male,0,0,Mr
1311,"Zenni, Mr Philip",3.0,22.00,male,0,0,Mr
1312,"Lievens, Mr Rene",3.0,24.00,male,0,0,Mr


In [14]:
titanic[["Survived"]].groupby(titanic["Title"]).mean()

Unnamed: 0_level_0,Survived
Title,Unnamed: 1_level_1
Miss,0.604
Mr,0.166863
Mrs,0.740566
