# Workshop # 2 : Data Processing

In [1]:
#Import Python Libraries
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read data
df = pd.read_csv("titanic-train.csv")

df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

## เลือก Variables ที่คิดว่ามีผลกับการทำนาย การรอดตายของผู้ขึ้นเรือ Titanic จาก df แล้ว save เข้า df2

In [3]:
df2 = df.loc[:,['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch']]
df2

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch
0,0,3,male,22.0,1,0
1,1,1,female,38.0,1,0
2,1,3,female,26.0,0,0
3,1,1,female,35.0,1,0
4,0,3,male,35.0,0,0
...,...,...,...,...,...,...
886,0,2,male,27.0,0,0
887,1,1,female,19.0,0,0
888,0,3,female,,1,2
889,1,1,male,26.0,0,0


In [4]:
df2.dtypes

Survived      int64
Pclass        int64
Sex          object
Age         float64
SibSp         int64
Parch         int64
dtype: object

## ดูจำนวน Missing values ในแต่ละ Variables

In [5]:
print('Survived missing ', df2['Survived'].isnull().sum())
print('Age missing ', df2['Age'].isnull().sum())
print('PClass missing ', df2['Pclass'].isnull().sum())
print('SibSp missing ', df2['SibSp'].isnull().sum())
print('Parch missing ', df2['Parch'].isnull().sum())
print('Survived missing ', df2['Survived'].isnull().sum())

Survived missing  0
Age missing  177
PClass missing  0
SibSp missing  0
Parch missing  0
Survived missing  0


## ทดแทน Missing Values ด้วย ค่า Means

โดยปกติแล้วเราสามารถ จัดการกับ Missing Values ได้ดังนี้
- Remove ออกทั้ง Observation
- Replace ด้วย Means สำหรับ variable ที่เป็น Numerical

In [6]:
df2['Age'] = df2['Age'].fillna(df2['Age'].mean())

print('Age missing ', df2['Age'].isnull().sum())

Age missing  0


## Split Data ออกเป็น 2 ก้อน Training Set กับ Test Set

In [7]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_y, test_y = train_test_split(df2[['Pclass','Sex','Age','SibSp','Parch']], 
                                                    df2['Survived'], 
                                                    train_size=0.7, 
                                                    random_state=123,
                                                    stratify=df2['Survived'])
print("Labels for training and testing data")
print(df2.groupby(['Survived']))

Labels for training and testing data
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x14e17d850>


## Label Encode Variable Sex

In [20]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit(["male", "female"])
list(le.classes_)
train_X['Sex_Label'] = le.transform(train_X['Sex'])
train_X['Sex_Label']

199    0
468    1
198    0
574    1
776    1
      ..
737    1
360    1
669    0
245    1
768    1
Name: Sex_Label, Length: 623, dtype: int64

In [24]:
# Using get_dummies
df = pd.get_dummies(df, columns=['Sex'], drop_first=True)

# Display the first few rows
print(df.head())


   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name   Age  SibSp  Parch  \
0                            Braund, Mr. Owen Harris  22.0      1      0   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  38.0      1      0   
2                             Heikkinen, Miss. Laina  26.0      0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  35.0      1      0   
4                           Allen, Mr. William Henry  35.0      0      0   

             Ticket     Fare Cabin Embarked  Sex_male  
0         A/5 21171   7.2500   NaN        S      True  
1          PC 17599  71.2833   C85        C     False  
2  STON/O2. 3101282   7.9250   NaN        S     False  
3            113803  53.1000  C123        S     False  
4            373450   8.0500   NaN        S   

## Transform numeric data to category

In [55]:

train_X['AgeBin'] = pd.cut(train_X['Age'], 5)
print(train_X[['Age','AgeBin']])

           Age            AgeBin
416  34.000000  (32.252, 48.168]
801  31.000000  (16.336, 32.252]
512  36.000000  (32.252, 48.168]
455  29.000000  (16.336, 32.252]
757  18.000000  (16.336, 32.252]
275  63.000000  (48.168, 64.084]
121  29.699118  (16.336, 32.252]
443  28.000000  (16.336, 32.252]
458  50.000000  (48.168, 64.084]
653  29.699118  (16.336, 32.252]
725  20.000000  (16.336, 32.252]
395  22.000000  (16.336, 32.252]
754  48.000000  (32.252, 48.168]
40   40.000000  (32.252, 48.168]
217  42.000000  (32.252, 48.168]
457  29.699118  (16.336, 32.252]
732  29.699118  (16.336, 32.252]
469   0.750000    (0.34, 16.336]
250  29.699118  (16.336, 32.252]
124  54.000000  (48.168, 64.084]
470  29.699118  (16.336, 32.252]
291  19.000000  (16.336, 32.252]
169  28.000000  (16.336, 32.252]
300  29.699118  (16.336, 32.252]
666  25.000000  (16.336, 32.252]
885  39.000000  (32.252, 48.168]
34   28.000000  (16.336, 32.252]
348   3.000000    (0.34, 16.336]
163  17.000000  (16.336, 32.252]
159  29.69

## PCA 

In [25]:
from sklearn.decomposition import PCA

pca = PCA(n_components=3)
pca.fit_transform(train_X)
print(pca.explained_variance_ratio_)

[0.98361654 0.00868067 0.00381721]


In [12]:
x_new = pca.transform(train_X) 
print(x_new)

[[ 4.14214163  0.82936344  0.39795316]
 [ 1.14350405  0.77262179  0.46107859]
 [ 6.19427921 -0.49379444  1.06050476]
 ...
 [ 2.15071229 -0.62121473 -0.76194324]
 [ 0.15162058 -0.65904249 -0.71985962]
 [-0.84792528 -0.67795637 -0.69881781]]
