In [3]:
import pandas as pd
import numpy as np
import array as arr 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

df=pd.read_csv('titanic.csv', usecols=['Age','Fare','Survived','Sex'])
arr = []
for row in df['Sex']:
    if row == 'male':
        arr.append(1)
    else: 
        arr.append(0)
df['Sex'] = arr
df.head()


Unnamed: 0,Survived,Sex,Age,Fare
0,0,1,22.0,7.25
1,1,0,38.0,71.2833
2,1,0,26.0,7.925
3,1,0,35.0,53.1
4,0,1,35.0,8.05


# Bài 1:
# Hàm xử lý ngoại lệ
Dữ liệu có dạng phân bố chuẩn

In [4]:
def gaussian_distribution(df,feature_age):
    uppper_boundary_age=df['Age'].mean() + 3* df[feature_age].std()
    lower_boundary_age=df['Age'].mean() - 3* df[feature_age].std()

    gaussian_distribution_data=df.copy()
    gaussian_distribution_data.loc[gaussian_distribution_data[feature_age]>=round(uppper_boundary_age),feature_age]=round(uppper_boundary_age)
    gaussian_distribution_data.loc[gaussian_distribution_data[feature_age]<=round(lower_boundary_age),feature_age]=round(lower_boundary_age)
    return gaussian_distribution_data

 Dữ liệu có dạng phân bố lệch

In [5]:
def skewed_distribution(df,feature_fare):
    IQR=df.Fare.quantile(0.75)-df.Fare.quantile(0.25)

    lower_bridge_fare=df[feature_fare].quantile(0.25)-(IQR*3)
    upper_bridge_fare=df[feature_fare].quantile(0.75)+(IQR*3)

    skewed_distribution_data=df.copy()
    skewed_distribution_data.loc[skewed_distribution_data[feature_fare]>=round(upper_bridge_fare,2),feature_fare]=round(upper_bridge_fare,2)
    skewed_distribution_data.loc[skewed_distribution_data[feature_fare]<=round(lower_bridge_fare,2),feature_fare]=round(lower_bridge_fare,2)
    
    return skewed_distribution_data

# Hàm tính độ chính xác

In [6]:
def Logistic_Regression(df,feature1,feature2):
    data = df.copy()
    sum_accuracy = 0
    for random_state in range(0, 9):
        X_train,X_test,y_train,y_test=train_test_split(data[[feature1,feature2]].fillna(0),data['Survived'],test_size=0.3, random_state=random_state)
        classifier=LogisticRegression()
        classifier.fit(X_train,y_train)
        y_pred=classifier.predict(X_test)
        sum_accuracy += accuracy_score(y_test,y_pred)
    return (sum_accuracy/10)*100

def Logistic_Regression_NAN(df,feature1,feature2,feature3):
    data = df.copy()
    sum_accuracy = 0
    for random_state in range(0, 9):
        X_train,X_test,y_train,y_test=train_test_split(data[[feature1,feature2,feature3]].fillna(0),data['Survived'],test_size=0.3, random_state=random_state)
        classifier=LogisticRegression()
        classifier.fit(X_train,y_train)
        y_pred=classifier.predict(X_test)
        sum_accuracy += accuracy_score(y_test,y_pred)
        # print("Accuracy_score: {}".format(accuracy_score(y_test,y_pred)))
    return (sum_accuracy/10)*100
  
    

# Xử lý dữ liệu trống

In [7]:
### Mean
mean=df['Age'].mean()
df['Age_mean']=df['Age'].fillna(round(mean)) 

# Accuracy - Without Outliers
mean_without_outliers = Logistic_Regression(df,'Age_mean','Fare')
print('mean_without_outliers: ',mean_without_outliers)

# Accuracy - With Outliers
mean_data = gaussian_distribution(df,'Age_mean')
mean_data = skewed_distribution(mean_data,'Fare')
mean_with_outliers = Logistic_Regression(mean_data,'Age_mean','Fare')
print('mean_with_outliers: ',mean_with_outliers)
df.tail()

mean_without_outliers:  59.06716417910448
mean_with_outliers:  60.298507462686565


Unnamed: 0,Survived,Sex,Age,Fare,Age_mean
886,0,1,27.0,13.0,27.0
887,1,0,19.0,30.0,19.0
888,0,0,,23.45,30.0
889,1,1,26.0,30.0,26.0
890,0,1,32.0,7.75,32.0


In [8]:
### Median 

median=df['Age'].median()       
df['Age_median']=df['Age'].fillna(median)

# Accuracy - Without Outliers
median_without_outliers = Logistic_Regression(df,'Age_median','Fare')
print('median_without_outliers: ',median_without_outliers)

# Accuracy - With Outliers
median_data = gaussian_distribution(df,'Age_median')
median_data = skewed_distribution(median_data,'Fare')
median_with_outliers = Logistic_Regression(median_data,'Age_median','Fare')
print('median_with_outliers: ',median_with_outliers)

df.tail()

median_without_outliers:  58.99253731343283
median_with_outliers:  60.335820895522396


Unnamed: 0,Survived,Sex,Age,Fare,Age_mean,Age_median
886,0,1,27.0,13.0,27.0,27.0
887,1,0,19.0,30.0,19.0,19.0
888,0,0,,23.45,30.0,28.0
889,1,1,26.0,30.0,26.0,26.0
890,0,1,32.0,7.75,32.0,32.0


In [9]:
### Mode
mode=df['Age'].mode()
df['Age_mode']=df['Age'].fillna(mode[0])

# Accuracy - Without Outliers
mode_without_outliers = Logistic_Regression(df,'Age_mode','Fare')
print('mode_without_outliers: ',mode_without_outliers)

# Accuracy - With Outliers
mode_data = gaussian_distribution(df,'Age_mode')
mode_data = skewed_distribution(mode_data,'Fare')
mode_with_outliers = Logistic_Regression(mode_data,'Age_mode','Fare')
print('mode_with_outliers:    ',mode_with_outliers)

df.tail()


mode_without_outliers:  58.95522388059701
mode_with_outliers:     60.1865671641791


Unnamed: 0,Survived,Sex,Age,Fare,Age_mean,Age_median,Age_mode
886,0,1,27.0,13.0,27.0,27.0,27.0
887,1,0,19.0,30.0,19.0,19.0,19.0
888,0,0,,23.45,30.0,28.0,24.0
889,1,1,26.0,30.0,26.0,26.0,26.0
890,0,1,32.0,7.75,32.0,32.0,32.0


In [10]:
### Random

random_samples = df['Age'].dropna().sample(n=df['Age'].isnull().sum(),random_state=0)
random_samples.index = df[df['Age'].isnull()].index
df['Age_random']=df['Age']
df.loc[df['Age'].isnull(), 'Age_random']=random_samples

# Accuracy - Without Outliers
random_without_outliers = Logistic_Regression(df,'Age_random','Fare')
print('random_without_outliers: ',random_without_outliers)

# Accuracy - With Outliers
random_data = gaussian_distribution(df,'Age_random')
random_data = skewed_distribution(random_data,'Fare')
random_with_outliers = Logistic_Regression(random_data,'Age_random','Fare')
print('random_with_outliers:    ',random_with_outliers)

df.tail()

random_without_outliers:  59.067164179104495
random_with_outliers:     60.44776119402986


Unnamed: 0,Survived,Sex,Age,Fare,Age_mean,Age_median,Age_mode,Age_random
886,0,1,27.0,13.0,27.0,27.0,27.0,27.0
887,1,0,19.0,30.0,19.0,19.0,19.0,19.0
888,0,0,,23.45,30.0,28.0,24.0,15.0
889,1,1,26.0,30.0,26.0,26.0,26.0,26.0
890,0,1,32.0,7.75,32.0,32.0,32.0,32.0


In [11]:
### End of dist (Giá trị đuôi của phân bố)

extreme = df.Age.mean() + 3*df.Age.std()
df['Age_end_dist']=df['Age'].fillna(round(extreme))

# Accuracy - Without Outliers
end_dist_without_outliers = Logistic_Regression(df,'Age_end_dist','Fare')
print('end_dist_without_outliers: ',end_dist_without_outliers)

# Accuracy - With Outliers
end_dist_data = gaussian_distribution(df,'Age_end_dist')
end_dist_data = skewed_distribution(end_dist_data,'Fare')
end_dist_with_outliers = Logistic_Regression(end_dist_data,'Age_end_dist','Fare')
print('end_dist_with_outliers:    ',end_dist_with_outliers)

df.tail()


end_dist_without_outliers:  59.888059701492544
end_dist_with_outliers:     60.26119402985075


Unnamed: 0,Survived,Sex,Age,Fare,Age_mean,Age_median,Age_mode,Age_random,Age_end_dist
886,0,1,27.0,13.0,27.0,27.0,27.0,27.0,27.0
887,1,0,19.0,30.0,19.0,19.0,19.0,19.0,19.0
888,0,0,,23.45,30.0,28.0,24.0,15.0,73.0
889,1,1,26.0,30.0,26.0,26.0,26.0,26.0,26.0
890,0,1,32.0,7.75,32.0,32.0,32.0,32.0,32.0


In [12]:
### Arbitrary Value (Giá trị bất kì)

df['Age_arbitrary'] = df['Age'].fillna(np.random.randint(0, 100))

# Accuracy - Without Outliers
arbitrary_without_outliers = Logistic_Regression(df,'Age_arbitrary','Fare')
print('arbitrary_without_outliers: ',arbitrary_without_outliers)

# Accuracy - With Outliers
arbitrary_data = gaussian_distribution(df,'Age_arbitrary')
arbitrary_data = skewed_distribution(arbitrary_data,'Fare')
arbitrary_with_outliers = Logistic_Regression(arbitrary_data,'Age_arbitrary','Fare')
print('arbitrary_with_outliers:    ',arbitrary_with_outliers)

df.tail()

arbitrary_without_outliers:  60.03731343283582
arbitrary_with_outliers:     60.186567164179095


Unnamed: 0,Survived,Sex,Age,Fare,Age_mean,Age_median,Age_mode,Age_random,Age_end_dist,Age_arbitrary
886,0,1,27.0,13.0,27.0,27.0,27.0,27.0,27.0,27.0
887,1,0,19.0,30.0,19.0,19.0,19.0,19.0,19.0,19.0
888,0,0,,23.45,30.0,28.0,24.0,15.0,73.0,81.0
889,1,1,26.0,30.0,26.0,26.0,26.0,26.0,26.0,26.0
890,0,1,32.0,7.75,32.0,32.0,32.0,32.0,32.0,32.0


In [13]:
### New feature 

df['Age_NAN']=np.where(df['Age'].isnull(),1,0)

# Accuracy - Without Outliers
NAN_without_outliers = Logistic_Regression_NAN(df,'Age_random','Fare','Age_NAN')
print('NAN_without_outliers: ',NAN_without_outliers)

# Accuracy - With Outliers
NAN_data = gaussian_distribution(df,'Age_random')
NAN_data = gaussian_distribution(NAN_data,'Age_NAN')
NAN_data = skewed_distribution(NAN_data,'Fare')
NAN_with_outliers = Logistic_Regression_NAN(NAN_data,'Age_random','Fare','Age_NAN')
print('NAN_with_outliers:    ',NAN_with_outliers)

df.tail()

NAN_without_outliers:  59.66417910447761
NAN_with_outliers:     60.44776119402986


Unnamed: 0,Survived,Sex,Age,Fare,Age_mean,Age_median,Age_mode,Age_random,Age_end_dist,Age_arbitrary,Age_NAN
886,0,1,27.0,13.0,27.0,27.0,27.0,27.0,27.0,27.0,0
887,1,0,19.0,30.0,19.0,19.0,19.0,19.0,19.0,19.0,0
888,0,0,,23.45,30.0,28.0,24.0,15.0,73.0,81.0,1
889,1,1,26.0,30.0,26.0,26.0,26.0,26.0,26.0,26.0,0
890,0,1,32.0,7.75,32.0,32.0,32.0,32.0,32.0,32.0,0


In [14]:
results = {
    'Without Outliers' : [mean_without_outliers, median_without_outliers , mode_without_outliers , random_without_outliers , end_dist_without_outliers , arbitrary_without_outliers , NAN_without_outliers],
    'Outliers' : [mean_with_outliers , median_with_outliers, mode_with_outliers , random_with_outliers ,end_dist_with_outliers , arbitrary_with_outliers , NAN_with_outliers]
}
pd.DataFrame(data=results , index=['Mean' , 'Median' , 'Mode' , 'Random' , 'End Dist' , 'Arbitrary' , 'Age + NAN'])

Unnamed: 0,Without Outliers,Outliers
Mean,59.067164,60.298507
Median,58.992537,60.335821
Mode,58.955224,60.186567
Random,59.067164,60.447761
End Dist,59.88806,60.261194
Arbitrary,60.037313,60.186567
Age + NAN,59.664179,60.447761


- Khi xử lý ngoại lệ độ chính xác sẽ tăng trong tất cả các phương pháp làm sạch dữ liệu
- Kết quả tốt nhất khi xử lý ngoại lệ và sử dụng các phương thức xử lý dữ liệu trống 
    + New feature (thay dữ liệu trống bằng phương pháp Random + Thêm đặc trưng mới Age_NAN) 
    + Random

# Bài 2
# Các phương pháp chuẩn hóa dữ liệu

In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler


# Z-Score + Arbitrary Value + Outliers

In [16]:

# Chuẩn hóa Z-Score
scaler=StandardScaler()
df_scaled=pd.DataFrame(scaler.fit_transform(df),columns=df.columns)
df_scaled['Survived'] = df['Survived']

# Accuracy - With Outliers
scaler_data = gaussian_distribution(df_scaled,'Age_arbitrary')
scaler_data = skewed_distribution(scaler_data,'Fare')
scaler_with_outliers = Logistic_Regression(scaler_data,'Age_arbitrary','Fare')
print('scaler_with_outliers:    ',scaler_with_outliers)


scaler_with_outliers:     60.26119402985074


# Min-max Scaling + Arbitrary Value + Outliers

In [17]:
# Chuẩn hóa MInMax Scaling
min_max=MinMaxScaler()
df_minmax=pd.DataFrame(min_max.fit_transform(df),columns=df.columns)

# Accuracy - With Outliers
minmax_data = gaussian_distribution(df_minmax,'Age_arbitrary')
minmax_data = skewed_distribution(minmax_data,'Fare')
minmax_with_outliers = Logistic_Regression(minmax_data,'Age_arbitrary','Fare')
print('minmax_with_outliers:    ',minmax_with_outliers)

minmax_with_outliers:     58.17164179104477


# Robust Scaler + Arbitrary Value + Outliers

In [18]:
# Chuẩn hóa Robust Scaler
scaler=RobustScaler()
df_robust_scaler=pd.DataFrame(scaler.fit_transform(df),columns=df.columns)

# Accuracy - With Outliers
robust_scale_data = gaussian_distribution(df_robust_scaler,'Age_arbitrary')
robust_scale_data = skewed_distribution(robust_scale_data,'Fare')
robust_scale_with_outliers = Logistic_Regression(robust_scale_data,'Age_arbitrary','Fare')
print('robust_scale_with_outliers:    ',robust_scale_with_outliers)

robust_scale_with_outliers:     60.186567164179095


In [19]:
print('Trước chuẩn hóa: ' , arbitrary_with_outliers)
result1 = {
    'Normalize' : [scaler_with_outliers , minmax_with_outliers, robust_scale_with_outliers]
}
pd.DataFrame(data=result1 , index=['Z-Score' , 'Min-max' , 'Robust' ] )

Trước chuẩn hóa:  60.186567164179095


Unnamed: 0,Normalize
Z-Score,60.261194
Min-max,58.171642
Robust,60.186567


Nhận xét:
- Độ chính xác dự báo  tăng khi sử dụng phương pháp chuẩn hóa Z-Score và Robust
- Độ chính xác dự báo giảm khi sử dụng phương pháp chuẩn hóa Min-Max

# Đánh giá và lựa chọn mô hình
Thử nghiệm bổ sung đặc trưng giới tính để cải thiện độ chính xác cho mô hình hồi quy logistic trên dataset Titanic
Sử dụng phương pháp xử ý dữ liệu trống Arbitrary + chuẩn hóa dữ liệu + có xử lí ngoại lệ 

In [20]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn import svm
from sklearn import preprocessing




### Hàm Cross validation

In [21]:
def Cross_validation(df,feature1,feature2,feature3):
    data = df.copy()
    sum_accuracy = 0
    for i in range(0, 9):
        clf = make_pipeline(preprocessing.StandardScaler(), svm.SVC(C=1))
        scores = cross_val_score(clf,  data[[feature1,feature2,feature3]].fillna(0),  data['Survived'], cv=5)
        sum_accuracy += scores.mean()
    return (sum_accuracy/10)*100

In [27]:
### Không áp dụng lựa chọn và đánh giá mô hình

# Z-Score
scaler_data1 = gaussian_distribution(df_scaled,'Age_arbitrary')
scaler_data1 = skewed_distribution(scaler_data1,'Fare')
scaler_data1 = skewed_distribution(scaler_data1,'Sex')
scaler1 = Logistic_Regression_NAN(scaler_data1,'Age_arbitrary','Fare','Sex')
print('Scaler without Model Evaluation and Selection:    ',scaler1)
# MinMax Scaling
minmax_data1 = gaussian_distribution(df_minmax,'Age_arbitrary')
minmax_data1 = skewed_distribution(minmax_data1,'Fare')
minmax_data1 = skewed_distribution(minmax_data1,'Sex')
minmax1 = Logistic_Regression_NAN(minmax_data1,'Age_arbitrary','Fare','Sex')
print('Minmax without Model Evaluation and Selection:    ',minmax1)
# Robust Scaler
robust_scale_data1 = gaussian_distribution(df_robust_scaler,'Age_arbitrary')
robust_scale_data1 = skewed_distribution(robust_scale_data1,'Fare')
robust_scale1 = Logistic_Regression_NAN(robust_scale_data1,'Age_arbitrary','Fare','Sex')
print('Robust without Model Evaluation and Selection:    ',robust_scale1)


### Áp dụng lựa chọn và đánh giá mô hình

#Z-Score
scaler =  Cross_validation(scaler_data1,'Age_arbitrary','Fare' , 'Sex')
print('Scaler with Model Evaluation and Selection:    ',scaler)
# MinMax Scaling
minmax = Cross_validation(minmax_data1,'Age_arbitrary','Fare', 'Sex')
print('Minmax with Model Evaluation and Selection:    ',minmax)
# Robust Scaler
robust = Cross_validation(robust_scale_data1,'Age_arbitrary','Fare','Sex')
print('Robust with Model Evaluation and Selection:    ',robust)

Scaler without Model Evaluation and Selection:     70.52238805970148
Minmax without Model Evaluation and Selection:     70.3731343283582
Robust without Model Evaluation and Selection:     70.33582089552239
Scaler with Model Evaluation and Selection:     70.60517230556776
Minmax with Model Evaluation and Selection:     70.60517230556776
Robust with Model Evaluation and Selection:     70.60517230556776


In [26]:

result2 = {
    'Without Model Evaluation and Selection' : [scaler1 , minmax1, robust_scale1],
    'With Model Evaluation and Selection' : [scaler , minmax, robust]
}
pd.DataFrame(data=result2 , index=['Z-Score' , 'Min-max' , 'Robust' ] )


Unnamed: 0,Without Model Evaluation and Selection,With Model Evaluation and Selection
Z-Score,70.522388,70.605172
Min-max,70.373134,70.605172
Robust,70.335821,70.605172


- Khi bổ sung đặc trưng giới tính thì độ chính xác cho mô hình hồi quy logistic trên dataset Titanic tăng khoảng 0.4%-0.9% khi có đánh giá và lựa chọn mô hình 
- Sử dụng các phương pháp:
    + Xử lý ngoại lệ
    + Xử lý dữ liệu trống: Arbitrary Value (Giá trị bất kì)
    + Chuẩn hóa dữ liệu
    + Đánh giá và lựa chọn mô hình : Cross validation (Đánh giá chéo)

In [25]:

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from scipy.stats import randint


X_train, X_test, y_train, y_test = X_train,X_test,y_train,y_test=train_test_split(scaler_data1[['Age_arbitrary','Fare' , 'Sex']].fillna(0),scaler_data1['Survived'],test_size=0.3, random_state=0)

# define the parameter space that will be searched over
param_distributions = {'n_estimators': randint(1, 100),
                       'max_depth': randint(100, 891)}

# now create a searchCV object and fit it to the data
search = RandomizedSearchCV(estimator=RandomForestRegressor(random_state=0),
                            n_iter=5,
                            param_distributions=param_distributions,
                            random_state=0)
search.fit(X_train, y_train)




search.best_params_

{'max_depth': 729, 'n_estimators': 65}