In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

df=pd.read_csv('titanic.csv', usecols=['Age','Fare','Survived'])
df.head()


Unnamed: 0,Survived,Age,Fare
0,0,22.0,7.25
1,1,38.0,71.2833
2,1,26.0,7.925
3,1,35.0,53.1
4,0,35.0,8.05


# Bài 1:
# Hàm xử lý ngoại lệ
# Dữ liệu có dạng phân bố chuẩn

In [2]:
def gaussian_distribution(df,feature_age):
    uppper_boundary_age=df['Age'].mean() + 3* df[feature_age].std()
    lower_boundary_age=df['Age'].mean() - 3* df[feature_age].std()

    gaussian_distribution_data=df.copy()
    gaussian_distribution_data.loc[gaussian_distribution_data[feature_age]>=round(uppper_boundary_age),feature_age]=round(uppper_boundary_age)
    gaussian_distribution_data.loc[gaussian_distribution_data[feature_age]<=round(lower_boundary_age),feature_age]=round(lower_boundary_age)
    return gaussian_distribution_data

# Dữ liệu có dạng phân bố lệch

In [3]:
def skewed_distribution(df,feature_fare):
    IQR=df.Fare.quantile(0.75)-df.Fare.quantile(0.25)

    lower_bridge_fare=df[feature_fare].quantile(0.25)-(IQR*3)
    upper_bridge_fare=df[feature_fare].quantile(0.75)+(IQR*3)

    skewed_distribution_data=df.copy()
    skewed_distribution_data.loc[skewed_distribution_data[feature_fare]>=round(upper_bridge_fare,2),feature_fare]=round(upper_bridge_fare,2)
    skewed_distribution_data.loc[skewed_distribution_data[feature_fare]<=round(lower_bridge_fare,2),feature_fare]=round(lower_bridge_fare,2)
    
    return skewed_distribution_data

# Hàm tính độ chính xác trung bình

In [4]:
def Logistic_Regression(df,feature1,feature2):
    data = df.copy()
    sum_accuracy = 0
    for random_state in range(0, 9):
        X_train,X_test,y_train,y_test=train_test_split(data[[feature1,feature2]].fillna(0),data['Survived'],test_size=0.3, random_state=random_state)
        classifier=LogisticRegression()
        classifier.fit(X_train,y_train)
        y_pred=classifier.predict(X_test)
        sum_accuracy += accuracy_score(y_test,y_pred)
    return sum_accuracy/10

def Logistic_Regression_NAN(df,feature1,feature2,feature3):
    data = df.copy()
    sum_accuracy = 0
    for random_state in range(0, 9):
        X_train,X_test,y_train,y_test=train_test_split(data[[feature1,feature2,feature3]].fillna(0),data['Survived'],test_size=0.3, random_state=random_state)
        classifier=LogisticRegression()
        classifier.fit(X_train,y_train)
        y_pred=classifier.predict(X_test)
        sum_accuracy += accuracy_score(y_test,y_pred)
        # print("Accuracy_score: {}".format(accuracy_score(y_test,y_pred)))
    return sum_accuracy/10
    
def Logistic_Regression_pipeline(df,feature1,feature2):
    pipe = make_pipeline(
     StandardScaler(),
     LogisticRegression()
)
    data = df.copy()
    sum_accuracy = 0
    for random_state in range(0, 9):
        X_train,X_test,y_train,y_test=train_test_split(data[[feature1,feature2]].fillna(0),data['Survived'],test_size=0.3, random_state=random_state)
        pipe.fit(X_train, y_train)
        sum_accuracy += accuracy_score(pipe.predict(X_test), y_test)
        # print("Accuracy_score: {}".format(accuracy_score(y_test,y_pred)))
    return sum_accuracy/10

    #model selection - Automatic parameter searches

def Logistic_Regression_cross_APS(df,feature1,feature2):
    data = df.copy()
    sum_accuracy = 0
    for random_state in range(0, 9):
        X_train, X_test, y_train, y_test = train_test_split(data[[feature1,feature2]].fillna(0), data['Survived'], random_state=random_state)

        # define the parameter space that will be searched over
        param_distributions = {'n_estimators': randint(1, 100),
                                'max_depth': randint(100,891)}

        # now create a searchCV object and fit it to the data
        search = RandomizedSearchCV(estimator=RandomForestRegressor(random_state=random_state),
                                    n_iter=5,
                                    param_distributions=param_distributions,
                                    random_state=random_state)
        search.fit(X_train, y_train)
        # search.best_params_

        # the search object now acts like a normal random forest estimator
        # with max_depth=9 and n_estimators=4
        sum_accuracy += search.score(X_test, y_test)
    return sum_accuracy/10
  
    

# Xử lý dữ liệu trống

In [5]:
# Mean
mean=df['Age'].mean()
df['Age_mean']=df['Age'].fillna(round(mean)) 

# Accuracy - Without Outliers
mean_without_outliers = Logistic_Regression(df,'Age_mean','Fare')
print('mean_without_outliers: ',mean_without_outliers)

# Accuracy - With Outliers
mean_data = gaussian_distribution(df,'Age_mean')
mean_data = skewed_distribution(mean_data,'Fare')
mean_with_outliers = Logistic_Regression(mean_data,'Age_mean','Fare')
print('mean_with_outliers: ',mean_with_outliers)
df.tail()

mean_without_outliers:  0.5906716417910448
mean_with_outliers:  0.6029850746268657


Unnamed: 0,Survived,Age,Fare,Age_mean
886,0,27.0,13.0,27.0
887,1,19.0,30.0,19.0
888,0,,23.45,30.0
889,1,26.0,30.0,26.0
890,0,32.0,7.75,32.0


In [6]:
# Median 

median=df['Age'].median()       
df['Age_median']=df['Age'].fillna(median)

# Accuracy - Without Outliers
median_without_outliers = Logistic_Regression(df,'Age_median','Fare')
print('median_without_outliers: ',median_without_outliers)

# Accuracy - With Outliers
median_data = gaussian_distribution(df,'Age_median')
median_data = skewed_distribution(median_data,'Fare')
median_with_outliers = Logistic_Regression(median_data,'Age_median','Fare')
print('median_with_outliers: ',median_with_outliers)

df.tail()

median_without_outliers:  0.5899253731343282
median_with_outliers:  0.603358208955224


Unnamed: 0,Survived,Age,Fare,Age_mean,Age_median
886,0,27.0,13.0,27.0,27.0
887,1,19.0,30.0,19.0,19.0
888,0,,23.45,30.0,28.0
889,1,26.0,30.0,26.0,26.0
890,0,32.0,7.75,32.0,32.0


In [7]:
#Mode
mode=df['Age'].mode()
df['Age_mode']=df['Age'].fillna(mode[0])

# Accuracy - Without Outliers
mode_without_outliers = Logistic_Regression(df,'Age_mode','Fare')
print('mode_without_outliers: ',mode_without_outliers)

# Accuracy - With Outliers
mode_data = gaussian_distribution(df,'Age_mode')
mode_data = skewed_distribution(mode_data,'Fare')
mode_with_outliers = Logistic_Regression(mode_data,'Age_mode','Fare')
print('mode_with_outliers:    ',mode_with_outliers)

df.tail()


mode_without_outliers:  0.5895522388059701
mode_with_outliers:     0.6018656716417911


Unnamed: 0,Survived,Age,Fare,Age_mean,Age_median,Age_mode
886,0,27.0,13.0,27.0,27.0,27.0
887,1,19.0,30.0,19.0,19.0,19.0
888,0,,23.45,30.0,28.0,24.0
889,1,26.0,30.0,26.0,26.0,26.0
890,0,32.0,7.75,32.0,32.0,32.0


In [8]:
#Random

random_samples = df['Age'].dropna().sample(n=df['Age'].isnull().sum(),random_state=0)
random_samples.index = df[df['Age'].isnull()].index
df['Age_random']=df['Age']
df.loc[df['Age'].isnull(), 'Age_random']=random_samples

# Accuracy - Without Outliers
random_without_outliers = Logistic_Regression(df,'Age_random','Fare')
print('random_without_outliers: ',random_without_outliers)

# Accuracy - With Outliers
random_data = gaussian_distribution(df,'Age_random')
random_data = skewed_distribution(random_data,'Fare')
random_with_outliers = Logistic_Regression(random_data,'Age_random','Fare')
print('random_with_outliers:    ',random_with_outliers)

df.tail()

random_without_outliers:  0.5906716417910449
random_with_outliers:     0.6044776119402986


Unnamed: 0,Survived,Age,Fare,Age_mean,Age_median,Age_mode,Age_random
886,0,27.0,13.0,27.0,27.0,27.0,27.0
887,1,19.0,30.0,19.0,19.0,19.0,19.0
888,0,,23.45,30.0,28.0,24.0,15.0
889,1,26.0,30.0,26.0,26.0,26.0,26.0
890,0,32.0,7.75,32.0,32.0,32.0,32.0


In [9]:
# End of dist (Giá trị đuôi của phân bố)

extreme = df.Age.mean() + 3*df.Age.std()
df['Age_end_dist']=df['Age'].fillna(round(extreme))

# Accuracy - Without Outliers
end_dist_without_outliers = Logistic_Regression(df,'Age_end_dist','Fare')
print('end_dist_without_outliers: ',end_dist_without_outliers)

# Accuracy - With Outliers
end_dist_data = gaussian_distribution(df,'Age_end_dist')
end_dist_data = skewed_distribution(end_dist_data,'Fare')
end_dist_with_outliers = Logistic_Regression(end_dist_data,'Age_end_dist','Fare')
print('end_dist_with_outliers:    ',end_dist_with_outliers)

df.tail()


end_dist_without_outliers:  0.5988805970149255
end_dist_with_outliers:     0.6026119402985075


Unnamed: 0,Survived,Age,Fare,Age_mean,Age_median,Age_mode,Age_random,Age_end_dist
886,0,27.0,13.0,27.0,27.0,27.0,27.0,27.0
887,1,19.0,30.0,19.0,19.0,19.0,19.0,19.0
888,0,,23.45,30.0,28.0,24.0,15.0,73.0
889,1,26.0,30.0,26.0,26.0,26.0,26.0,26.0
890,0,32.0,7.75,32.0,32.0,32.0,32.0,32.0


In [10]:
# Arbitrary Value (Giá trị bất kì)

df['Age_arbitrary'] = df['Age'].fillna(np.random.randint(0, 100))

# Accuracy - Without Outliers
arbitrary_without_outliers = Logistic_Regression(df,'Age_arbitrary','Fare')
print('arbitrary_without_outliers: ',arbitrary_without_outliers)

# Accuracy - With Outliers
arbitrary_data = gaussian_distribution(df,'Age_arbitrary')
arbitrary_data = skewed_distribution(arbitrary_data,'Fare')
arbitrary_with_outliers = Logistic_Regression(arbitrary_data,'Age_arbitrary','Fare')
print('arbitrary_with_outliers:    ',arbitrary_with_outliers)

df.tail()

arbitrary_without_outliers:  0.5962686567164179
arbitrary_with_outliers:     0.5996268656716417


Unnamed: 0,Survived,Age,Fare,Age_mean,Age_median,Age_mode,Age_random,Age_end_dist,Age_arbitrary
886,0,27.0,13.0,27.0,27.0,27.0,27.0,27.0,27.0
887,1,19.0,30.0,19.0,19.0,19.0,19.0,19.0,19.0
888,0,,23.45,30.0,28.0,24.0,15.0,73.0,47.0
889,1,26.0,30.0,26.0,26.0,26.0,26.0,26.0,26.0
890,0,32.0,7.75,32.0,32.0,32.0,32.0,32.0,32.0


In [11]:
# New feature 

df['Age_NAN']=np.where(df['Age'].isnull(),1,0)

# Accuracy - Without Outliers
NAN_without_outliers = Logistic_Regression_NAN(df,'Age_random','Fare','Age_NAN')
print('NAN_without_outliers: ',NAN_without_outliers)

# Accuracy - With Outliers
NAN_data = gaussian_distribution(df,'Age_random')
NAN_data = gaussian_distribution(NAN_data,'Age_NAN')
NAN_data = skewed_distribution(NAN_data,'Fare')
NAN_with_outliers = Logistic_Regression_NAN(NAN_data,'Age_random','Fare','Age_NAN')
print('NAN_with_outliers:    ',NAN_with_outliers)

df.tail()

NAN_without_outliers:  0.5966417910447761
NAN_with_outliers:     0.6044776119402986


Unnamed: 0,Survived,Age,Fare,Age_mean,Age_median,Age_mode,Age_random,Age_end_dist,Age_arbitrary,Age_NAN
886,0,27.0,13.0,27.0,27.0,27.0,27.0,27.0,27.0,0
887,1,19.0,30.0,19.0,19.0,19.0,19.0,19.0,19.0,0
888,0,,23.45,30.0,28.0,24.0,15.0,73.0,47.0,1
889,1,26.0,30.0,26.0,26.0,26.0,26.0,26.0,26.0,0
890,0,32.0,7.75,32.0,32.0,32.0,32.0,32.0,32.0,0


In [12]:
results = {
    'Without Outliers' : [mean_without_outliers, median_without_outliers , mode_without_outliers , random_without_outliers , end_dist_without_outliers , arbitrary_without_outliers , NAN_without_outliers],
    'Outliers' : [mean_with_outliers , median_with_outliers, mode_with_outliers , random_with_outliers ,end_dist_with_outliers , arbitrary_with_outliers , NAN_with_outliers]
}
pd.DataFrame(data=results , index=['Mean' , 'Median' , 'Mode' , 'Random' , 'End Dist' , 'Arbitrary' , 'Age + NAN'])

Unnamed: 0,Without Outliers,Outliers
Mean,0.590672,0.602985
Median,0.589925,0.603358
Mode,0.589552,0.601866
Random,0.590672,0.604478
End Dist,0.598881,0.602612
Arbitrary,0.596269,0.599627
Age + NAN,0.596642,0.604478


- Khi xử lý ngoại lệ độ chính xác sẽ tăng trong tất cả các phương pháp làm sạch dữ liệu
- Kết quả tốt nhất là 0.605597 , khi xử lý ngoại lệ và sử dụng các phương thức xử lý dữ liệu trống 
+ New feature (thay dữ liệu trống bằng phương pháp Random + Thêm đặc trưng mới Age_NAN) 
+ Random

# Bài 2
# Các phương pháp chuẩn hóa dữ liệu

In [13]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler


# Z-Score + Arbitrary Value + Outliers

In [None]:
# Xử lý dữ liệu trống
# df['Age_NAN']=np.where(df['Age'].isnull(),1,0)

# Chuẩn hóa
scaler=StandardScaler()
df_scaled=pd.DataFrame(scaler.fit_transform(df),columns=df.columns)
df_scaled['Survived'] = df['Survived']

# Accuracy - With Outliers
scaler_data = gaussian_distribution(df_scaled,'Age_arbitrary')
# scaler_data = gaussian_distribution(df_scaled,'Age_random')
# scaler_data = gaussian_distribution(scaler_data,'Age_NAN')
scaler_data = skewed_distribution(scaler_data,'Fare')
scaler_with_outliers = Logistic_Regression(scaler_data,'Age_arbitrary','Fare')
print('scaler_with_outliers:    ',scaler_with_outliers)


# Min-max Scaling + Arbitrary Value + Outliers

In [None]:
# Chuẩn hóa
min_max=MinMaxScaler()
df_minmax=pd.DataFrame(min_max.fit_transform(df),columns=df.columns)

# Accuracy - With Outliers

minmax_data = gaussian_distribution(df_minmax,'Age_arbitrary')
# minmax_data = gaussian_distribution(df_minmax,'Age_random')
# minmax_data = gaussian_distribution(minmax_data,'Age_NAN')
minmax_data = skewed_distribution(minmax_data,'Fare')
minmax_with_outliers = Logistic_Regression(minmax_data,'Age_arbitrary','Fare')
# minmax_with_outliers = Logistic_Regression_NAN(minmax_data,'Age_random','Fare','Age_NAN')
print('minmax_with_outliers:    ',minmax_with_outliers)

minmax_with_outliers:     0.5828358208955223


# Robust Scaler + Arbitrary Value + Outliers

In [None]:
# Chuẩn hóa
scaler=RobustScaler()
df_robust_scaler=pd.DataFrame(scaler.fit_transform(df),columns=df.columns)

# Accuracy - With Outliers
robust_scale_data = gaussian_distribution(df_robust_scaler,'Age_arbitrary')
robust_scale_data = skewed_distribution(robust_scale_data,'Fare')
robust_scale_with_outliers = Logistic_Regression(robust_scale_data,'Age_arbitrary','Fare')
print('robust_scale_with_outliers:    ',robust_scale_with_outliers)

robust_scale_with_outliers:     0.6018656716417911


In [None]:
print('Trước chuẩn hóa: ' , arbitrary_with_outliers)
result1 = {
    'Normalize' : [scaler_with_outliers , minmax_with_outliers, robust_scale_with_outliers]
}
pd.DataFrame(data=result1 , index=['Z-Score' , 'Min-max' , 'Robust' ] )

Trước chuẩn hóa:  0.6018656716417911


Unnamed: 0,Normalize
Z-Score,0.602239
Min-max,0.582836
Robust,0.601866


Nhận xét:
- Độ chính xác dự báo tăng khi sử dụng phương pháp chuẩn hóa Z-Score 
- Độ chính xác dự báo giảm khi sử dụng phương pháp chuẩn hóa Min-Max
- Độ chính xác dự báo không đổi khi sử dụng phương pháp chuẩn hóa Robust

# Tiền xử lý

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint



In [None]:

#Z-Score
scaler_pipeline = Logistic_Regression_pipeline(scaler_data,'Age_arbitrary','Fare')
print('scaler_pipeline:    ',scaler_pipeline)
#MinMax Scaling
minmax_pipeline = Logistic_Regression_pipeline(minmax_data,'Age_arbitrary','Fare')
print('minmax_pipeline:    ',minmax_pipeline)
# Robust Scaler
robust_pipeline = Logistic_Regression_pipeline(robust_scale_data,'Age_arbitrary','Fare')
print('minmax_pipeline:    ',robust_pipeline)


scaler_pipeline:     0.6018656716417911


In [None]:

result2 = {
    'No Pre-processors' : [scaler_with_outliers , minmax_with_outliers, robust_scale_with_outliers],
    'Pre-processors' : [scaler_pipeline , minmax_pipeline, robust_pipeline]
}
pd.DataFrame(data=result2 , index=['Z-Score' , 'Min-max' , 'Robust' ] )


Unnamed: 0,No Pre-processors,Pre-processors
Z-Score,0.602239,0.601866
Min-max,0.582836,0.602612
Robust,0.601866,0.601866


- Độ chính xác dự báo chỉ tăng nhẹ khi có tiền xử lý