In [116]:
#setting up libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBRegressor
from xgboost import XGBRFClassifier

In [19]:
#load data
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')
gs_data = pd.read_csv('data/gender_submission.csv')

# Attach the columns of gs_data (including 'Survived') to the matching test rows.
# The join key is spelled out instead of relying on "whatever columns the two
# frames happen to share", and validate= makes a duplicated PassengerId fail
# loudly instead of silently multiplying rows.
# NOTE(review): in the Kaggle Titanic data set gender_submission.csv is, as far
# as I know, a sample submission rather than ground truth - confirm. If so, the
# scores computed against test_data['Survived'] measure agreement with that
# sample submission, not real accuracy.
test_data = test_data.merge(gs_data, on='PassengerId', how='inner', validate='one_to_one')

In [20]:
def data_preprocess (source_df):
    """Turn a raw passenger frame into an all-numeric frame for modelling.

    Steps, in order:
      1. drop 'PassengerId', 'Name', 'Ticket' and 'Cabin';
      2. fill missing 'Age' values with SimpleImputer (default strategy);
      3. fill missing 'Embarked' and 'Fare' values with their most frequent value;
      4. one-hot encode 'Sex' and 'Embarked'.

    Parameters
    ----------
    source_df : pandas.DataFrame
        Must contain every column named above. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Same index as ``source_df``. Column order: the untouched columns,
        then 'Age', then the one-hot columns.

    Notes
    -----
    The imputer, the fill values and the encoder are all fitted on
    ``source_df`` itself, so two frames processed by separate calls may be
    filled with different values and may end up with different one-hot columns.
    """
    categorical_cols = ['Sex', 'Embarked']

    #first we drop the unwanted columns
    source_df_droped = source_df.drop(['PassengerId','Name','Ticket','Cabin'], axis=1)

    #then we impute Age. The imputer returns a bare array, so the original index
    #and column name are restored explicitly: with a fresh 0..n-1 index the join
    #below would misalign rows for any frame whose index is not the default range.
    age_imputer = SimpleImputer()
    imputed_Age = pd.DataFrame(age_imputer.fit_transform(source_df_droped[['Age']]),
                               index=source_df_droped.index,
                               columns=['Age'])

    #adding imputed data to the source df
    source_df_imputed = source_df_droped.drop('Age', axis=1).join(imputed_Age)

    #rows are kept rather than dropped: gaps are filled with the most common value
    for col in ('Embarked', 'Fare'):
        most_common = source_df_imputed[col].value_counts().index[0]
        source_df_imputed[col] = source_df_imputed[col].fillna(most_common)

    #next we apply One-Hot. The `sparse=` keyword and `get_feature_names()` were
    #removed from newer scikit-learn releases, so the encoder is left on its
    #default (sparse) output and densified here, and the feature-name accessor
    #is chosen according to what the installed version provides.
    OH_en = OneHotEncoder(handle_unknown='ignore')
    encoded = OH_en.fit_transform(source_df_imputed[categorical_cols])
    if hasattr(encoded, 'toarray'):
        encoded = encoded.toarray()

    if hasattr(OH_en, 'get_feature_names_out'):
        encoded_names = OH_en.get_feature_names_out(categorical_cols)
    else:
        encoded_names = OH_en.get_feature_names(categorical_cols)

    #aligning index and columns with the source frame
    OH_source_df_imputed_col = pd.DataFrame(encoded,
                                            index=source_df_imputed.index,
                                            columns=encoded_names)

    processed_df = source_df_imputed.drop(categorical_cols, axis=1).join(OH_source_df_imputed_col)

    return processed_df

In [137]:
def kv_feature (train_set, test_set,k_value):
    """Keep the ``k_value`` best features according to SelectKBest(f_classif).

    The selector is fitted on ``train_set`` only; the same columns are then
    taken from ``test_set``.

    Parameters
    ----------
    train_set, test_set : pandas.DataFrame
        Both must contain a 'Survived' column; ``test_set`` must also contain
        every feature column that ends up selected.
    k_value : int or 'all'
        Forwarded to ``SelectKBest(k=...)``.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` restricted to the selected
        feature columns, in their original column order.
    """
    f_cols = train_set.columns.drop('Survived')

    #keep only the k_value highest-scoring features to limit overfitting
    selector = SelectKBest(f_classif, k=k_value)
    selector.fit(train_set[f_cols], train_set['Survived'])

    #get_support() is the selector's own boolean mask over f_cols. Inferring the
    #selection from "non-zero variance after inverse_transform" would also drop
    #a selected column that happens to be constant.
    selected_columns = f_cols[selector.get_support()]

    return train_set[selected_columns], train_set['Survived'], test_set[selected_columns], test_set['Survived']

In [22]:
def L_one_feature (train_set, test_set):
    """Select features with an L1-penalised logistic regression.

    A LogisticRegression (C=1, L1 penalty, liblinear solver, random_state=7)
    is fitted on ``train_set`` and wrapped in SelectFromModel; the columns it
    keeps are then taken from both frames.

    Parameters
    ----------
    train_set, test_set : pandas.DataFrame
        Both must contain a 'Survived' column; ``test_set`` must also contain
        every feature column that ends up selected.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` restricted to the selected
        feature columns, in their original column order.
    """
    f_cols = train_set.columns.drop('Survived')
    X, y = train_set[f_cols], train_set['Survived']

    # Setting up the model
    logistic = LogisticRegression(C=1, penalty="l1", solver='liblinear', random_state=7).fit(X, y)
    model = SelectFromModel(logistic, prefit=True)

    #get_support() is the selector's own boolean mask over the feature columns.
    #Inferring the selection from "non-zero variance after inverse_transform"
    #would also drop a selected column that happens to be constant.
    selected_columns = f_cols[model.get_support()]

    return train_set[selected_columns], train_set['Survived'], test_set[selected_columns], test_set['Survived']

In [23]:
def acc_cal (pred_data, result):
    """Return the fraction of predictions that equal the expected labels.

    Values are compared by position, so lists, numpy arrays and pandas Series
    (whatever their index labels) can be mixed freely.

    Parameters
    ----------
    pred_data : array-like
        Predicted labels.
    result : array-like
        Expected labels, same number of elements as ``pred_data``.

    Returns
    -------
    float
        Number of matching positions divided by the number of elements.

    Raises
    ------
    ValueError
        If the inputs differ in length or are empty.
    """
    # Converting to flat arrays avoids two pitfalls of `pred_data == result`:
    # two plain lists compare to a single bool, and two Series with different
    # index labels are not compared position by position.
    predicted = np.asarray(pred_data).ravel()
    expected = np.asarray(result).ravel()

    if predicted.size != expected.size:
        raise ValueError(
            "pred_data and result must have the same length, got %d and %d"
            % (predicted.size, expected.size)
        )
    if predicted.size == 0:
        raise ValueError("cannot compute the accuracy of an empty set of predictions")

    # integer count / integer size keeps the result identical to the plain ratio
    return int(np.count_nonzero(predicted == expected)) / predicted.size

In [121]:
def dt (X_train, y_train, X_test, y_test):
    """Fit a DecisionTreeClassifier (random_state=1) on the training split and
    return the score acc_cal gives its predictions on the test split."""
    classifier = DecisionTreeClassifier(random_state=1)
    classifier.fit(X_train, y_train)

    predictions = classifier.predict(X_test)
    return acc_cal(predictions, y_test)

In [122]:
# Each frame is preprocessed by its own call, so the imputer and encoder inside
# data_preprocess are fitted separately on the training and on the test data.
train_clean = data_preprocess(source_df=train_data)
test_clean = data_preprocess(source_df=test_data)

In [123]:
# Baseline: decision tree trained on every preprocessed feature.
acc_dt = dt(
    X_train=train_clean.drop(columns='Survived'),
    y_train=train_clean['Survived'],
    X_test=test_clean.drop(columns='Survived'),
    y_test=test_clean['Survived'],
)

In [124]:
# Show the decision-tree score computed in the previous cell.
acc_dt

0.784688995215311

In [60]:
#define the Random Forest Model
def rf (X_train, y_train, X_test, y_test):
    """Fit a RandomForestClassifier (random_state=1) on the training split and
    return the score acc_cal gives its predictions on the test split."""
    forest = RandomForestClassifier(random_state=1)
    forest.fit(X_train, y_train)
    return acc_cal(forest.predict(X_test), y_test)
    

In [61]:
# Random forest trained on every preprocessed feature; the score is displayed below.
acc_rf = rf(
    X_train=train_clean.drop(columns='Survived'),
    y_train=train_clean['Survived'],
    X_test=test_clean.drop(columns='Survived'),
    y_test=test_clean['Survived'],
)
acc_rf

0.7990430622009569

In [59]:
#such a low result could be caused by overfitting

In [98]:
def xgb (X_train, y_train, X_test, y_test):
    """Fit an XGBRFClassifier with n_estimators=10000 on the training split and
    return the score acc_cal gives its predictions on the test split."""
    booster = XGBRFClassifier(n_estimators=10000)
    booster.fit(X_train, y_train)
    return acc_cal(booster.predict(X_test), y_test)
    

In [99]:
# XGBRFClassifier trained on every preprocessed feature; the score is displayed below.
acc_xgb = xgb(
    X_train=train_clean.drop(columns='Survived'),
    y_train=train_clean['Survived'],
    X_test=test_clean.drop(columns='Survived'),
    y_test=test_clean['Survived'],
)
acc_xgb

0.9712918660287081

In [106]:
def xgb_tuned (X_train, y_train, X_test, y_test):
    """Fit an XGBRFClassifier (n_estimators=10000, learning_rate=0.05) on the
    training split and return the score acc_cal gives its predictions on the
    test split.

    The fit no longer requests early stopping against (X_test, y_test):
      * XGBRFClassifier is xgboost's random-forest estimator, for which early
        stopping is not implemented; recent xgboost releases raise when it is
        requested. The saved notebook outputs show the same score for this
        function and for the un-tuned xgb run.
      * the evaluation set was the very data the returned score is computed on,
        so any stopping decision would have been tuned on the test split.

    NOTE(review): if gradient boosting with early stopping was the intent,
    switch to XGBClassifier and carve the validation set out of the training
    data instead of using the test split.
    """
    xgb_model = XGBRFClassifier(n_estimators=10000, learning_rate=0.05)
    xgb_model.fit(X_train, y_train)
    pred = xgb_model.predict(X_test)
    score = acc_cal(pred, y_test)
    return score

In [107]:
# Same XGBRFClassifier run with the "tuned" settings; the score is displayed below.
acc_xgb_t = xgb_tuned(
    X_train=train_clean.drop(columns='Survived'),
    y_train=train_clean['Survived'],
    X_test=test_clean.drop(columns='Survived'),
    y_test=test_clean['Survived'],
)
acc_xgb_t

0.9712918660287081

In [127]:
#define a function to run all tested models
def run_models(X_train, y_train, X_test, y_test):
    """Score the decision tree, the random forest and the tuned XGB model on the
    same split and print the three scores, one per line, in that order.

    Nothing is returned; the scores are only printed.
    """
    # every model is scored before anything is printed
    model_runners = (dt, rf, xgb_tuned)
    scores = [run(X_train, y_train, X_test, y_test) for run in model_runners]

    for score in scores:
        print(score)
    

In [128]:
# Reference run: all three models on every preprocessed feature.
run_models(
    X_train=train_clean.drop(columns='Survived'),
    y_train=train_clean['Survived'],
    X_test=test_clean.drop(columns='Survived'),
    y_test=test_clean['Survived'],
)

0.784688995215311
0.7990430622009569
0.9712918660287081


In [154]:
# Keep the 4 features that SelectKBest(f_classif) ranks highest, then re-run every model on them.
X_kv_train, y_kv_train, X_kv_test, y_kv_test = kv_feature(
    train_set=train_clean,
    test_set=test_clean,
    k_value=4,
)
run_models(X_train=X_kv_train, y_train=y_kv_train, X_test=X_kv_test, y_test=y_kv_test)

0.8660287081339713
0.8660287081339713
0.9856459330143541


In [136]:
# Keep the features chosen by the L1-penalised logistic regression, then re-run every model on them.
X_L_one_train, y_L_one_train, X_L_one_test, y_L_one_test = L_one_feature(
    train_set=train_clean,
    test_set=test_clean,
)
run_models(X_train=X_L_one_train, y_train=y_L_one_train, X_test=X_L_one_test, y_test=y_L_one_test)

0.7751196172248804
0.7942583732057417
0.9712918660287081


In [155]:
# Inspect the training features kept by kv_feature.
X_kv_train

Unnamed: 0,Pclass,Fare,Sex_female,Sex_male
0,3,7.2500,0.0,1.0
1,1,71.2833,1.0,0.0
2,3,7.9250,1.0,0.0
3,1,53.1000,1.0,0.0
4,3,8.0500,0.0,1.0
...,...,...,...,...
886,2,13.0000,0.0,1.0
887,1,30.0000,1.0,0.0
888,3,23.4500,1.0,0.0
889,1,30.0000,0.0,1.0


In [145]:
# Inspect the training target returned by kv_feature (the 'Survived' column).
y_kv_train

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [147]:
# Inspect the test features kept by kv_feature.
# NOTE(review): the saved output below lists a single feature column, whereas the
# X_kv_train output above lists four. The execution counts (In [147] here vs.
# In [154] for the k=4 selection) suggest this output comes from an earlier run -
# re-run the notebook top to bottom to refresh it.
X_kv_test

Unnamed: 0,Sex_female
0,0.0
1,1.0
2,0.0
3,0.0
4,1.0
...,...
413,0.0
414,1.0
415,0.0
416,0.0
