# 資料處理常用函式

In [None]:
import pandas as pd
import numpy as np

## 空值處理

In [None]:
from IPython.display import display

# Report which DataFrame columns contain missing values.
def na_check(df_data):
    """Display the ten columns with the highest share of missing values.

    For every column the percentage of null entries is computed, columns
    whose percentage is exactly zero are discarded, and the remainder is
    sorted from most to least missing. The first ten rows of that table are
    shown with ``display``; nothing is returned.
    """
    missing_pct = df_data.isnull().sum() / len(df_data) * 100
    # Labels of the columns that have no missing values at all.
    complete_cols = missing_pct.loc[missing_pct == 0].index
    ranked = missing_pct.drop(complete_cols).sort_values(ascending=False)
    display(pd.DataFrame({'Missing Ratio': ranked}).head(10))

In [None]:
# Compute percentiles while ignoring NA values.
def get_percentile(nums, series):
    """Return the requested percentiles of ``series``, skipping NaN entries.

    Args:
        nums: Iterable of percentile ranks (0-100) to evaluate.
        series: Array-like of numeric values; NaN entries are ignored.

    Returns:
        A list holding one percentile value per entry of ``nums``, in the
        same order.
    """
    # np.percentile propagates NaN (one missing value turns every result
    # into NaN); np.nanpercentile skips NaN, which is what this helper
    # promises to do.
    return [np.nanpercentile(series, q=rank) for rank in nums]

## 特徵工程

In [None]:
# Mean encoding
def perform_mean_encoding(f1,f2,df):
    """Replace column ``f1`` with the mean of ``f2`` within each ``f1`` group.

    Args:
        f1: Name of the column to encode (the grouping key).
        f2: Name of the column whose per-group mean becomes the encoding.
        df: Source DataFrame; it is not modified.

    Returns:
        A new DataFrame with an added ``"<f1>_mean"`` column and with the
        original ``f1`` column removed.
    """
    group_means = df.groupby([f1])[f2].mean().reset_index()
    group_means = group_means.set_axis([f1, f'{f1}_mean'], axis=1)
    # Left join so every input row is kept and picks up its group's mean.
    encoded = df.merge(group_means, how='left', on=f1)
    return encoded.drop(columns=[f1])

# Mode encoding
def perform_mode_encoding(f1,f2,df):
    """Replace column ``f1`` with the mode of ``f2`` within each ``f1`` group.

    Args:
        f1: Name of the column to encode (the grouping key).
        f2: Name of the column whose per-group mode becomes the encoding.
        df: Source DataFrame; it is not modified.

    Returns:
        A new DataFrame with an added ``"<f1>_mode"`` column and with the
        original ``f1`` column removed. When several values tie for the
        mode, the first one reported by ``Series.mode`` is used. A group
        whose ``f2`` values are all missing is encoded as NaN.
    """
    def _first_mode(values):
        # Series.mode() drops missing values, so it is empty for a group
        # that contains only NaN/None; indexing [0] would raise there.
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else np.nan

    mode_df = df.groupby([f1])[f2].apply(_first_mode).reset_index()
    mode_df.columns = [f1, f'{f1}_mode']
    # Left join so every input row is kept and picks up its group's mode.
    df = pd.merge(df, mode_df, on=f1, how='left')
    return df.drop([f1], axis=1)

# Median encoding
def perform_median_encoding(f1,f2,df):
    """Replace column ``f1`` with the median of ``f2`` within each ``f1`` group.

    Args:
        f1: Name of the column to encode (the grouping key).
        f2: Name of the column whose per-group median becomes the encoding.
        df: Source DataFrame; it is not modified.

    Returns:
        A new DataFrame with an added ``"<f1>_median"`` column and with the
        original ``f1`` column removed.
    """
    median_df = df.groupby([f1])[f2].median().reset_index()
    # The previous version referred to an undefined ``mode_df`` here (a
    # copy-paste slip from the mode encoder) and so always raised NameError.
    median_df.columns = [f1, f'{f1}_median']
    # Left join so every input row is kept and picks up its group's median.
    df = pd.merge(df, median_df, on=f1, how='left')
    return df.drop([f1], axis=1)

# Quick check: all rows share the key "Jerry", and "cat" is the most frequent
# value of f2, so every row should be encoded as "cat".
df = pd.DataFrame(
    {
        "f1": ["Jerry", "Jerry", "Jerry"],
        "f2": ["dog", "cat", "cat"],
    }
)
perform_mode_encoding("f1", "f2", df)

## 特徵重要性

In [None]:
from sklearn import metrics, datasets
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
def get_feature_importance(x,y):
    """Rank the columns of ``x`` by gradient-boosting feature importance.

    A ``GradientBoostingRegressor`` with default settings is fitted on
    ``(x, y)``. Its ``feature_importances_`` are paired with ``x.columns``
    so each score can be read next to its column name, and the resulting
    Series is returned sorted from most to least important.

    ``x`` must expose a ``columns`` attribute (e.g. a DataFrame).
    """
    model = GradientBoostingRegressor()
    model.fit(x, y)
    importance_by_column = pd.Series(model.feature_importances_, index=x.columns)
    return importance_by_column.sort_values(ascending=False)

# Quick check on a synthetic regression problem: 100 samples, 5 features
# labelled f1..f5.
feature, target = datasets.make_regression(n_features=5, n_samples=100)
get_feature_importance(
    pd.DataFrame(feature, columns=[f"f{i}" for i in range(1, 6)]),
    target,
)

## 評估指標

In [None]:
from sklearn import metrics, datasets
from sklearn.linear_model import LinearRegression

In [None]:
# Regression metrics.
# scikit-learn metric functions take (y_true, y_pred) in that order. MAE and
# MSE are symmetric, but r2_score is not, so the ground truth `y` must be the
# first argument.
# NOTE: `y` and `prediction` are not defined in this file; provide them
# before running this cell.
mae = metrics.mean_absolute_error(y, prediction)  # mean absolute error
mse = metrics.mean_squared_error(y, prediction)  # mean squared error
r_square = metrics.r2_score(y, prediction)  # R-squared

# Classification metrics: AUC and F1-score (precision, recall).
# NOTE: `y_test`, `y_pred` and `y_pred_binarized` are not defined in this
# file. Presumably `y_pred` holds scores/probabilities and `y_pred_binarized`
# the thresholded class labels -- confirm against the calling code.
auc = metrics.roc_auc_score(y_test, y_pred)
f1 = metrics.f1_score(y_test, y_pred_binarized)  # F1-score
precision = metrics.precision_score(y_test, y_pred_binarized)  # precision
recall = metrics.recall_score(y_test, y_pred_binarized)  # recall

## stacking
http://rasbt.github.io/mlxtend/user_guide/classifier/StackingCVClassifier/#example-3-stacked-cv-classification-and-gridsearch

In [7]:
from sklearn import metrics, datasets
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from mlxtend.classifier import StackingCVClassifier

In [None]:
# Load the iris dataset and split it into the feature matrix X and labels y.
iris = datasets.load_iris()
X, y = iris.data, iris.target

In [None]:
# Stacked classification tuned with a cross-validated grid search.
def StackedCVClassification(X, y, random_state=42):
    """Grid-search a stacking classifier and report the best configuration.

    Three base classifiers (k-nearest neighbours, random forest, Gaussian
    naive Bayes) are stacked under a logistic-regression meta classifier
    using mlxtend's ``StackingCVClassifier``. A 5-fold ``GridSearchCV`` then
    tunes the number of neighbours, the number of trees and the meta
    classifier's ``C``.

    Args:
        X: Feature matrix passed to ``GridSearchCV.fit``.
        y: Target labels passed to ``GridSearchCV.fit``.
        random_state: Seed for the random forest and the stacking
            classifier. Defaults to 42. It replaces the module constant
            ``RANDOM_SEED``, which was never defined and made every call
            raise NameError.

    Returns:
        A dict with the keys ``"best parameters"`` (``grid.best_params_``)
        and ``"accuracy"`` (``grid.best_score_``).

    Side effects:
        Prints one line per parameter combination, followed by the best
        parameters and the best score.
    """
    # Initializing models
    clf1 = KNeighborsClassifier(n_neighbors=1)
    clf2 = RandomForestClassifier(random_state=random_state)
    clf3 = GaussianNB()
    lr = LogisticRegression()
    # use_probas is not set, so mlxtend's default applies.
    sclf = StackingCVClassifier(classifiers=[clf1, clf2, clf3],
                                meta_classifier=lr,
                                random_state=random_state)

    # Grid keys address the base estimators by their lowercased class names
    # and the meta classifier through the 'meta_classifier' prefix.
    params = {'kneighborsclassifier__n_neighbors': [1, 5],
              'randomforestclassifier__n_estimators': [10, 50],
              'meta_classifier__C': [0.1, 10.0]}
    grid = GridSearchCV(estimator=sclf,
                        param_grid=params,
                        cv=5,
                        refit=True)
    grid.fit(X, y)

    results = grid.cv_results_
    candidates = zip(results['mean_test_score'],
                     results['std_test_score'],
                     results['params'])
    for mean_score, std_score, candidate in candidates:
        # The reported spread is half the standard deviation across folds.
        print("%0.3f +/- %0.2f %r" % (mean_score, std_score / 2.0, candidate))

    print('Best parameters: %s' % grid.best_params_)
    print('Accuracy: %.2f' % grid.best_score_)
    return {"best parameters": grid.best_params_, "accuracy": grid.best_score_}

In [9]:
def get_cross_val_score(model_list,label_list,X,y):
    """Cross-validate several models and collect their mean accuracy.

    Each model in ``model_list`` is paired with the label at the same
    position in ``label_list`` and scored with 5-fold cross-validation using
    the ``'accuracy'`` metric. One summary line (mean, standard deviation,
    label) is printed per model.

    Returns:
        A dict mapping each label to the mean accuracy of its model.
    """
    mean_by_label = {}
    for clf, label in zip(model_list, label_list):
        fold_scores = model_selection.cross_val_score(
            clf, X, y, scoring='accuracy', cv=5
        )
        avg = fold_scores.mean()
        spread = fold_scores.std()
        print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (avg, spread, label))
        mean_by_label[label] = avg
    return mean_by_label

# Quick check: compare a 10-nearest-neighbours classifier with Gaussian naive
# Bayes on the X / y data loaded above.
clf1, clf2 = KNeighborsClassifier(n_neighbors=10), GaussianNB()
get_cross_val_score([clf1, clf2], ["knn", "gau"], X, y)