# 離群值處理

#### Detect outliers

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
# Silence every warning category for the remainder of the session.
warnings.filterwarnings("ignore")

# Read the training CSV and pull out the land-transfer-area column (square
# metres); this single column is the series the later cells work on.
data_path = './data/house_train.csv'
df_train = pd.read_csv(data_path)
ori_series = df_train.loc[:, '土地移轉總面積(平方公尺)']

# Rich preview of the column followed by its number of rows.
display(ori_series)
print(ori_series.size)

In [None]:
# First and third quartiles of the column, and the inter-quartile range
# between them. A value counts as an outlier here when it lies more than
# 1.5 * IQR below the first quartile or above the third quartile.
qt1, qt3 = (ori_series.quantile(q=level) for level in (0.25, 0.75))
iqr = qt3 - qt1

# Low-side outliers first, then high-side outliers.
display(ori_series.loc[ori_series < (qt1 - 1.5*iqr)])
display(ori_series.loc[ori_series > (qt3 + 1.5*iqr)])

# Box plot of the untouched series as a visual cross-check.
plt.boxplot(ori_series)
plt.show()

#### drop outliers

In [None]:
# Keep only the observations that lie strictly inside BOTH fences.
# The two comparisons must be combined element-wise with `&`: Python's `and`
# applied to two non-empty lists simply evaluates to the second list, which
# would silently discard the lower-fence condition.
# `mask` is therefore a boolean Series aligned with `ori_series`.
mask = (ori_series > (qt1 - 1.5*iqr)) & (ori_series < (qt3 + 1.5*iqr))
series_drop_outliers = ori_series[mask]

# Number of rows that survive, then a box plot of the filtered series.
print(len(series_drop_outliers))
plt.boxplot(series_drop_outliers)
plt.show()

#### fill outliers

In [None]:
# Work on a copy so the original series stays intact for the other cells.
series_fill = ori_series.copy()

# Cap every value above the upper fence at the fence itself.
# Only the high side is capped here; low-side values are left as they are.
upper_fence = qt3 + 1.5*iqr
series_fill.loc[series_fill > upper_fence] = upper_fence

# Capping replaces values instead of removing rows, so the length printed
# here is the same as the original series.
print(series_fill.size)
plt.boxplot(series_fill)
plt.show()

# Normalize

#### MinMaxScaler

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import numpy as np
# Small 3x3 example matrix; each column is treated as one feature.
X = np.array([
    [1., -1., 2.],
    [2., 0., 0.],
    [0., 1., -1.],
])

# Column-wise extremes of the raw data, printed for reference.
print(f"min of X: {X.min(axis=0)}")
print(f"max of X: {X.max(axis=0)}\n")

# Fit the scaler on X, then transform the same matrix with it.
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(X)
x_minmax_sk = min_max_scaler.transform(X)
print(x_minmax_sk)

#### StandardScaler

In [None]:
# Same 3x3 example matrix as in the MinMaxScaler cell, redefined so this
# cell can run on its own.
X = np.array([
    [1., -1., 2.],
    [2., 0., 0.],
    [0., 1., -1.],
])

# Column-wise mean and standard deviation computed directly with NumPy.
print(f"mean of X: {X.mean(axis=0)}")
print(f"std of X: {X.std(axis=0)}\n")

# Fit the scaler on X, then standardize the same matrix with the fitted
# statistics.
scaler = StandardScaler()
scaler.fit(X)
x_sc_sk = scaler.transform(X)

# Statistics stored on the fitted scaler, printed so they can be compared
# with the NumPy values above, followed by the transformed matrix.
print(f"mean of scaler: {scaler.mean_}")
print(f"std of scaler: {scaler.scale_}\n")
print(x_sc_sk)

# 資料合併

#### pd.concat()

In [None]:
# Three tiny frames used to show how pd.concat aligns rows and columns:
#   df1 and df2 share both column names (A, B) and index labels (1, 2);
#   df3 shares only column B with df1 and uses different index labels (5, 6).
df1 = pd.DataFrame({'A': ['A1', 'A2'], 'B': ['B1', 'B2']}, index=[1, 2])
df2 = pd.DataFrame({'A': ['A3', 'A4'], 'B': ['B3', 'B4']}, index=[1, 2])
df3 = pd.DataFrame({'B': ['B5', 'B6'], 'C': ['C5', 'C6']}, index=[5, 6])

# Stack df1 and df2 row-wise (the default axis).
df4 = pd.concat([df1, df2])
# Place df1 and df2 side by side, aligned on their index labels.
df5 = pd.concat([df1, df2], axis='columns')
# Stack frames whose column sets only partly overlap.
df6 = pd.concat([df1, df3])

# Inputs first, then the three concatenation results, in that order.
display(df1, df2, df3, df4, df5, df6)

#### pd.merge()

In [None]:
# Left frame: columns A-D, each holding the column letter followed by the
# row number 0-3.
df1 = pd.DataFrame(
    {col: [f'{col}{row}' for row in range(4)] for col in 'ABCD'},
    index=list(range(4)),
)

# Right frame, written row by row. It shares columns B and D with df1;
# the row labelled 3 matches df1 on B ('B3') but not on D ('D10000').
df2 = pd.DataFrame(
    [
        ['B2', 'D2', 'F2'],
        ['B3', 'D10000', 'F3'],
        ['B6', 'D6', 'F6'],
        ['B7', 'D7', 'F7'],
    ],
    columns=['B', 'D', 'F'],
    index=[2, 3, 6, 7],
)

display(df1, df2)

In [None]:
# Inner join without `on`: no key is given, so pandas joins on the columns
# the two frames have in common.
result_inner = df1.merge(df2, how='inner')
# Inner join restricted to column B as the key.
result_inner_B = df1.merge(df2, how='inner', on='B')
# Outer join without `on`: rows from both sides are kept.
result_outer = df1.merge(df2, how='outer')

display(result_inner, result_inner_B, result_outer)

#### groupby()

In [None]:
# Mean of every numeric column for each value of the district column.
# numeric_only=True is passed explicitly: pandas >= 2.0 no longer drops
# non-numeric columns automatically, so without it mean() raises TypeError
# as soon as the frame contains a text column.
df_train.groupby('鄉鎮市區').mean(numeric_only=True)

# 補充:melt 與 pivot

In [None]:
# Wide example frame: columns A-D, each holding the column letter followed
# by the row number 0-3.
df1 = pd.DataFrame(
    {col: [f'{col}{row}' for row in range(4)] for col in 'ABCD'},
    index=list(range(4)),
)

# Reshape from wide to long; with no arguments every column is unpivoted
# into a (variable, value) pair.
df_melt = pd.melt(df1)
display(df_melt)

In [None]:
# Give the long frame a repeating 0-3 index (four blocks of four rows) so
# each row carries a row label again, then spread the 'variable' column back
# out into separate columns.
df_pivot = (
    df_melt
    .set_index(pd.Index(list(range(4)) * 4))
    .pivot(columns='variable')
)
display(df_pivot)