# 資料探索與分析(Exploratory Data Analysis, EDA) 

## 從兩個面向分析：
### 1. 描述統計量(Descriptive statistics)。
### 2. 資料視覺化(Data Visualization)：統計圖。

In [None]:
# Display the EDA overview figure inline; the path is relative to the
# notebook's working directory.
from IPython.display import Image
Image('./EDA.png', width=800)

In [None]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Use a CJK-capable font so Chinese titles and labels render instead of empty
# boxes. Matplotlib tries the listed families in order and uses the first one
# that is installed, so Windows/Linux fallbacks follow the macOS font.
from matplotlib.font_manager import FontProperties
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS', 'Microsoft JhengHei', 'Noto Sans CJK TC']
# Draw the minus sign as an ASCII hyphen; many CJK fonts lack the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

## 鐵達尼號生存預測

In [None]:
# Load seaborn's Titanic example dataset and preview the first 10 rows.
df = sns.load_dataset('titanic')
df.head(10)

## 平均數(Mean)
### $\begin{equation}\bar{x} = \frac{\displaystyle\sum_{i=1}^{n}x_{i}}{n}\end{equation}$
###  ***x&#772;*** ：樣本平均數，&mu;：母體平均數

In [None]:
# Mean of the `survived` column, using attribute-style column access.
df.survived.mean()

In [None]:
# Same mean, using bracket-style column access (works for any column name).
df['survived'].mean()

In [None]:
# Summary statistics with pandas' default column selection.
df.describe()

In [None]:
# Print column names, dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics restricted to object-dtype columns.
df.describe(include='O')

In [None]:
# Summary statistics for every column, whatever its dtype.
df.describe(include='all')

## 中位數(Median)：較不受離群值(Outlier)影響。
### 資料集筆數為單數：第 $\begin{equation}\frac{n+1}{2}\end{equation}$ 筆
### 資料集筆數為偶數：第 $\begin{equation}\frac{n}{2} \; 與 \; \frac{n}{2} + 1\end{equation}$ 筆的平均數


In [None]:
# Median and mean of age, shown together as a tuple for comparison.
df.age.median(), df.age.mean()

In [None]:
# Box plot of the age column.
df.boxplot(column=['age'])
plt.show()

In [None]:
# Box plots of age and fare drawn in the same figure.
df.boxplot(column=['age', 'fare'])
plt.show()

In [None]:
# Age distribution per embarkation code; hue repeats x so each box gets its own colour.
sns.boxplot(data=df, x='embarked', y='age', hue='embarked')
plt.show()

In [None]:
# Age distribution per passenger class; hue repeats x so each box gets its own colour.
sns.boxplot(data=df, x='pclass', y='age', hue='pclass')
plt.show()

In [None]:
# Box plot of the `survived` column per passenger class.
sns.boxplot(data=df, x='pclass', y='survived', hue='pclass')
plt.show()

## 眾數(Mode)：發生頻率最高的數值，多數決。
### 可能有多組，故回傳資料為 Series(可包含多個值)。

In [None]:
# Most frequent age value(s).
df.age.mode()

In [None]:
# Frequency of each distinct age.
# To list every row instead of pandas' truncated view, uncomment:
# pd.options.display.max_rows = None
df.age.value_counts()

In [None]:
# Number of rows per passenger class.
sns.countplot(data=df, x='pclass')
plt.show()

In [None]:
# Histogram of age using the pandas plotting API.
df['age'].plot.hist(title='年齡')
plt.show()

In [None]:
# Histogram of age using seaborn, with 10 bins.
sns.histplot(df['age'], bins=10)
plt.show()

In [None]:
# Histogram of the natural log of age, with 20 bins.
sns.histplot(np.log(df['age']), bins=20)
plt.show()

## 偏態(Skewness)

In [None]:
# Small salary sample containing one very large value (189000).
df = pd.DataFrame({'Name': ['Dan', 'Joann', 'Pedro', 'Rosie', 'Ethan', 'Vicky', 'Frederic'],
                   'Salary': [50000, 54000, 50000, 189000, 55000, 40000, 59000]})

salary = df['Salary']
sns.histplot(salary, kde=True)
# Label the two reference lines so the legend says which is the mean and which
# is the median; without labels the colours alone are ambiguous.
plt.axvline(salary.mean(), color='magenta', linestyle='dashed', linewidth=2, label='mean')
plt.axvline(salary.median(), color='green', linestyle='dashed', linewidth=2, label='median')
plt.legend()
plt.show()

### 資料呈現偏態(skewed)，右邊形成長尾(long tail)，稱之為右偏態(right-skewed)或正偏態，因為有極少的高值將平均數拉向右邊。

In [None]:
# Weekly-hours sample containing one noticeably low value (30).
df = pd.DataFrame({'Name': ['Dan', 'Joann', 'Pedro', 'Rosie', 'Ethan', 'Vicky', 'Frederic'],
                   'Hours': [41, 40, 36, 30, 35, 39, 40]})
Hours = df['Hours']
sns.histplot(Hours, kde=True)
# Label the two reference lines so the legend says which is the mean and which
# is the median; without labels the colours alone are ambiguous.
plt.axvline(Hours.mean(), color='magenta', linestyle='dashed', linewidth=2, label='mean')
plt.axvline(Hours.median(), color='green', linestyle='dashed', linewidth=2, label='median')
plt.legend()
plt.show()

### 資料呈左偏態(left-skewed)或負偏態。

In [None]:
# Load the president-heights CSV from the local data folder and display it.
df = pd.read_csv('./data/president_heights.csv')
df

In [None]:
# `height` is reused by the following cells (skewness, kurtosis, variance, ...).
height = df['height(cm)']
sns.histplot(height, kde=True)
# Label the two reference lines so the legend says which is the mean and which
# is the median; without labels the colours alone are ambiguous.
plt.axvline(height.mean(), color='magenta', linestyle='dashed', linewidth=2, label='mean')
plt.axvline(height.median(), color='green', linestyle='dashed', linewidth=2, label='median')
plt.legend()
plt.show()

### 資料無偏態。

In [None]:
# Skewness of the height column.
height.skew()

## 峰度(Kurtosis)：衡量資料集中的程度。
### Pandas 峰度為修正過的值，常態分配峰度=0(未修正的值=3)。

In [None]:
# Kurtosis of the height column.
height.kurt()

# 衡量資料變異的程度(Measures of Variance)
### 1. 全距(Range)：最大值 - 最小值。
### 2. 百分位數、四分位數。
### 3. 變異數、標準差。

In [None]:
# Range: print the maximum and minimum, then show their difference.
print(height.max(), height.min())
height.max() - height.min()

In [None]:
# Quartiles: read them from the default describe() summary.
height.describe()

In [None]:
# Percentiles: request every tenth percentile (0.1 up to 0.9) in the summary.
height.describe(percentiles=np.arange(0.1, 1.0, 0.1))

## 箱形圖(box plot)或盒鬚圖，可以觀察：
### 1. 中位數
### 2. 四分位數(Q1~Q3)
### 3. 最小值、最大值
### 4. 離群值

In [None]:
# Display the annotated box-plot figure inline; the path is relative to the
# notebook's working directory.
from IPython.display import Image
Image('Boxplot.png', width=1000)

In [None]:
# Show the installed seaborn version.
sns.__version__

In [None]:
# IPython magic: upgrade seaborn in the current kernel's environment.
# NOTE(review): the kernel probably needs a restart before the new version loads — confirm.
%pip install seaborn -U

In [None]:
# Tip amount per day of the week, one coloured box per day.
df2 = sns.load_dataset('tips')
sns.boxplot(data=df2, x='day', y='tip', hue='day')
plt.show()

## 離群值(Outliers)發生的原因包括輸入錯誤、感測器錯誤或是異常訊號，後者可能是設備故障的前兆。
## 需判別發生原因後，再決定處理方式。

## 變異數(Variance)：衡量資料散佈的程度。
### 母體變異數(Variance)： $\begin{equation}\sigma^{2} = \frac{\displaystyle\sum_{i=1}^{N} (X_{i} -\mu)^{2}}{N}\end{equation}$
### 樣本變異數： $\begin{equation}s^{2} = \frac{\displaystyle\sum_{i=1}^{n} (x_{i} -\bar{x})^{2}}{n-1}\end{equation}$
### 母體標準差(Standard Deviation)： $\begin{equation}\sigma = \sqrt{\frac{\displaystyle\sum_{i=1}^{N} (X_{i} -\mu)^{2}}{N}}\end{equation}$
### 樣本標準差：$\begin{equation}s = \sqrt{\frac{\displaystyle\sum_{i=1}^{n} (x_{i} -\bar{x})^{2}}{n-1}}\end{equation}$

In [None]:
# Sample variance: divide by n - 1 (ddof=1 is also the pandas default).
height.var(ddof=1)

In [None]:
# Population variance: ddof=0 divides by N instead of n - 1.
height.var(ddof=0)

In [None]:
# Sample standard deviation: divide by n - 1 (ddof=1 is also the pandas default).
height.std(ddof=1)

In [None]:
# Population standard deviation: ddof=0 divides by N instead of n - 1.
height.std(ddof=0)

## 常態分配(Normal Distribution)與標準差(Standard Deviation)

In [None]:
import scipy.stats as stats

# Draw 100,000 samples from a standard normal distribution. The fixed seed
# keeps the figure identical on every run.
rng = np.random.default_rng(42)
df = pd.DataFrame(rng.standard_normal((100000, 1)), columns=['Grade'])

# Plot the distribution as a histogram with a density curve.
# `x` holds the histogram bin edges; the KDE is evaluated at those points.
grade = df['Grade']
density = stats.gaussian_kde(grade)
n, x, _ = plt.hist(grade, color='lightgrey', density=True, bins=100)
plt.plot(x, density(x))

# Sample mean and standard deviation, reused for every annotation below.
s = grade.std()
m = grade.mean()

# Horizontal bar spanning mean ± 1 standard deviation.
x1 = [m - s, m + s]
y1 = [0.25, 0.25]
plt.plot(x1, y1, color='magenta')
plt.annotate('1s (68.27%)', (x1[1], y1[1]))

# Horizontal bar spanning mean ± 2 standard deviations.
x2 = [m - (s * 2), m + (s * 2)]
y2 = [0.05, 0.05]
plt.plot(x2, y2, color='green')
plt.annotate('2s (95.45%)', (x2[1], y2[1]))

# Horizontal bar spanning mean ± 3 standard deviations.
x3 = [m - (s * 3), m + (s * 3)]
y3 = [0.005, 0.005]
plt.plot(x3, y3, color='orange')
plt.annotate('3s (99.73%)', (x3[1], y3[1]))

# Mark the location of the mean.
plt.axvline(m, color='grey', linestyle='dashed', linewidth=1)

plt.show()

# 資料視覺化(Data Visualization)：常用統計圖

In [None]:
# Titanic dataset; column descriptions: https://www.kaggle.com/competitions/titanic/data
df = sns.load_dataset('titanic')
df.head(10)

## 長條圖(Bar Chart)：比較各類資料的大小。

In [None]:
# Number of rows for each value of `survived`.
df['survived'].value_counts()

In [None]:
# Bar chart of the survival counts using the pandas plotting API.
plt.title('生存人數統計', fontsize=20)
df['survived'].value_counts().plot.bar()
plt.show()

In [None]:
# Same chart with seaborn: countplot counts the rows itself.
plt.title('生存人數統計', fontsize=20)
sns.countplot(x='survived', data=df)
plt.show()

In [None]:
# Survival counts split by port of embarkation (one coloured bar per town).
plt.title('生存人數統計', fontsize=20)
sns.countplot(x='survived', hue='embark_town', data=df)
plt.show()

In [None]:
# Bar chart of age for each value of `survived`.
sns.barplot(data=df, x='survived', y='age')
plt.title("平均年齡", fontsize=20)
plt.show()

In [None]:
# Bar chart of age for each value of `survived`, split by passenger class.
sns.barplot(data=df, x='survived', y='age', hue='pclass')
plt.title("平均年齡", fontsize=20)
plt.show()

## 直方圖(Histogram)：針對連續型變數分組(bins)統計筆數。

In [None]:
# Histogram of age using the pandas plotting API.
df['age'].plot.hist(title='年齡')  # optionally pass bins=5 to change the number of bins
plt.show()

In [None]:
# Histogram of age using seaborn, with 10 bins.
sns.histplot(df['age'], bins=10)
plt.show()

In [None]:
# Kernel density estimate of age.
sns.kdeplot(df['age'])
plt.show()

## 餅圖(Pie Chart)：觀察各類資料的比例

In [None]:
# Relabel the 0/1 index before plotting so each wedge label is tied to its own
# value. A positional `labels=[...]` list would silently mislabel the wedges
# if the order returned by value_counts() ever changed.
survival_counts = df['survived'].value_counts().rename(index={0: '死亡', 1: '生存'})
survival_counts.plot.pie(title='生存人數統計')
plt.legend()
plt.show()

In [None]:
# Share of passengers per port of embarkation.
town_counts = df['embark_town'].value_counts()
# Pull out only the first wedge; size the list from the data so it always
# matches the number of wedges.
explode = [0.2] + [0.0] * (len(town_counts) - 1)
town_counts.plot.pie(title='登船港口人數統計', explode=explode, autopct='%1.1f%%', shadow=True)
plt.show()

In [None]:
# Passenger counts per port of embarkation as a NumPy array.
value_list = df['embark_town'].value_counts().to_numpy()
value_list

In [None]:
def show_value(pct, allvals):
    """Format a pie-wedge label as the absolute count plus its percentage.

    Args:
        pct: The wedge's share of the whole, in percent (0-100).
        allvals: The values of all wedges; their sum is the total.

    Returns:
        A two-line string such as ``"644\\n(72.4%)"``.
    """
    # Round instead of truncating: pct is a float, so pct/100*total can land
    # just below the true integer count and int() alone would give count - 1.
    absolute = int(round(pct / 100.0 * np.sum(allvals)))
    return f"{absolute:d}\n({pct:.1f}%)"

# Share of passengers per port of embarkation, labelled with count and percentage.
# The counts are computed here so the labels always match the plotted data,
# whatever order the notebook cells were run in.
town_counts = df['embark_town'].value_counts()
# Pull out only the first wedge; size the list from the data so it always
# matches the number of wedges.
explode = [0.2] + [0.0] * (len(town_counts) - 1)
town_counts.plot.pie(title='登船港口人數統計', explode=explode, shadow=True,
                     autopct=lambda pct: show_value(pct, town_counts))
plt.show()

## 散佈圖(Scatter Plot)：觀察定量特徵間的關係，也可以觀察是否有離群值(outliers)。

In [None]:
# Scatter plot of `survived` against age.
df.plot.scatter(title='生存與年齡是否高度相關?', x='age', y='survived')
plt.show()

In [None]:
# seaborn's `tips` dataset records restaurant bills, so the title refers to
# the total bill amount rather than a taxi fare.
df = sns.load_dataset('tips')
df.plot.scatter(title='小費與總消費金額是否高度相關?', x='total_bill', y='tip')
plt.show()

In [None]:
# Same scatter plot drawn with seaborn.
sns.scatterplot(x='total_bill', y='tip', data=df)
plt.show()

In [None]:
# Colour the points by the `sex` column.
sns.scatterplot(x='total_bill', y='tip', hue='sex', data=df)
plt.show()

## 折線圖(Line Chart)：觀察時間的趨勢

In [None]:
# Load the airline-passenger CSV: skip the file's first row and supply our
# own column names.
df = pd.read_csv('./data/international-airline-passengers.csv', skiprows=1, names=['YearMonth', '乘客數'])
# Append a day so each value parses as a full date
# (assumes the raw values look like "YYYY-MM" — TODO confirm against the file).
df['YearMonth'] = pd.to_datetime(df['YearMonth'] + '-01')
df.plot(title='Airline passengers', x='YearMonth', y='乘客數', legend=None)
plt.show()

In [None]:
# Same line chart drawn with seaborn.
sns.lineplot(x='YearMonth', y='乘客數', data=df)
plt.show()

## Pair Plot

In [None]:
df = sns.load_dataset('tips')
# Encode every categorical column as plain integers. All four columns,
# including `time`, are cast with astype(int): a column left as a categorical
# dtype is not treated as numeric, so it would be missing from the pair plot.
category_codes = {
    'sex': {'Female': 0, 'Male': 1},
    'smoker': {'No': 0, 'Yes': 1},
    'day': {'Thur': 0, 'Fri': 1, 'Sat': 2, 'Sun': 3},
    'time': {'Lunch': 0, 'Dinner': 1},
}
for column, codes in category_codes.items():
    df[column] = df[column].map(codes).astype(int)
sns.pairplot(data=df)
plt.show()

## PairGrid

In [None]:
# Load seaborn's penguins example dataset.
df = sns.load_dataset("penguins")

# NOTE(review): set_theme changes plotting defaults globally, so it probably
# also overrides the font settings made earlier in the notebook — confirm.
sns.set_theme(style="white")
# Pairwise grid with a different plot type in each region.
g = sns.PairGrid(df, diag_sharey=False)
g.map_upper(sns.scatterplot, s=15)  # scatter plots above the diagonal
g.map_lower(sns.kdeplot)            # KDE plots below the diagonal
g.map_diag(sns.kdeplot, lw=2)       # KDE curves on the diagonal
plt.show()

## 多變數(Multivariate data)分析

In [None]:
# 特徵規模不一致
df = sns.load_dataset('penguins')
df.head(10)

In [None]:
from matplotlib.font_manager import FontProperties
# Use a CJK-capable font for Chinese text. Matplotlib tries the families in
# order and uses the first one installed: Arial Unicode MS (macOS) first,
# then Microsoft JhengHei (Windows).
# NOTE(review): presumably re-applied here because the earlier
# sns.set_theme(...) call overrides the font settings — confirm.
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS', 'Microsoft JhengHei']
# Draw the minus sign as an ASCII hyphen; many CJK fonts lack the Unicode minus glyph.
plt.rcParams['axes.unicode_minus'] = False

In [None]:
# Bill length, flipper length and body mass drawn as box plots in one figure.

df[['bill_length_mm', 'flipper_length_mm', 'body_mass_g']].plot.box(title='多變數(Multivariate data)比較')
plt.show()

### 因數量的單位不同，造成特徵規模不一致，影響特徵的比較。

## 特徵縮放(Feature Scaling)優點：
### 1. 使特徵規模一致，求解收斂速度快
### 2. 提高模型預測準確率

## 作法:
### 1. MinMaxScaler
### 2. StandardScaler(標準化)

In [None]:
# Min-max scaling of the three numeric penguin features.
from sklearn.preprocessing import MinMaxScaler
feature_cols = ['bill_length_mm', 'flipper_length_mm', 'body_mass_g']
scaler = MinMaxScaler()
df2 = df[feature_cols].copy()
scaled_values = scaler.fit_transform(df2)
df2[feature_cols] = scaled_values

# Box plots of the rescaled features.
df2.plot.box(title='多變數(Multivariate data)比較')
plt.show()

In [None]:
# Standardization of the three numeric penguin features.
from sklearn.preprocessing import StandardScaler
numeric_cols = ['bill_length_mm', 'flipper_length_mm', 'body_mass_g']
scaler = StandardScaler()
df2 = df[numeric_cols].copy()
standardized_values = scaler.fit_transform(df2)
df2[numeric_cols] = standardized_values

# Box plots of the standardized features.
df2.plot.box(title='多變數(Multivariate data)比較')
plt.show()

## 準確率比較

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
# Imported here as well so this cell does not depend on an earlier cell having run.
from sklearn.preprocessing import MinMaxScaler

# Keep three numeric features plus the species label, encoded as 0/1/2,
# and drop rows with missing values.
df2 = df[['bill_length_mm', 'flipper_length_mm', 'body_mass_g', 'species']].copy()
df2['species'] = df2['species'].map({'Adelie': 0, 'Chinstrap': 1, 'Gentoo': 2})
df2 = df2.dropna()
X = df2.drop(columns='species')
y = df2['species']

# Train/test split. A fixed random_state makes the reported scores
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

# Show the array shapes. A bare expression in the middle of a cell is not
# displayed, so print it explicitly.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# Model trained on the raw features.
# NOTE(review): LinearRegression treats the 0/1/2 species codes as a numeric
# target; R2 and MSE therefore measure regression fit, not classification accuracy.
print('*** 使用原始資料 ***')
model = LinearRegression()
model.fit(X_train, y_train)
# R2 and MSE on the test set.
y_pred = model.predict(X_test)
print(f'R2 = {r2_score(y_test, y_pred)*100:.4f}')
print(f'MSE = {mean_squared_error(y_test, y_pred):.4f}')

# Model trained on min-max scaled features. The scaler is fitted on the
# training set only and then applied to the test set.
print('\n*** 使用特徵縮放資料 ***')
scaler = MinMaxScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.transform(X_test)
model2 = LinearRegression()
model2.fit(X_train_std, y_train)
# R2 and MSE on the test set.
y_pred = model2.predict(X_test_std)
print(f'R2 = {r2_score(y_test, y_pred)*100:.4f}')
print(f'MSE = {mean_squared_error(y_test, y_pred):.4f}')

### 因特徵數量只有3個，且特徵之間均呈正相關，故特徵縮放並不能提升準確率；此外，線性迴歸以最小平方法求解，其預測結果不受特徵線性縮放影響，因此縮放前後的 R2 與 MSE 幾乎相同。

In [None]:
# Pairwise relationships between the columns of df2.
sns.pairplot(data=df2)
plt.show()

## 手寫阿拉伯數字辨識

In [None]:
# Baseline: train on the pixel values exactly as loaded, without feature scaling.
import tensorflow as tf
mnist = tf.keras.datasets.mnist

# The loaded arrays are used as-is (no division by 255.0 in this cell).
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Flatten each 28x28 image, one hidden layer with dropout, 10-way softmax output.
layers = tf.keras.layers
model = tf.keras.models.Sequential([
    layers.Input((28, 28)),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.2),
    layers.Dense(10, activation='softmax'),
])

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train for 5 epochs, then evaluate on the test split.
model.fit(x_train, y_train, epochs=5)
model.evaluate(x_test, y_test)

In [None]:
# With feature scaling: divide every pixel value by 255.0 before training.
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train = x_train / 255.0
x_test = x_test / 255.0

# Same architecture as the unscaled run, built layer by layer.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Input((28, 28)))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(10, activation='softmax'))

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train for 5 epochs, then evaluate on the test split.
model.fit(x_train, y_train, epochs=5)
model.evaluate(x_test, y_test)

### 特徵數量有784個且差異很大，使用特徵縮放可有效提升準確率。

## 變數之間的關聯(Correlation)
### Pearson相關係數：$\begin{equation}r_{x,y} = \frac{\displaystyle\sum_{i=1}^{n} (x_{i} -\bar{x})(y_{i} -\bar{y})}{\sqrt{\displaystyle\sum_{i=1}^{n} (x_{i} -\bar{x})^{2} \; \displaystyle\sum_{i=1}^{n} (y_{i} -\bar{y})^{2}}}\end{equation}$

In [None]:
# Load the iris data as pandas objects (features X, target y) and preview X.
from sklearn.datasets import load_iris
X, y = load_iris(return_X_y=True, as_frame=True)
X.head(10)

In [None]:
# Combine the features and the target into one frame; the target goes in column 'y'.
df = X.assign(y=y)

In [None]:
# Pairwise correlation coefficients between all columns.
df.corr()

In [None]:
# Draw the correlation matrix as a heat map.
import seaborn as sns
sns.heatmap(df.corr())
plt.show()

In [None]:
# Take absolute values so colour reflects the strength of the relationship
# regardless of its sign.
import seaborn as sns
sns.heatmap(df.corr().abs())
plt.show()

In [None]:
import numpy as np
# Compute the correlation matrix once and reuse it for the mask and the plot.
corr_matrix = df.corr()
# Hide the upper triangle (diagonal included): it only mirrors the lower one.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(corr_matrix.abs(), square=True, mask=mask, annot=True, fmt=".2f", center=0)
plt.show()