# Chapter 3

### 產生資料框

In [3]:
import pandas as pd 
df = pd.DataFrame()
df["Name"] = ["Shao", "Zhao"]
df["Age"] = [23, 24]
df["Driver"]= [True, False]
df

Unnamed: 0,Name,Age,Driver
0,Shao,23,True
1,Zhao,24,False


### 現在要新增一行進去

In [46]:
new_ppl = pd.Series(['Hong', 24, True], index=df.columns)
df = df.append(new_ppl, ignore_index=True)
df

Unnamed: 0,Name,Age,Driver
0,Shao,23,True
1,Zhao,24,False
2,Hong,24,True


### 資料的描述

In [47]:
df.shape

(3, 3)

In [48]:
df.describe()

Unnamed: 0,Age
count,3.0
mean,23.666667
std,0.57735
min,23.0
25%,23.5
50%,24.0
75%,24.0
max,24.0


### 資料的瀏覽

In [49]:
df.iloc[0] # according to dataframe

Name      Shao
Age         23
Driver    True
Name: 0, dtype: object

In [50]:
df.loc['Shao'] # according to index

KeyError: 'Shao'

### 將某一列設成索引

In [51]:
df = df.set_index(df["Name"])
df

Unnamed: 0_level_0,Name,Age,Driver
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Shao,Shao,23,True
Zhao,Zhao,24,False
Hong,Hong,24,True


### Reset index

In [52]:
df = df.reset_index(drop=True)
df

Unnamed: 0,Name,Age,Driver
0,Shao,23,True
1,Zhao,24,False
2,Hong,24,True


### 依條件選取資料列

In [53]:
df[(df["Age"]==23) & (df["Driver"]==True)]

Unnamed: 0,Name,Age,Driver
0,Shao,23,True


### 值的替換

In [58]:
df['Age'] = df['Age'].replace(23, 24)
# df = df.replace(23, 24) 全部都改
# 接受regex regex=True

In [59]:
df

Unnamed: 0,Name,Age,Driver
0,Shao,24,True
1,Zhao,24,False
2,Hong,24,True


### 更改行名

In [61]:
df.rename(columns={"Name":"nickname"})

Unnamed: 0,nickname,Age,Driver
0,Shao,24,True
1,Zhao,24,False
2,Hong,24,True


### columns to dict

In [66]:
import collections
# 產生字典
columns_names = collections.defaultdict(str)
for name in df.columns:
    columns_names[name]

In [67]:
columns_names

defaultdict(str, {'Name': '', 'Age': '', 'Driver': ''})

### 找出最小值最大值總和平均與數目

In [71]:
df['Age'].max()
df['Age'].min()
df['Age'].mean()
df['Age'].sum()
df['Age'].count()
df.count()

Name      3
Age       3
Driver    3
dtype: int64

### 搜尋獨特值加上計數

In [75]:
df["Name"].unique()

array(['Shao', 'Zhao', 'Hong'], dtype=object)

In [77]:
df["Name"].nunique()

3

In [76]:
df["Name"].value_counts()

Hong    1
Zhao    1
Shao    1
Name: Name, dtype: int64

### 刪除某row

In [127]:
df = df.drop(df.index[3])

### (更好的方法)利用條件加上賦值刪除某row

In [144]:
# 把demi刪除
df[df["Name"]!="Demi"]

In [146]:
# 指定index
df[df.index!=0]

Unnamed: 0,Name,Age,Driver
1,Zhao,24.0,0.0
2,Hong,24.0,1.0


### 刪除某col

In [140]:
df.drop('Age', axis=1)

Unnamed: 0,Name,Driver
0,Shao,1.0
1,Zhao,0.0
2,Hong,1.0
3,Demi,0.0


In [142]:
df.drop(df.columns[0], axis=1)

Unnamed: 0,Age,Driver
0,24.0,1.0
1,24.0,0.0
2,24.0,1.0
3,23.0,0.0


### 缺漏值處理

In [128]:
import numpy as np
new_ppl = pd.Series(["Demi", 23, np.nan], index=df.columns)
df = df.append(new_ppl, ignore_index=True)

In [131]:
df

Unnamed: 0,Name,Age,Driver
0,Shao,24.0,1.0
1,Zhao,24.0,0.0
2,Hong,24.0,1.0
3,Demi,23.0,


In [133]:
df[df["Driver"].isnull()]

Unnamed: 0,Name,Age,Driver
3,Demi,23.0,


In [137]:
df.loc[df["Driver"].isnull(), "Driver"] = 0.0

In [138]:
df

Unnamed: 0,Name,Age,Driver
0,Shao,24.0,1.0
1,Zhao,24.0,0.0
2,Hong,24.0,1.0
3,Demi,23.0,0.0


In [139]:
# 直接在read_csv時就處理
df = pd.read_csv(url, na_value=[np.nan, 'NONE', -999])

In [None]:
# 書中不建議使用inplace, del等方法來處理資料

### 丟棄重複資料列(for all)

In [149]:
new_ppl = pd.Series(["Demi", 23, np.nan], index=df.columns)
df = df.append(new_ppl, ignore_index=True)
new_ppl = pd.Series(["Demi", 23, np.nan], index=df.columns)
df = df.append(new_ppl, ignore_index=True)

In [150]:
df

Unnamed: 0,Name,Age,Driver
0,Shao,24.0,1.0
1,Zhao,24.0,0.0
2,Hong,24.0,1.0
3,Demi,23.0,
4,Demi,23.0,


In [155]:
df = df.drop_duplicates()
df

Unnamed: 0,Name,Age,Driver
0,Shao,24.0,1.0
1,Zhao,24.0,0.0
2,Hong,24.0,1.0
3,Demi,23.0,


### 丟棄重複資料列(for cols)

In [157]:
# keep first match
df.drop_duplicates(subset=["Age"])

Unnamed: 0,Name,Age,Driver
0,Shao,24.0,1.0
3,Demi,23.0,


### Groupby

In [165]:
df.groupby("Driver").mean() #後面method要給aggregate stastics

Unnamed: 0_level_0,Age
Driver,Unnamed: 1_level_1
0.0,24.0
1.0,24.0


In [167]:
df.groupby("Driver")["Name"].count()

Driver
0.0    1
1.0    2
Name: Name, dtype: int64

In [170]:
df

Unnamed: 0,Name,Age,Driver
0,Shao,24.0,1.0
1,Zhao,24.0,0.0
2,Hong,24.0,1.0
3,Demi,23.0,


In [169]:
# 兩組的小範例
df.groupby(["Driver", "Age"])["Name"].count()

Driver  Age 
0.0     24.0    1
1.0     24.0    2
Name: Name, dtype: int64

### Time periods

In [10]:
import pandas as pd
import numpy as np
time_index = pd.date_range('06/06/2017', periods=100000, freq='30S')
time_index

DatetimeIndex(['2017-06-06 00:00:00', '2017-06-06 00:00:30',
               '2017-06-06 00:01:00', '2017-06-06 00:01:30',
               '2017-06-06 00:02:00', '2017-06-06 00:02:30',
               '2017-06-06 00:03:00', '2017-06-06 00:03:30',
               '2017-06-06 00:04:00', '2017-06-06 00:04:30',
               ...
               '2017-07-10 17:15:00', '2017-07-10 17:15:30',
               '2017-07-10 17:16:00', '2017-07-10 17:16:30',
               '2017-07-10 17:17:00', '2017-07-10 17:17:30',
               '2017-07-10 17:18:00', '2017-07-10 17:18:30',
               '2017-07-10 17:19:00', '2017-07-10 17:19:30'],
              dtype='datetime64[ns]', length=100000, freq='30S')

In [11]:
time_df = pd.DataFrame(index=time_index)

In [12]:
time_df["Sales"] = np.random.randint(1, 10, 100000)
time_df

Unnamed: 0,Sales
2017-06-06 00:00:00,5
2017-06-06 00:00:30,9
2017-06-06 00:01:00,7
2017-06-06 00:01:30,4
2017-06-06 00:02:00,8
...,...
2017-07-10 17:17:30,4
2017-07-10 17:18:00,1
2017-07-10 17:18:30,7
2017-07-10 17:19:00,2


In [13]:
# 以周為總和(must be datetime-like)
time_df.resample('W').sum()
time_df.resample('W').mean()

Unnamed: 0,Sales
2017-06-11,4.98287
2017-06-18,4.974653
2017-06-25,5.015278
2017-07-02,4.98254
2017-07-09,4.983085
2017-07-16,5.086058


In [14]:
time_df.resample('M').count()

Unnamed: 0,Sales
2017-06-30,72000
2017-07-31,28000


In [15]:
time_df.resample('M', label='left').count()

Unnamed: 0,Sales
2017-05-31,72000
2017-06-30,28000


### 在行上迭代

In [17]:
for name in df["Name"][:]:
    print(name.upper())

SHAO
ZHAO


In [18]:
[name.upper() for name in df["Name"][:]]

['SHAO', 'ZHAO']

### 在行中每一個元素套用函式

In [19]:
def uppercase(x):
    return x.upper()

In [20]:
df["Name"].apply(uppercase)[0:2]

0    SHAO
1    ZHAO
Name: Name, dtype: object

### 在分組上套用函式

In [21]:
df.groupby('Driver').apply(lambda x:x.count())

Unnamed: 0_level_0,Name,Age,Driver
Driver,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,1,1,1
True,1,1,1


### 串接資料框(concatenate)

In [4]:
df_a = pd.DataFrame({'id':['1', '2','3'], 'name':['shao', 'zhao', 'hong'],'age':['23', '23', '24']}, columns=['id', 'name', 'age'])
df_b = pd.DataFrame({'id':['4'], 'name':['demi'],'age':['23']}, columns=['id', 'name', 'age'])

In [5]:
pd.concat([df_a, df_b],axis=0) #也可以用APPEND

Unnamed: 0,id,name,age
0,1,shao,23
1,2,zhao,23
2,3,hong,24
0,4,demi,23


In [6]:
pd.concat([df_a, df_b], axis=1)

Unnamed: 0,id,name,age,id.1,name.1,age.1
0,1,shao,23,4.0,demi,23.0
1,2,zhao,23,,,
2,3,hong,24,,,


### 合併資料框(merge)

In [7]:
df_c = pd.DataFrame({'id':['2', '3', '4'], 'level':[160, 120, 80]}, columns=['id', 'level'])

In [8]:
df_c

Unnamed: 0,id,level
0,2,160
1,3,120
2,4,80


In [9]:
df_a

Unnamed: 0,id,name,age
0,1,shao,23
1,2,zhao,23
2,3,hong,24


In [10]:
pd.merge(df_a, df_c, on='id')

Unnamed: 0,id,name,age,level
0,2,zhao,23,160
1,3,hong,24,120


In [11]:
pd.merge(df_a, df_c , on='id', how='outer')

Unnamed: 0,id,name,age,level
0,1,shao,23.0,
1,2,zhao,23.0,160.0
2,3,hong,24.0,120.0
3,4,,,80.0


In [12]:
pd.merge(df_a, df_c , on='id', how='left')

Unnamed: 0,id,name,age,level
0,1,shao,23,
1,2,zhao,23,160.0
2,3,hong,24,120.0


In [13]:
pd.merge(df_a, df_c , on='id', how='outer')

Unnamed: 0,id,name,age,level
0,1,shao,23.0,
1,2,zhao,23.0,160.0
2,3,hong,24.0,120.0
3,4,,,80.0


In [14]:
pd.merge(df_a, df_c, left_on='id', right_on='id')
# 若兩行名稱相同用on, 不同則用left_on, right_on

Unnamed: 0,id,name,age,level
0,2,zhao,23,160
1,3,hong,24,120


# Chapter 4

### 特徵縮放

In [16]:
import numpy as np
from sklearn import preprocessing
feature = np.array([[-500.5], [-100.1], [0], [100.1], [900.9]])

In [19]:
# 產生縮放器
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
# Fit -> Transform
scaled_feature = minmax_scale.fit_transform(feature)
scaled_feature

array([[0.        ],
       [0.28571429],
       [0.35714286],
       [0.42857143],
       [1.        ]])

公式: Xi = (Xi-min(X))/max(X)-min(X)

### 特徵標準化(Z-score)

In [20]:
scaler = preprocessing.StandardScaler()
x = np.array([[-1000.1], [-200.2], [500.5], [600.6], [9000.9]])
standardized = scaler.fit_transform(x)
standardized

array([[-0.76058269],
       [-0.54177196],
       [-0.35009716],
       [-0.32271504],
       [ 1.97516685]])

*Note:通常標準化用的多，但大多用在哪取決於學習演算法，例如PCA用標準化比較好，但是類神經網路用min-max縮放比較好，建議預設先使用標準化*

### RobustScaler(若有離群值嚴重的情形，使用中位數到四分位數的範圍)

In [23]:
robust_scaler = preprocessing.RobustScaler()
robust_scaler.fit_transform(x)

array([[-1.87387612],
       [-0.875     ],
       [ 0.        ],
       [ 0.125     ],
       [10.61488511]])

### 觀察值正規化

In [30]:
features = np.array([[0.5, 0.5], [1.1, 3.4], [1.5, 20.2], [1.63, 34.4], [10.9, 3.3]])

In [34]:
normalizer_l1 = preprocessing.Normalizer(norm="l1") # l1 manhattan norm
normalizer_l1.transform(features)

array([[0.5       , 0.5       ],
       [0.24444444, 0.75555556],
       [0.06912442, 0.93087558],
       [0.04524008, 0.95475992],
       [0.76760563, 0.23239437]])

In [35]:
normalizer_l2 = preprocessing.Normalizer(norm="l2") # l2 euclidean norm
normalizer_l2.transform(features)

array([[0.70710678, 0.70710678],
       [0.30782029, 0.95144452],
       [0.07405353, 0.99725427],
       [0.04733062, 0.99887928],
       [0.95709822, 0.28976368]])

### 多項式與互動特徵

In [44]:
features = np.array([[2, 3], [2, 3], [2, 3]])
polynomial_interaction = preprocessing.PolynomialFeatures(degree=2, include_bias=False)
polynomial_interaction.fit_transform(features)
# x, y, xy, x^2, y^2 用作產生新特徵

array([[2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.],
       [2., 3., 4., 6., 9.]])

In [48]:
polynomial_interaction = preprocessing.PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
polynomial_interaction.fit_transform(features)
# x, y, xy (dependent)

array([[2., 3., 6.],
       [2., 3., 6.],
       [2., 3., 6.]])

### 自訂特徵轉換

In [50]:
features = np.array([[2, 3], [2, 3], [2, 3]])
def add_ten(x):
    return x + 10
ten_transformer = preprocessing.FunctionTransformer(add_ten)
ten_transformer.transform(features)



array([[12, 13],
       [12, 13],
       [12, 13]])

In [52]:
# 或是用DataFrame做
import pandas as pd
df = pd.DataFrame(features)
df.apply(add_ten)

Unnamed: 0,0,1
0,12,13
1,12,13
2,12,13


### 離群值偵測

In [53]:
import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs

In [54]:
# 模擬資料
features, _ = make_blobs(n_samples=10, n_features=2, centers=1, random_state=1)
features[0, 0] = 10000
features[0, 1] = 10000

In [55]:
# 離群偵測器
outlier_detector = EllipticEnvelope(contamination=.1) # 離群值的比例
outlier_detector.fit(features)

EllipticEnvelope(assume_centered=False, contamination=0.1, random_state=None,
                 store_precision=True, support_fraction=None)

In [56]:
# 預設離群值
outlier_detector.predict(features)

array([-1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

In [58]:
feature = features[:, 0]
def indicies_of_outliers(x):
    q1, q3 = np.percentile(x, [25, 75])
    iqr = q3-q1
    lower_bound = q1-iqr*1.5
    upper_bound = q3+iqr*1.5
    return np.where((x>upper_bound)|(x<lower_bound))

In [59]:
indicies_of_outliers(feature)

(array([0], dtype=int64),)

### 離群值處理

In [60]:
import pandas as pd
houses = pd.DataFrame()
houses["Price"] = [534433, 392333, 293222, 4322032]
houses["Bathrooms"] = [2, 3.5, 2, 116]
houses["Square_Feet"] = [1500, 2500, 1500, 4800]

In [62]:
houses[houses["Bathrooms"]<20]

Unnamed: 0,Price,Bathrooms,Square_Feet
0,534433,2.0,1500
1,392333,3.5,2500
2,293222,2.0,1500


In [63]:
# 加入離群值特徵(*)
houses["Outlier"] = np.where(houses["Bathrooms"]<20, 0, 1)

In [69]:
# 取log 降低離群值影響
houses["Log_Of_Square_Feet"] = [np.log(x) for x in houses["Square_Feet"]]
houses

Unnamed: 0,Price,Bathrooms,Square_Feet,Outlier,Log_Of_Square_Feet
0,534433,2.0,1500,0,7.31322
1,392333,3.5,2500,0,7.824046
2,293222,2.0,1500,0,7.31322
3,4322032,116.0,4800,1,8.476371


### 特徵離散化

In [70]:
age = np.array([[6], [12], [20], [36], [65]])

In [71]:
# 二元特徵
binarizer = preprocessing.Binarizer(18)
binarizer.fit_transform(age)

In [73]:
# 區間特徵
np.digitize(age, bins=[20, 30, 64])

array([[0],
       [0],
       [1],
       [2],
       [3]], dtype=int64)

In [74]:
np.digitize(age, bins=[20, 30, 64], right=True) # <=包含20

array([[0],
       [0],
       [0],
       [2],
       [3]], dtype=int64)

In [75]:
np.digitize(age, bins=[18]) # 當二元在用

array([[0],
       [0],
       [1],
       [1],
       [1]], dtype=int64)

### 分群法+觀察分組

In [77]:
import pandas as pd
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

In [80]:
features, _ = make_blobs(n_samples=50, n_features=2, centers=3, random_state=1)
dataframe = pd.DataFrame(features, columns=["feature_1", "feature_2"])

In [81]:
clusterer = KMeans(3, random_state=0)
clusterer.fit(features) # 代特徵進去擬合
dataframe["groups"] = clusterer.predict(features)

### 刪除有缺漏值的觀察(???)

In [185]:
features = np.array([[1.1, 11.1], 
                                 [2.2, 22,2], 
                                 [3.3, 33.3], 
                                 [4.4, 44.4], 
                                 [np.nan, 55.5]])

In [191]:
test2 = [[1.1, 11.1], 
                                 [2.2, 22,2], 
                                 [3.3, 33.3], 
                                 [4.4, 44.4], 
                                 [np.nan, 55.5]]

In [197]:
test2 = [[1.1, 11.1], 
                                 [2.2, 22,2], 
                                 [3.3, 33.3], 
                                 [4.4, 44.4], 
                                 [np.nan, 55.5]]

In [198]:
np.array(test2)

array([list([1.1, 11.1]), list([2.2, 22, 2]), list([3.3, 33.3]),
       list([4.4, 44.4]), list([nan, 55.5])], dtype=object)

In [193]:
features.shape

(2,)

In [175]:
test =[ [1,1]]

In [178]:
np.array([test]).shape

(1, 2, 2)

In [176]:
test.append([2,2])

In [177]:
test

[[1, 1], [2, 2]]

In [184]:
np.array(test)

array([[1, 1],
       [2, 2]])

In [159]:
print(pd.isna(features)

(5,)


### 缺漏值推算 (By KNN)

In [134]:
import numpy as np
from fancyimpute import KNN
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs

Using TensorFlow backend.


In [137]:
features, _ = make_blobs(n_samples =1000, n_features=2, random_state=1)
scaler = StandardScaler()
standardized_features = scaler.fit_transform(features)

true_value = standardized_features[0, 0]
standardized_features[0, 0] = np.nan

In [139]:
features_knn_imputed = KNN(k=5, verbose=0).fit_transform(standardized_features)
print("True Value:", true_value)
print("Imputed Value:", features_knn_imputed[0, 0])

True Value: 0.8730186113995938
Imputed Value: 1.0955332713113226


### 缺漏值推算(By Imputer)

In [149]:
from sklearn.preprocessing import Imputer
mean_imputer = Imputer(strategy="mean", axis=0)
features_mean_imputed = mean_imputer.fit_transform(features)
print("True Value:", true_value)
print("Imputed Value:", features_mean_imputed[0, 0])

True Value: 0.8730186113995938
Imputed Value: -3.058372724614996


