# correlation coefficient

In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Read the training set from a CSV file (path is relative to the working directory).
data_path = './data/house_train.csv'
df_train = pd.read_csv(filepath_or_buffer=data_path)
# Rich display of the loaded frame.
display(df_train)

In [None]:
# Pairwise correlation of the numeric (and boolean) columns only.
# pandas >= 2.0 no longer drops text columns silently inside DataFrame.corr()
# and raises on them instead, so the numeric subset is selected explicitly.
corr = df_train.select_dtypes(include=['number', 'bool']).corr()
display(corr)

In [None]:
# Last row of the correlation matrix (presumably the prediction target -- confirm
# against the CSV column order), with the 'index' entry removed.
corr_drop = corr.iloc[-1].drop(labels=['index'])
# Names of the columns whose correlation magnitude exceeds 0.03.
corr_drop[corr_drop.abs() > 0.03].index

# Label encoding & One hot encoding

In [None]:
# Remember to drop the unit-price column ('單價(元/平方公尺)') when it is present.
if '單價(元/平方公尺)' in df_train:
    df_train = df_train.drop(columns=['單價(元/平方公尺)'])


# Split the columns by dtype: int64 / float64 columns go to num_features,
# every other column goes to notnum_features.
numeric_dtypes = ('float64', 'int64')
num_features = [name for name, kind in df_train.dtypes.items() if kind in numeric_dtypes]
notnum_features = [name for name, kind in df_train.dtypes.items() if kind not in numeric_dtypes]

print(f'lehgth of all featrues : {len(df_train.columns)}\n')
print(f'length of Numeric Features : {len(num_features)}\n Numeric Features : {num_features}\n')
print(f'length of Not Numeric Features : {len(notnum_features)}\n Not Numeric Features : {notnum_features}')

# Keep only the numeric columns (text columns are left out).
df_train_num = df_train[num_features]
df_train_num.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Missing values must be replaced first; every NaN in the frame becomes the text 'None'.
df_train = df_train.fillna('None')
# Encode each non-numeric column with its own, freshly fitted LabelEncoder.
df_train_le = pd.DataFrame(
    {col: LabelEncoder().fit_transform(df_train[col]) for col in notnum_features}
)
df_train_le.head()

In [None]:
# Independent copy holding just the non-numeric columns.
df_train_temp = df_train[notnum_features].copy()
# One-hot encode those columns with pandas.
df_train_one = pd.get_dummies(df_train_temp)
df_train_one.head()

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the two categorical columns with scikit-learn.
onehot_matrix = OneHotEncoder().fit_transform(df_train_temp.loc[:, ['主要建材', '主要用途']])
# .toarray() yields a plain numpy array, which has neither .head() nor .iloc;
# wrap it in a DataFrame (aligned to the source rows) so the pandas calls
# in this cell and in later cells work.
df_train_one_sk = pd.DataFrame(onehot_matrix.toarray(), index=df_train_temp.index)
df_train_one_sk.head()

In [None]:
# Compare the first five rows: the second column of df_train, the pandas
# one-hot frame, and the scikit-learn one-hot result.
display(df_train.iloc[:5,1])
display(df_train_one.iloc[:5,:18])
# pd.DataFrame(...) accepts both a numpy array and an existing DataFrame, so
# .iloc works regardless of which one df_train_one_sk holds.
display(pd.DataFrame(df_train_one_sk).iloc[:5,:18])

# 時間特徵

#### 時間特徵拆解

In [None]:
import datetime

# Current local date and time.
now = datetime.datetime.now()
print(now,type(now))
# Format the year component as text.
year = now.strftime("%Y")
print("year:", year)

In [None]:
# Parse a timestamp string into a datetime object using an explicit format.
time_text = '2017-08-12 00:00:00'
time_format = '%Y-%m-%d %H:%M:%S'
str_time = datetime.datetime.strptime(time_text, time_format)
print(str_time)
print(type(str_time))

In [None]:
# Set up everything the time-feature engineering below needs.
import pandas as pd
import numpy as np
import datetime

# Parse the transaction timestamp strings ('交易年月日') in one vectorized call,
# with the same explicit format the strings are expected to follow.
df_train_time = pd.DataFrame()
df_train_time['transaction_datetime'] = pd.to_datetime(
    df_train['交易年月日'], format='%Y-%m-%d %H:%M:%S'
)
# Split the timestamp into its components through the .dt accessor instead of
# a per-row strftime round-trip. The cast keeps every component column int64,
# because newer pandas versions return int32 from the .dt fields.
for unit in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    df_train_time[f'transaction_{unit}'] = getattr(
        df_train_time['transaction_datetime'].dt, unit
    ).astype('int64')
df_train_time.head()

#### 時間單位整合

In [None]:
import math

# Combine month and day into a single value: month / 6 plus day / 180.
month_part = df_train_time['transaction_month'] / 6
day_part = df_train_time['transaction_day'] / 180
df_train_time['year_cycle_half'] = month_part + day_part
df_train_time.head()

#### 週期特徵

In [None]:
# Replace the combined value with cos(value * pi) to get a periodic feature.
# NOTE: the column is overwritten in place, so re-running this cell applies the
# cosine a second time.
df_train_time['year_cycle_half'] = df_train_time['year_cycle_half'].map(
    lambda value: math.cos(value * math.pi)
)
df_train_time.head()

# 遺失值處理

#### Detect in Series

In [None]:
# A small Series holding two missing entries (np.nan and None).
data = pd.Series([1, np.nan, 3, None])
missing_mask = data.isnull()
long_rule = '-' * 50

print(data)
print(long_rule)
# Element-wise missing flags.
print(missing_mask)
print('-' * 10)
# True when at least one element is missing.
print(missing_mask.any())
print(long_rule)

#### Detect in DataFrame
<details>
    <summary>axis image 1</summary>
    <img src="./img/2D_axis_1.jpg">
</details>
<details>
    <summary>axis image 2</summary>
    <img src="./img/2D_axis.jpg">
</details>

In [None]:
# Random 1000 x 100 frame; every value above 0.9 is blanked out to create missing data.
df = pd.DataFrame(np.random.randn(1000, 100))
# Use numpy's nan directly: the pd.np alias was removed in pandas 2.0.
df[df > 0.9] = np.nan
display(df)

# .any() defaults to axis='index' (axis=0): reports which columns contain a missing value.
print(df.isnull().any())
print('-'*50)
# axis=1: reports which rows contain a missing value.
print(df.isnull().any(axis=1))
print('-'*50)
# A single flag: does the frame contain any missing value at all?
print(df.isnull().any().any())

#### 去除遺失值

In [None]:
# Load the training CSV again so df_train is back to its unmodified contents.
data_path = './data/house_train.csv'
df_train = pd.read_csv(filepath_or_buffer=data_path)

# Inspect the '主要建材' column: its values, its length, and whether anything is missing.
material = df_train['主要建材']
display(material)
print('-' * 50)
print(len(material), material.isnull().any().any())

In [None]:
# Two equivalent ways of removing the missing entries from the '主要建材' column:
# a boolean mask built with notnull(), and dropna().
material = df_train['主要建材']
Seires_drop_1 = material[material.notnull()]
Seires_drop_2 = material.dropna()

# Print each result with its length and a check that nothing missing remains.
for dropped in (Seires_drop_1, Seires_drop_2):
    print(dropped)
    print('_' * 10)
    print(len(dropped), dropped.isnull().any().any())
    print('_' * 50)

#### dropna()的參數 how

In [None]:
# Toy frame: column 'd' is entirely missing, 'a' and 'b' are partly missing, 'c' is complete.
rows = [[1,      np.nan, 2, np.nan],
        [2,      3,      5, np.nan],
        [np.nan, 4,      6, np.nan]]
df = pd.DataFrame(rows, columns=['a', 'b', 'c', 'd'])

# how='all' drops a column only when every value is missing;
# how='any' drops a column as soon as one value is missing.
df_drop_all = df.dropna(axis='columns', how='all')
df_drop_any = df.dropna(axis='columns', how='any')

for frame in (df, df_drop_all, df_drop_any):
    display(frame)

#### 填補遺失值

In [None]:
# Replace every missing entry of the '主要建材' column with a placeholder label.
material = df_train['主要建材']
Series_fill = material.fillna(value='沒紀錄')
display(Series_fill)

#### fillna()的參數 method

In [None]:
# Toy frame with missing values, used to compare forward and backward filling.
df = pd.DataFrame([[1,      np.nan, 2, np.nan],
                   [2,      3,      5, np.nan],
                   [np.nan, 4,      6, np.nan]],columns=['a','b','c','d'])
display(df)
# DataFrame.ffill() / DataFrame.bfill() replace fillna(method=...), whose
# `method` argument is deprecated in recent pandas releases.
display(df.ffill())        # forward fill down each column (default axis=0)
display(df.ffill(axis=1))  # forward fill across each row
display(df.bfill())        # backward fill up each column