# 百日馬拉松-學習精華

### Day004 : EDA/讀取資料與分析流程
- The exploratory data analysis (EDA) : 運用視覺化、基本的統計等工具，來「看」一下資料；以期進行複雜或嚴謹的分析之前，能夠對資料有更多的認識。
- app_train.iloc[:10, 0:5] # 前 10 row 以及前 5 個 column

In [1]:
# 用 python擷取 csv檔
import os
import numpy as np
import pandas as pd
# Build the path to the training CSV inside the data directory
dir_data = './data/'
csv_name = 'application_train.csv'
f_app = os.path.join(dir_data, csv_name)
print('Path of read in data: %s' % f_app)

# app_train holds the contents of the CSV as a DataFrame
app_train = pd.read_csv(f_app)

Path of read in data: ./data/application_train.csv


### Day005 : 如何新建一個 dataframe? 如何讀取其他資料? (非 csv 的資料)
- 方法一: 用 pd.DataFrame (將 dict轉成DataFrame)
- 方法二: 用 zip  (將多個 list轉成DataFrame)
- numpy.random.rand() 產生隨機數組 [python產生隨機數列表](https://blog.csdn.net/christianashannon/article/details/78867204)
- 如何讀取其他資料? (非 csv 的資料) requests、re

In [2]:
# Method 1: build a DataFrame directly from a dict.
# Each key becomes a column name and each list becomes that column's values.
data = dict(
    weekday=['Sun', 'Sun', 'Mon', 'Mon'],
    city=['Austin', 'Dallas', 'Austin', 'Dallas'],
    visitor=[139, 237, 326, 456],
)
visitors_1 = pd.DataFrame(data=data)
print(visitors_1)

  weekday    city  visitor
0     Sun  Austin      139
1     Sun  Dallas      237
2     Mon  Austin      326
3     Mon  Dallas      456


In [7]:
# Method 2: pair each column label with its list of values via zip,
# then turn the pairs into a dict that pd.DataFrame can consume.
list_labels = ['city', 'weekday', 'visitor']

cities = ['Austin', 'Dallas', 'Austin', 'Dallas']
weekdays = ['Sun', 'Sun', 'Mon', 'Mon']
visitors = [139, 237, 326, 456]
list_cols_content = [cities, weekdays, visitors]

# Each element of zipped is a (label, column_values) tuple
zipped = list(zip(list_labels, list_cols_content))
visitors_2 = pd.DataFrame({label: values for label, values in zipped})
print(visitors_2)

     city weekday  visitor
0  Austin     Sun      139
1  Dallas     Sun      237
2  Austin     Mon      326
3  Dallas     Mon      456


In [4]:
# 將網路上的資料url抓進python使用
import re
import requests
target_url = "https://raw.githubusercontent.com/vashineyu/slides_and_others/master/tutorial/examples/imagenet_urls_examples.txt"

# The request itself is the step that can fail (DNS error, refused connection,
# timeout), so that is what the try block has to guard. A status-code
# comparison never raises, so wrapping only that comparison would leave the
# "no response" branch unreachable.
try:
    response = requests.get(target_url, timeout=10)
except requests.exceptions.RequestException:
    print("伺服器無回應")
    raise  # nothing below can work without a response object

# Status code returned by the server (200 means the request succeeded)
print(response.status_code)

# Confirm the status code is OK
if response.status_code == requests.codes.ok:
    print("OK")

data = response.text  # <class 'str'>
# The payload is one long string: line breaks are still literal "\n"
# characters and have not been split into separate lines yet.
print(len(data))
data[0:100]

200
OK
784594


'n00015388_157\thttp://farm1.static.flickr.com/145/430300483_21e993670c.jpg\nn00015388_238\thttp://farm2'

In [5]:
# 將 txt 轉成 pandas dataframe
import pandas as pd
import requests
import re
target_url = 'https://raw.githubusercontent.com/vashineyu/slides_and_others/master/tutorial/examples/imagenet_urls_examples.txt'
# A timeout keeps the cell from hanging forever if the server stops responding
response = requests.get(target_url, timeout=10)
data = response.text
data = data.split("\n")  # break the payload into lines with the built-in str.split

# Each line has the form "<image id>\t<url>", so splitting on the tab keeps
# both fields intact. (An earlier regex-based attempt to strip the id prefix
# was abandoned because it also removed part of the URL.)
# Blank lines - e.g. the empty string left behind by a trailing newline - are
# skipped so they do not turn into rows with an empty id and a missing URL.
arrange_data = [d.split("\t") for d in data if d.strip()]

# arrange_data is a list of [id, url] lists; pandas turns it into two columns
df = pd.DataFrame(arrange_data)
df.head()

Unnamed: 0,0,1
0,n00015388_157,http://farm1.static.flickr.com/145/430300483_2...
1,n00015388_238,http://farm2.static.flickr.com/1005/3352960681...
2,n00015388_304,http://farm1.static.flickr.com/27/51009336_a96...
3,n00015388_327,http://farm4.static.flickr.com/3025/2444687979...
4,n00015388_355,http://img100.imageshack.us/img100/3253/forres...


In [6]:
# 讀取圖片，請讀取上面 data frame 中的前 5 張圖片
from PIL import Image
from io import BytesIO
import numpy as np
import matplotlib.pyplot as plt

def img2arr_fromURLs(url_list, resize = False, timeout = 10):
    """Download images and return them as a list of NumPy arrays.

    Args:
        url_list: iterable of image URLs.
        resize: when truthy, every image is resized to 256 x 256 before it is
            converted to an array.
        timeout: seconds to wait for each HTTP request before giving up on
            that URL.

    Returns:
        A list with one ``np.ndarray`` per URL that could be downloaded and
        decoded, in input order. URLs that fail are skipped, so the list may
        be shorter than ``url_list``.
    """
    img_list = []
    for url in url_list:
        try:
            # The request lives inside the try block so that one dead link
            # (connection error, timeout, malformed URL) does not abort the
            # whole batch.
            response = requests.get(url, timeout=timeout)
            # Turn 4xx/5xx answers into an exception instead of trying to
            # decode an error page as an image.
            response.raise_for_status()
            img = Image.open(BytesIO(response.content))
            if resize:
                img = img.resize((256, 256))  # normalise every image to 256 x 256
            img_list.append(np.array(img))
        except (requests.RequestException, OSError, ValueError):
            # Best effort: skip links that are unreachable or whose content
            # cannot be decoded as an image. Only download/decoding failures
            # are caught, so KeyboardInterrupt and programming errors still
            # surface.
            continue
    return img_list

import requests
# Fetch the images behind the first five rows of the URL column (column 1 of df)
first_five_urls = df[0:5][1].values
result = img2arr_fromURLs(first_five_urls)
# Anything other than 5 means some of the links could not be retrieved
print("Total images that we got: %i " % len(result))

# Show every retrieved image in its own figure
for im_get in result:
    plt.imshow(im_get)
    plt.show()

Total images that we got: 4 


<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>

In [None]:
# Plain-text file (txt): read every line into a list of strings.
# String literals need straight ASCII quotes; typographic quotes are a SyntaxError.
with open('example.txt', 'r') as f:
    data = f.readlines()
print(data)

In [None]:
# 讀取Json
import json
# Parse the JSON document into native Python objects.
# String literals need straight ASCII quotes; typographic quotes are a SyntaxError.
with open('example.json', 'r') as f:
    data = json.load(f)
print(data)

In [None]:
# 矩陣檔(mat)
import scipy.io as sio
# Load the MATLAB .mat file (straight ASCII quotes are required around the file name)
data = sio.loadmat('example.mat')


### Day006 : EDA: 欄位的資料類型介紹及處理
- Label encoding : 把每個類別 mapping 到某個整數，不會增加新欄位。(適用於原始資料是有序離散值)
- One hot encoding : 為每個類別新增一個欄位，用 0/1 表示是否屬於該類別。(適用於原始資料是無序離散值)
- [初學Python手記#3-資料前處理( Label encoding、 One hot encoding)](https://medium.com/@PatHuang/%E5%88%9D%E5%AD%B8python%E6%89%8B%E8%A8%98-3-%E8%B3%87%E6%96%99%E5%89%8D%E8%99%95%E7%90%86-label-encoding-one-hot-encoding-85c983d63f87)
- [Label Encoder vs. One Hot Encoder in Machine Learning](https://medium.com/@contactsunny/label-encoder-vs-one-hot-encoder-in-machine-learning-3fc273365621)
- DataFrame如何檢測欄位的型態數量、各欄型態

- Label encoding=> 對要編碼的欄位呼叫 `fit_transform`，回傳每個類別對應的整數編碼：

  ```python
  from sklearn.preprocessing import LabelEncoder

  le = LabelEncoder()
  A = le.fit_transform(df[col])
  ```
- One hot encoding=> `pd.get_dummies(..., drop_first=True)`：`drop_first=True` 會去掉每個類別欄位中第一個類別所對應的 dummy 欄位 (k 個類別只產生 k-1 個欄位)，避免建模時產生完全共線性 (dummy variable trap)。

In [9]:
# 1. Label encoding
'''
有仔細閱讀參考資料的人可以發現，Label encoding 的表示方式會讓同一個欄位底下的類別之間有大小關係
(0<1<2<...)，所以在這裡我們只對有類別數量小於等於 2 的類別型欄位示範使用 Label encoding，但不
表示這樣處理是最好的，一切取決於欄位本身的意義適合哪一種表示方法。
'''
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
# Data directory and the train / test CSV paths inside it
dir_data = './data/'
f_app_train, f_app_test = (
    os.path.join(dir_data, csv_name)
    for csv_name in ('application_train.csv', 'application_test.csv')
)

# Load both splits into DataFrames (train first, then test)
app_train, app_test = (pd.read_csv(path) for path in (f_app_train, f_app_test))

In [10]:
# 3. Count how many columns there are of each dtype
app_train.dtypes.value_counts()

float64    65
int64      41
object     16
dtype: int64

In [11]:
# 3. For every categorical (object-dtype) column, count its distinct categories
app_train.select_dtypes(include=["object"]).nunique()

NAME_CONTRACT_TYPE             2
CODE_GENDER                    3
FLAG_OWN_CAR                   2
FLAG_OWN_REALTY                2
NAME_TYPE_SUITE                7
NAME_INCOME_TYPE               8
NAME_EDUCATION_TYPE            5
NAME_FAMILY_STATUS             6
NAME_HOUSING_TYPE              6
OCCUPATION_TYPE               18
WEEKDAY_APPR_PROCESS_START     7
ORGANIZATION_TYPE             58
FONDKAPREMONT_MODE             4
HOUSETYPE_MODE                 3
WALLSMATERIAL_MODE             7
EMERGENCYSTATE_MODE            2
dtype: int64

In [12]:
# A single encoder instance is reused; it is re-fit for every qualifying column
le = LabelEncoder()
le_count = 0

for col in app_train:
    # Only object-dtype (string) columns are candidates for label encoding
    if app_train[col].dtype != 'object':
        continue
    # Skip columns with more than two distinct values
    # (Series.unique() keeps NaN as a value, so a missing value counts too)
    if len(app_train[col].unique()) > 2:
        continue

    # Learn the mapping from the training data only ...
    le.fit(app_train[col])
    # ... then apply that same mapping to both the training and testing data
    app_train[col] = le.transform(app_train[col])
    app_test[col] = le.transform(app_test[col])

    # Keep track of how many columns were label encoded
    le_count += 1

print('%d columns were label encoded.' % le_count)

3 columns were label encoded.


In [16]:
# Label encoding
# 將 Country欄的字串轉換成數字(進行編碼)，這樣才能套入數學模型進行運算。
import numpy as np
import pandas as pd
# Three parallel lists, one entry per person
country = ['Taiwan', 'Australia', 'Ireland', 'Australia', 'Ireland', 'Taiwan']
age = [25, 30, 45, 35, 22, 36]
salary = [20000, 32000, 59000, 60000, 43000, 52000]

# Column name -> values; Country is the only string-valued column
dic = dict(Country=country, Age=age, Salary=salary)
data = pd.DataFrame(dic)
data

Unnamed: 0,Country,Age,Salary
0,Taiwan,25,20000
1,Australia,30,32000
2,Ireland,45,59000
3,Australia,35,60000
4,Ireland,22,43000
5,Taiwan,36,52000


In [17]:
from sklearn.preprocessing import LabelEncoder
# Build a fresh DataFrame from dic so the encoding is applied to a separate frame
data_le = pd.DataFrame(dic)

# fit_transform learns the distinct country names and returns an integer code per row
labelencoder = LabelEncoder()
country_codes = labelencoder.fit_transform(data_le['Country'])
data_le['Country'] = country_codes
data_le

Unnamed: 0,Country,Age,Salary
0,2,25,20000
1,0,30,32000
2,1,45,59000
3,0,35,60000
4,1,22,43000
5,2,36,52000


In [8]:
# 2. One Hot encoding
import os
import numpy as np
import pandas as pd
# Locate and load the training data
dir_data = './data/'
f_app_train = os.path.join(dir_data, 'application_train.csv')
app_train = pd.read_csv(f_app_train)

# Keep only the weekday column, as a one-column DataFrame
sub_train = app_train[['WEEKDAY_APPR_PROCESS_START']]
print(sub_train.shape)
sub_train.head()

(307511, 1)


Unnamed: 0,WEEKDAY_APPR_PROCESS_START
0,WEDNESDAY
1,MONDAY
2,MONDAY
3,WEDNESDAY
4,THURSDAY


In [15]:
# One-hot encode the weekday column. drop_first=True drops the dummy column of
# the first category, so k categories produce k-1 columns.
# NOTE(review): the recorded output below shows all 7 weekday columns, which
# suggests it was produced without drop_first=True - confirm by re-running.
sub_train = pd.get_dummies(sub_train, drop_first=True)
print(sub_train.shape)
sub_train.head()

(307511, 7)


Unnamed: 0,WEEKDAY_APPR_PROCESS_START_FRIDAY,WEEKDAY_APPR_PROCESS_START_MONDAY,WEEKDAY_APPR_PROCESS_START_SATURDAY,WEEKDAY_APPR_PROCESS_START_SUNDAY,WEEKDAY_APPR_PROCESS_START_THURSDAY,WEEKDAY_APPR_PROCESS_START_TUESDAY,WEEKDAY_APPR_PROCESS_START_WEDNESDAY
0,0,0,0,0,0,0,1
1,0,1,0,0,0,0,0
2,0,1,0,0,0,0,0
3,0,0,0,0,0,0,1
4,0,0,0,0,1,0,0


### Day007 : 特徵類型
- 如何將欄位名稱, 依照所屬類型分開, 並列出指定類型的部分資料