In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter(action = "ignore", category = FutureWarning)

## 데이터 불러오기
- pd.read_csv(): 파일로 저장된 데이터를 DataFrame형태로 불러옴

In [None]:
import os
from urllib.request import urlretrieve

# Download the Adult Census data file unless a local copy already exists
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
local_filename = os.path.basename(url)
if not os.path.exists(local_filename):
    print("Downloading Adult Census datasets from UCI")
    urlretrieve(url, local_filename)

# Column names to assign, in file order
names = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week",
    "native-country", "income",
]

# Load the downloaded file into a DataFrame
data = pd.read_csv(local_filename, names=names)

## 데이터 탐색하기
- DataFrame.head(): 초반 n행 반환
- DataFrame.tail(): 마지막 n행 반환
- DataFrame.count(): 데이터 수 세기
- DataFrame.describe(): 요약 통계 반환

In [None]:
# Display the loaded DataFrame
data

age: 나이 <br>
workclass: 직업구분 : 개인사업, 연방정부 등… <br>
fnlwgt: (final weight) 사람수 <br>
education: 교육수준 (categorical) <br>
education-num: 교육수준 <br>
marital-status: 혼인상태 <br>
occupation: 직업군 : 수산업, 기술업, 서비스업 등 <br>
relationship: 가족 <br>
race: 인종 <br>
sex: 성별<br>
capital-gain: 자본이익 총액<br>
capital-loss: 자본손실 총액<br>
hours-per-week: 주당근로시간<br>
native-country: 출생지<br>

In [None]:
# Return the first n rows
data.head()
# data.head(n=10)

In [None]:
# Return the last n rows
data.tail()
# data.tail(n=10)

In [None]:
# Count non-null values. axis=0: one count per column; axis=1: one count per row
data.count()
# data.count(axis=0)
# data.count(axis=1)

In [None]:
# Return summary statistics
data.describe()
#data.describe(include=[np.number]) # equivalent: numeric columns only

In [None]:
# Summary of the categorical (object-dtype) columns only.
# The `np.object` alias was removed in NumPy 1.24; the builtin `object` selects the same dtype.
data.describe(include=[object])

In [None]:
# Similar to an Excel pivot table: number of rows per occupation
data.groupby('occupation').size()

In [None]:
# Sort ascending (smallest group first)
data.groupby('occupation').size().sort_values(ascending=True)

In [None]:
data.groupby('native-country').size().sort_values(ascending=False)

### Quiz 1
- 'education'에 따라 그룹을 지어 각 그룹에 몇 명이 속하는지 오름차순으로 나타내어 보세요

In [None]:
# your answer


## 데이터 탐색하기(시각화)

In [None]:
from math import ceil

# Draw one subplot per column on a grid that is `cols` wide:
# bar chart of value counts for object (string) columns, histogram otherwise.
fig = plt.figure(figsize=(20,15))
cols = 5
rows = ceil(float(data.shape[1]) / cols)
for i, column in enumerate(data.columns):
    ax = fig.add_subplot(rows, cols, i + 1)
    ax.set_title(column)
    # The `np.object` alias was removed in NumPy 1.24; compare against builtin `object`.
    if data.dtypes[column] == object:
        # pandas plotting takes the target Axes through `ax=` (not `axes=`)
        data[column].value_counts().plot(kind="bar", ax=ax)
    else:
        data[column].hist(ax=ax)
        plt.xticks(rotation="vertical")
plt.subplots_adjust(hspace=0.7, wspace=0.2)

In [None]:
data.hist(column='education-num', bins=10);

In [None]:
data.hist('hours-per-week', bins=15);

In [None]:
data.hist('hours-per-week', bins=10);

In [None]:
data.groupby('occupation').size().plot(kind='bar')

In [None]:
# The mean is only defined for numeric columns; `numeric_only=True` restricts the
# aggregation to them (pandas >= 2.0 raises on the string columns otherwise).
data.groupby('income').mean(numeric_only=True)

In [None]:
data.groupby('income').count()

In [None]:
data.groupby('income')['income'].count()

### Quiz 2
- 'education'의 바 플롯 그래프, 'age'의 히스토그램 그래프를 그리세요
- 'education'에 따라 그룹을 지어 각 그룹에 평균 나이를 구하세요

In [None]:
#your answer


In [None]:
# Unique (distinct) values of the income column
data['income'].unique()

In [None]:
# Fraction of rows whose income label is ' >50K'
np.mean(data['income'] == ' >50K')

In [None]:
# Store the distinct income labels (the values we want to tell apart) as target_names
target_names = data['income'].unique()

In [None]:
target_names

In [None]:
# Rows for each income label (the labels in the data carry a leading space)
low_income = data.loc[data['income'] == ' <=50K']
high_income = data.loc[data['income'] == ' >50K']

# Shared bin edges so the three histograms are directly comparable
bins = np.linspace(10, 90, 20)
for frame, alpha, label in (
    (data, 0.1, 'ALL'),
    (low_income, 0.5, '<=50K'),
    (high_income, 0.5, '>50K'),
):
    plt.hist(frame['age'].values, bins=bins, alpha=alpha, label=label)

plt.legend(loc='best');

In [None]:
#data.isnull()
#data.isnull().any()
# True if any cell in the whole DataFrame is null
data.isnull().any().any()

In [None]:
#data = data.dropna()

In [None]:
data.hist(column='age', bins=bins);

In [None]:
data.plot(x='age', y='hours-per-week', kind='scatter',
          alpha=0.02, s=50);

In [None]:
# alpha: transparency in [0, 1]
# s: marker size
# c: color (RGB or RGBA also accepted)
# marker: . , o v ^ < > * x X + D d | - etc.
# see http://matplotlib.org/api/markers_api.html#module-matplotlib.markers
for frame, color, label in (
    (low_income, 'blue', '<=50K'),
    (high_income, 'green', '>50K'),
):
    plt.scatter(frame['age'], frame['hours-per-week'],
                alpha=0.03, s=50, c=color, label=label)
plt.legend()
plt.xlabel('age')
plt.ylabel('hours-per-week');

### Quiz 3
- alpha, s, c, marker값등을 변경해 자신만의 그래프를 그려보세요.

In [None]:
# alpha: 투명도 [0,1]
# s: 사이즈
# c: 색깔(RGB or RGBA 표현 가능)
# marker: . , o v ^ < > * x X + D d | - 등등
# http://matplotlib.org/api/markers_api.html#module-matplotlib.markers 참조


## 예측 모델 생성(Decision tree)

In [None]:
# Of the 15 columns, 'income' is the target; the remaining columns are the features
target = data['income']
features_data = data.drop('income', axis=1)

In [None]:
print('독립변수 (X): ',features_data.shape, '종속변수 (y)',target.shape)

In [None]:
# Names of the feature columns whose dtype kind is integer ('i') or float ('f')
numeric_features = [
    col
    for col in features_data.columns
    if features_data[col].dtype.kind in ('i', 'f')
]
numeric_features

In [None]:
numeric_data = features_data[numeric_features]
numeric_data.head(5)

In [None]:
# Drop the numeric columns from features_data; what remains is the categorical data
categorical_data = features_data.drop(numeric_features, axis=1)
categorical_data.head(5)

In [None]:
# pd.factorize(): used to convert text values to integer codes for modeling
pd.factorize(['a','b','c'])

In [None]:
# Convert text values to numbers for modeling, one column at a time;
# [0] keeps the first element of pd.factorize's result (the integer codes)
categorical_data_encoded = categorical_data.apply(lambda x: pd.factorize(x)[0])
categorical_data_encoded.head(5)

In [None]:
# Join the original numeric_data with the encoded categorical columns, side by side
features = pd.concat([numeric_data, categorical_data_encoded], axis=1)
features.head()

In [None]:
# Alternative approach: one-hot encoding
features2 = pd.get_dummies(features_data)
features2.head()

In [None]:
# Feature matrix as a float32 array
X = features.values.astype(np.float32)
# Binary labels: 1 where the income label equals ' >50K', 0 otherwise
y = (target.values == ' >50K').astype(np.int32)

In [None]:
X.shape

In [None]:
y

In [None]:
# Split into training and test data (test_size=0.2, fixed random_state for repeatability)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
                                        X, y, test_size=0.2, random_state=0)

In [None]:
print("length of y_train:{}\nlength of y_test: {}".format(len(y_train), len(y_test)))

In [None]:
len(y_test)/(len(y_train)+ len(y_test))

In [None]:
from sklearn.tree import DecisionTreeClassifier
# `sklearn.cross_validation` was removed from scikit-learn; the function lives in model_selection
from sklearn.model_selection import cross_val_score

clf = DecisionTreeClassifier(max_depth=8)

# 5-fold cross-validation on the training split. The printed mean/std are now real
# CV scores; previously they were computed from the predicted labels.
cv_scores = cross_val_score(clf, X_train, y_train, cv=5)
print("Cross validation 결과: 평균 {:.4f} 표준편차 +/-{:.4f}".format(
    np.mean(cv_scores), np.std(cv_scores)))

# Fit on the full training split. `scores` keeps holding the predicted test labels
# because the confusion-matrix and report cells below read it.
clf.fit(X_train, y_train)
scores = clf.predict(X_test)

## Confusion matrix

Binary Classification 문제중...

True Positive: 모델이 실제 1을 1로 올바르게 예측한 경우<br>
True Negative: 모델이 실제 0을 0으로 올바르게 예측한 경우<br>
False Positive: 모델이 실제 0을 1로 올바르지 않게 예측한 경우<br>
False Negative: 모델이 실제 1을 0으로 올바르지 않게 예측한 경우<br>

| TN | FP   |
|------|------|
|   FN  | TP|

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, scores)

In [None]:
import seaborn as sn

# Confusion matrix of the test labels against the predictions, labelled 0..n-1 on both axes
cm = confusion_matrix(y_test, scores)
n_rows, n_cols = cm.shape
df_cm = pd.DataFrame(cm, index=list(range(n_rows)), columns=list(range(n_cols)))

plt.figure(figsize=(5, 5))
sn.set(font_scale=1.4)
sn.heatmap(df_cm, annot=True, annot_kws={"size": 16}, fmt='g')

### Accuracy 정확도 =  (TP+TN) / (TP+FP+TN+FN)
### Precision 정밀도 = TP / (TP+FP)
### Recall 재현율 = TP / (TP+FN)
### F1 = 2 x precision x recall / (precision+recall)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

print('Accuracy: ',accuracy_score(y_test, scores),'\n')
print(classification_report(y_test, scores, target_names=['<=50K 0', '>50K 1']))


## Decision tree 시각화

In [None]:
# Shell command: installs python-graphviz with conda (--yes skips the confirmation prompt)
!conda install python-graphviz --yes

In [None]:
from io import StringIO
import pydotplus
from IPython.display import Image
from sklearn.tree import export_graphviz


def treeviz(tree, feature_names, class_names):
    """Render a fitted decision tree as a PNG image for inline display.

    Parameters
    ----------
    tree : the fitted tree estimator to export.
    feature_names : names used to label the split features.
    class_names : names used to label the predicted classes.

    Returns
    -------
    IPython.display.Image wrapping the PNG bytes produced by pydotplus.
    """
    dot_data = StringIO()
    # Export the estimator that was passed in; the previous version ignored
    # `tree` and always exported the global `clf`.
    export_graphviz(tree, out_file=dot_data,
                    feature_names=feature_names,
                    class_names=class_names,
                    filled=True, rounded=True,
                    special_characters=True)
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    return Image(graph.create_png())

In [None]:
from sklearn.tree import DecisionTreeClassifier
# `sklearn.cross_validation` was removed from scikit-learn; import from model_selection instead
from sklearn.model_selection import cross_val_score

feature_names = features.columns
# Inequality signs cause an error when rendering, so the class labels are spelled out as text
target_names = np.array(['50K and under','Over 50K'])

# A shallow tree (max_depth=2) keeps the rendered diagram small
clf = DecisionTreeClassifier(max_depth=2)
clf.fit(X_train, y_train)
treeviz(clf, feature_names, target_names)

## 예측 모델 에러 분석

In [None]:
from sklearn.model_selection import learning_curve


def plot_learning_curve(estimator, X, y, ylim=(0, 1.1), cv=5,
                        n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5),
                        scoring=None):
    """Plot mean training and cross-validation scores against training-set size.

    Draws on the current matplotlib figure and prints the highest mean
    cross-validation score among the evaluated training sizes.

    Parameters
    ----------
    estimator : estimator forwarded to ``learning_curve``; its class name is
        used in the plot title.
    X, y : data and labels forwarded to ``learning_curve``.
    ylim : (low, high) limits for the y axis.
    cv, n_jobs, train_sizes, scoring : forwarded unchanged to ``learning_curve``.
    """
    plt.title("Learning curves for %s" % type(estimator).__name__)
    plt.ylim(*ylim)
    plt.grid()
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, train_scores, validation_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes,
        scoring=scoring)
    # Average along axis 1 so there is one value per plotted training size
    train_scores_mean = np.mean(train_scores, axis=1)
    validation_scores_mean = np.mean(validation_scores, axis=1)

    plt.plot(sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(sizes, validation_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    # Report the maximum so the value matches the "Best" label
    # (the previous version printed the last point regardless of its rank).
    print("Best validation score: {:.4f}".format(np.max(validation_scores_mean)))

In [None]:
# Learning curve for a depth-1 tree, scored by ROC AUC
clf = DecisionTreeClassifier(max_depth=1)
plot_learning_curve(clf, X_train, y_train, scoring='roc_auc')

In [None]:
# Learning curve for a depth-15 tree, scored by ROC AUC
clf = DecisionTreeClassifier(max_depth=15)
plot_learning_curve(clf, X_train, y_train, scoring='roc_auc')

In [None]:
# Learning curve for a depth-8 tree, scored by ROC AUC
clf = DecisionTreeClassifier(max_depth=8)
plot_learning_curve(clf, X_train, y_train, scoring='roc_auc')

In [None]:
# Learning curve for a depth-4 tree, scored by ROC AUC
clf = DecisionTreeClassifier(max_depth=4)
plot_learning_curve(clf, X_train, y_train, scoring='roc_auc')

In [None]:
# `sklearn.learning_curve` was removed from scikit-learn; validation_curve lives in model_selection
from sklearn.model_selection import validation_curve


def plot_validation_curve(estimator, X, y, param_name, param_range,
                          ylim=(0, 1.1), cv=5, n_jobs=-1, scoring=None):
    """Plot mean training and cross-validation scores over a hyper-parameter range.

    Draws on the current matplotlib figure with a logarithmic x axis and
    prints the highest mean cross-validation score.

    Parameters
    ----------
    estimator : estimator forwarded to ``validation_curve``; its class name is
        used in the plot title.
    X, y : data and labels forwarded to ``validation_curve``.
    param_name : name of the estimator parameter being varied.
    param_range : sequence of values to evaluate for ``param_name``.
    ylim : (low, high) limits for the y axis.
    cv, n_jobs, scoring : forwarded unchanged to ``validation_curve``.
    """
    estimator_name = type(estimator).__name__
    plt.title("Validation curves for %s on %s"
              % (param_name, estimator_name))
    plt.ylim(*ylim)
    plt.grid()
    plt.xlim(min(param_range), max(param_range))
    plt.xlabel(param_name)
    plt.ylabel("Score")

    # param_name / param_range are passed by keyword: current scikit-learn
    # rejects them as positional arguments, and keywords work on older releases too.
    train_scores, test_scores = validation_curve(
        estimator, X, y, param_name=param_name, param_range=param_range,
        cv=cv, n_jobs=n_jobs, scoring=scoring)

    # Average along axis 1 so there is one value per parameter setting
    train_scores_mean = np.mean(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    plt.semilogx(param_range, train_scores_mean, 'o-', color="r",
                 label="Training score")
    plt.semilogx(param_range, test_scores_mean, 'o-', color="g",
                 label="Cross-validation score")
    plt.legend(loc="best")

    print("Best test score: {:.4f}".format(np.max(test_scores_mean)))

In [None]:
# Validation curve over max_depth, scored by ROC AUC.
# NOTE(review): the max_depth=8 given here is presumably overridden for each value in
# param_range by validation_curve — confirm against the scikit-learn docs.
clf = DecisionTreeClassifier(max_depth=8)
param_name = 'max_depth'
param_range = [1, 2, 4, 8, 16, 32]

plot_validation_curve(clf, X_train, y_train,
                      param_name, param_range, scoring='roc_auc')

### 오버피팅
- 어떤게 더 좋은 예측일까
![](http://stephanie-w.github.io/brainscribble/figure/classification-algorithms-on-iris-dataset_50_0.png)
![](http://stephanie-w.github.io/brainscribble/figure/classification-algorithms-on-iris-dataset_48_0.png)


## 변수 주요도

In [None]:
plt.figure(figsize=(10, 5))

# Fit whichever estimator `clf` currently refers to on the training split
clf.fit(X_train, y_train)
# Feature indices ordered by importance, largest first ([::-1] reverses argsort's ascending order)
ordering = np.argsort(clf.feature_importances_)[::-1]

importances = clf.feature_importances_[ordering]
# NOTE: this rebinds feature_names to the importance-sorted column labels
feature_names = features.columns[ordering]

# One bar per feature, labelled with the feature name
x = np.arange(len(feature_names))
plt.bar(x, importances)
plt.xticks(x, feature_names, rotation=90, fontsize=15);

![](./images/variable_coefficient.png)

- 양의 상관관계: `Capital Gain`, `Married-civ-spouse`, `Age`, `Hours per week`, `Exec-managerial`.
- 음의 상관관계: `Never married`, `Own child`, `Priv-house-serv`, `Divorced`, `Female`