In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from subprocess import check_output

import os
# Show which data files are available in the input directory.
input_files = os.listdir("../input")
print(input_files)


# Any results you write to the current directory are saved as output.

In [None]:
# header=None tells pandas that the CSV has no header row: the first line is
# read as data and the columns receive integer labels (0, 1, 2, ...).
csv_paths = ("../input/train.csv", '../input/trainLabels.csv', '../input/test.csv')
train, trainLabel, test = (pd.read_csv(csv_path, header=None) for csv_path in csv_paths)

# Show the plotting styles matplotlib offers, then use ggplot for later figures.
available_styles = plt.style.available
print(available_styles)
plt.style.use('ggplot')

In [None]:
# Report the (rows, columns) shape of each loaded frame, then preview the
# first rows of the training features.
shape_report = (
    ('train shape:', train),
    ('test shape:', test),
    ('trainLabel shape:', trainLabel),
)
for caption, frame in shape_report:
    print(caption, frame.shape)
train.head()

In [None]:
# Print the column dtypes, non-null counts and memory usage of the training frame.
train.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each training column.
train.describe()

### Classification with kNN

In [None]:
# kNN with cross-validation
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, train_test_split

X, y = train, np.ravel(trainLabel)
# np.ravel flattens the single-column label frame into a 1-D array. It returns
# a view where possible, whereas ndarray.flatten() always returns a copy.
# A fixed random_state makes the 70/30 hold-out split, and every score derived
# from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
neig = np.arange(1, 25)  # candidate k values 1..24 (the upper bound is exclusive)
kfold = 10
train_accuracy = []
val_accuracy = []
bestKnn = None
bestAcc = 0.0
# Loop over the candidate values of k.
for k in neig:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    # Accuracy on the very samples the model was fitted on.
    train_accuracy.append(knn.score(X_train, y_train))
    # Mean k-fold cross-validated accuracy over the whole labelled set.
    # cross_val_score fits clones of `knn`, so `knn` itself stays fitted on
    # X_train. The score is computed once and reused for both the curve and
    # the best-model selection instead of re-running the same CV.
    cv_accuracy = np.mean(cross_val_score(knn, X, y, cv=kfold))
    val_accuracy.append(cv_accuracy)
    if cv_accuracy > bestAcc:
        bestAcc = cv_accuracy
        bestKnn = knn

# Plot
plt.figure(figsize=[13,8])
plt.plot(neig, val_accuracy, label='Validation Accuracy')
plt.plot(neig, train_accuracy, label='Training Accuracy')
plt.legend()  # draw the legend for the two curves
plt.title('k value VS Accuracy')
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.xticks(neig)
plt.show()

print('Best Accuracy without feature scaling:', bestAcc)
print(bestKnn)
    

In [None]:
# Predict the test set with the best unscaled kNN and build the submission frame.
# nan_to_num replaces NaN with 0 and infinities with large finite numbers.
test_fill = np.nan_to_num(test)
# NOTE(review): bestKnn was fitted on X_train (the 70% split) only; consider
# refitting on all of X before predicting.
predictions = bestKnn.predict(test_fill)
submission = pd.DataFrame({'Solution': predictions})
# The shape is printed before the Id column is added, so it reports one column.
print(submission.shape)
submission['Id'] = np.arange(1, len(submission) + 1)  # 1-based row ids
submission = submission[['Id', 'Solution']]
submission

In [None]:
# Save the unscaled-kNN predictions; index=False keeps the row index out of the file.
unscaled_csv = 'submission_no_normalization.csv'
submission.to_csv(unscaled_csv, index=False)

In [None]:
# List the working directory to confirm that the submission file was written.
working_listing = check_output(["ls", "../working"])
print(working_listing.decode("utf8"))

### Add Feature Scaling

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer

# Three alternative preprocessing schemes, each fitted on the full training set:
#   std  - StandardScaler: removes each feature's mean and scales it to unit variance.
#   mms  - MinMaxScaler: rescales each feature to a fixed range ([0, 1] by default).
#   norm - Normalizer: rescales each sample (row) to unit norm.
# NOTE(review): the scalers see all of X before cross-validation, so the
# validation folds share scaling statistics with the training folds.
std, mms, norm = StandardScaler(), MinMaxScaler(), Normalizer()
X_std = std.fit_transform(X)
X_mms = mms.fit_transform(X)
X_norm = norm.fit_transform(X)

In [None]:
neig = np.arange(1, 30)
kfold = 10
val_accuracy = {'std': [], 'mms': [], 'norm': []}
bestKnn = None
bestAcc = 0.0
bestScaling = None
# Scaled feature matrices, in the order in which they are evaluated for each k.
scaled_inputs = (('std', X_std), ('mms', X_mms), ('norm', X_norm))
# For every k, cross-validate on each scaled matrix and remember the best combination.
for k in neig:
    knn = KNeighborsClassifier(n_neighbors=k)
    for scaling_name, X_scaled in scaled_inputs:
        mean_accuracy = np.mean(cross_val_score(knn, X_scaled, y, cv=kfold))
        val_accuracy[scaling_name].append(mean_accuracy)
        # Strict '>' keeps the earliest winner when scores tie.
        if mean_accuracy > bestAcc:
            bestAcc, bestKnn, bestScaling = mean_accuracy, knn, scaling_name

# Plot one cross-validation accuracy curve per scaling scheme.
curve_labels = {
    'std': 'CV Accuracy with std',
    'mms': 'CV Accuracy with mms',
    'norm': 'CV Accuracy with norm',
}
plt.figure(figsize=[13,8])
for scaling_name, curve_label in curve_labels.items():
    plt.plot(neig, val_accuracy[scaling_name], label=curve_label)
plt.legend()
plt.title('k values VS Accuracy')
plt.xlabel('Number of Neighbors')
plt.ylabel('Accuracy')
plt.xticks(neig)
plt.show()

print('Best Accuracy with feature scaling:', bestAcc)
print('Best kNN classifier:', bestKnn)
print('Best Scaling:', bestScaling)

In [None]:
# predict on test
# Refit the selected kNN on the full training set using the scaling that won
# cross-validation (bestScaling), and transform the test set with that same
# fitted scaler. Previously the Normalizer was always used, whichever scaling
# had been selected.
fitted_scalers = {'std': (std, X_std), 'mms': (mms, X_mms), 'norm': (norm, X_norm)}
best_scaler, X_best = fitted_scalers[bestScaling]
bestKnn.fit(X_best, y)
submission = pd.DataFrame(bestKnn.predict(best_scaler.transform(test_fill)))
# The shape is printed before the Id column is added, so it reports one column.
print(submission.shape)
submission.columns = ['Solution']
submission['Id'] = np.arange(1, submission.shape[0]+1)  # 1-based row ids
submission = submission[['Id','Solution']]
submission

In [None]:
# Save the scaled-kNN predictions; index=False keeps the row index out of the file.
scaled_csv = 'submission_with_scaling.csv'
submission.to_csv(scaled_csv, index=False)

In [None]:
# List the working directory again to confirm that the new submission file exists.
working_listing = check_output(['ls','../working'])
print(working_listing.decode('utf8'))

### Feature Selection

In [None]:
# Pairwise correlation between the standardised features, drawn as a heatmap.
feature_corr = pd.DataFrame(X_std).corr()
f, ax = plt.subplots(figsize=(18, 18))
sns.heatmap(feature_corr, annot=True, linewidths=.5, fmt='.1f', ax=ax)
# annot=True writes the value into every cell of the heatmap.
# linewidths sets the width of the lines that separate the cells.
# fmt is the format string used for the annotations.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.metrics import accuracy_score

# Split the standardised data: 70% train and 30% validation.
# NOTE: this rebinds X_train / y_train, which previously held the unscaled split.
X_train, X_val, y_train, y_val = train_test_split(X_std, y, test_size=0.3, random_state=42)
# random_state: using the same value makes the split identical on every run,
# so results can be reproduced exactly. With None a random seed is chosen.

# Random forest with default hyper-parameters (n_estimators defaults to 100
# since scikit-learn 0.22; it was 10 in older releases).
clf_rf = RandomForestClassifier(random_state=43)
# fit() returns the estimator itself; the `clr_rf` alias is kept so that code
# using either name keeps working.
clr_rf = clf_rf.fit(X_train, y_train)

# Predict the validation set once and reuse it for both metrics.
y_val_pred = clf_rf.predict(X_val)

# Fraction of validation samples that were classified correctly.
ac = accuracy_score(y_val, y_val_pred)
print('Accuracy is:', ac)
# confusion_matrix: rows are the true labels and columns are the predicted labels.
cm = confusion_matrix(y_val, y_val_pred)
cm_ax = sns.heatmap(cm, annot=True, fmt="d")
cm_ax.set(xlabel='Predicted label', ylabel='True label')
plt.show()

In [None]:
from sklearn.svm import SVC
from sklearn.feature_selection import RFECV
# Recursive feature elimination (RFE):
# RFE repeatedly builds a model and uses its coefficients to pick the best (or
# worst) features. The picked features are set aside and the process is
# repeated on the remaining ones until every feature has been visited.

kfold = 10
bestSVC = None       # estimator fitted on the best-scoring feature subset
bestSupport = None   # boolean mask of the features that bestSVC was fitted on
bestAcc = 0.0
val_accuracy = []
cv_range = np.arange(5, 11)  # fold counts tried for RFECV: 5..10
n_feature = []
for cv in cv_range:
    # Create the RFE object and compute a cross-validated score.
    svc = SVC(kernel='linear')
    rfecv = RFECV(estimator=svc, step=1, cv=cv, scoring='accuracy')
    rfecv.fit(X_std, y)
    # estimator: the underlying model whose coefficients rank the features.
    # step: number of features removed at each iteration.
    # n_features_: number of selected features.
    # support_: boolean mask of length n_features; True marks a selected feature.
    # ranking_: feature ranking; ranking_[i] is the rank position of feature i.
    # Score a fresh linear SVC with `kfold`-fold CV on the selected columns only.
    # NOTE(review): the features were selected using all of X_std / y, so this
    # accuracy is likely optimistic.
    val_accuracy.append(np.mean(cross_val_score(svc, X_std[:, rfecv.support_], y, cv=kfold)))
    n_feature.append(rfecv.n_features_)
    if val_accuracy[-1] > bestAcc:
        # val_accuracy[-1] is the accuracy that was just appended.
        bestAcc = val_accuracy[-1]
        # Keep the winning model; it was left as None before. estimator_ is the
        # SVC that RFECV fitted on the selected features.
        bestSVC = rfecv.estimator_
        bestSupport = rfecv.support_

# Plot
plt.figure(figsize=[13,8])
plt.plot(cv_range, val_accuracy, label='CV Accuracy')
for i in range(len(cv_range)):
    # Annotate each point with the number of features RFECV selected.
    plt.annotate(str(n_feature[i]), xy=(cv_range[i], val_accuracy[i]))
plt.legend()
plt.title('Cross Validation Accuracy')
plt.xlabel('k fold')
plt.ylabel('Accuracy')
plt.show()

print('Best Accuracy with feature scaling and RFECV:', bestAcc)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.ensemble import VotingClassifier

# Convert the loaded frames to plain NumPy arrays; the labels are flattened to 1-D.
x_train = np.asarray(train)
y_train = np.asarray(trainLabel).ravel()
x_test = np.asarray(test)
print('training_x shape:', x_train.shape, ',training_y shape:', y_train.shape, ',testing_x shape:', x_test.shape)

# Stack the train and test rows so the mixture model below can be fitted on
# all available samples (np.r_ concatenates along the first axis).
x_all = np.r_[x_train, x_test]
print('x_all shape:', x_all.shape)

# using the gaussian mixture model
from sklearn.mixture import GaussianMixture
# np.inf replaces np.infty, an alias that was removed in NumPy 2.0.
lowest_bic = np.inf
# NOTE: despite the names `bic` / `lowest_bic`, the criterion computed below
# is the AIC (gmm.aic); the names are kept for compatibility.
bic = []
best_gmm = None
n_components_range = range(1,7)
cv_types = ['spherical', 'tied', 'diag', 'full']
for cv_type in cv_types:
    for n_components in n_components_range:
        # Fit a mixture of Gaussians with EM
        gmm = GaussianMixture(n_components=n_components, covariance_type=cv_type)
        gmm.fit(x_all)
        bic.append(gmm.aic(x_all))
        # Keep the fitted model with the lowest criterion value seen so far.
        if bic[-1] < lowest_bic:
            lowest_bic = bic[-1]
            best_gmm = gmm

# best_gmm is already fitted on x_all. It is not refitted here, because a new
# fit would restart EM from a fresh random initialisation and could yield a
# different model from the one that was just selected.
# Replace the raw features with the per-component posterior probabilities.
x_train = best_gmm.predict_proba(x_train)
x_test = best_gmm.predict_proba(x_test)

# Taking only two models for keeping it simple
knn = KNeighborsClassifier()
rf = RandomForestClassifier()

# The parameter grids are empty, so each "search" cross-validates only the
# default configuration of its model.
param_grid = dict()
# Grid search for best tuning parameters for KNN
grid_search_knn = GridSearchCV(knn, param_grid=param_grid, cv=10, scoring='accuracy').fit(x_train, y_train)
knn_best = grid_search_knn.best_estimator_
# The printed score is accuracy on the training data itself; the
# cross-validated score is available as grid_search_knn.best_score_.
print('best estimator KNN:', knn_best, 'Best Score', knn_best.score(x_train, y_train))

# Grid search for best tuning parameters for RandomForest
grid_search_rf = GridSearchCV(rf, param_grid=dict(), verbose=3, scoring='accuracy', cv=10).fit(x_train, y_train)
rf_best = grid_search_rf.best_estimator_
# Score the random forest itself; the kNN estimator's score was printed here
# before because of a copy-paste slip.
print('best estimator RandomForest:', rf_best, 'Best Score:', rf_best.score(x_train, y_train))

# Fit both tuned models on the full (GMM-transformed) training set and preview
# their first ten test predictions.
knn_best.fit(x_train, y_train)
print(knn_best.predict(x_test)[0:10])
rf_best.fit(x_train, y_train)
print(rf_best.predict(x_test)[0:10])

# scoring the models
# Both models are summarised by the mean of their 10 fold accuracies so the two
# numbers are comparable; the random forest previously reported its best fold.
print('Score for KNN:', cross_val_score(knn_best,x_train,y_train,cv=10,scoring='accuracy').mean())
print('Score for Random Forest:', cross_val_score(rf_best, x_train, y_train, cv=10, scoring='accuracy').mean())

# Build the prediction frames for both models, using a 1-based row index.
knn_labels = knn_best.predict(x_test)
rf_labels = rf_best.predict(x_test)

knn_best_pred = pd.DataFrame(knn_labels)
knn_best_pred.index += 1

rf_best_pred = pd.DataFrame({'Solution': rf_labels})
rf_best_pred.index += 1

# Only the random-forest predictions are written out, as Id / Solution columns.
rf_best_pred['Id'] = np.arange(1, len(rf_best_pred) + 1)
rf_best_pred = rf_best_pred[['Id', 'Solution']]
print(rf_best_pred)

# index=False: do not write the row index into the CSV.
rf_csv = 'Submission_rf.csv'
rf_best_pred.to_csv(rf_csv, index=False)