In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
# Load the student table from a CSV file located relative to the notebook's working directory.
students = pd.read_csv('./data/student-data.csv')

In [3]:
# Display the (rows, columns) size of the freshly loaded table.
students.shape

(395, 30)

In [7]:
# Replace every text column with integer codes, numbering the distinct values
# in order of first appearance (0, 1, 2, ...).
for column_name in students:
    column_content = students[column_name]
    # Treat both classic object columns and pandas string-dtype columns as text.
    is_text_column = (
        pd.api.types.is_object_dtype(column_content.dtype)
        or isinstance(column_content.dtype, pd.StringDtype)
    )
    if is_text_column:
        # Build the value -> code lookup once per column instead of scanning the
        # unique-value array for every single row.
        value_to_code = {
            value: code
            for code, value in enumerate(column_content.dropna().unique())
        }
        # Missing values are absent from the lookup, so they remain missing
        # rather than raising an IndexError.
        students[column_name] = column_content.map(value_to_code)

In [8]:
# Mark each cell that lies more than three standard deviations from its column mean.
condition = (students - students.mean()).abs() > 3 * students.std()
# Keep (as an independent copy) only the rows in which no cell was marked.
samples = students.loc[~condition.any(axis=1)].copy()

In [9]:
# Keep only the rows that have no missing value in any column.
samples = samples.dropna()

In [10]:
# Per-column flag: True if the column still contains at least one missing value.
samples.isna().any()

school        False
sex           False
age           False
address       False
famsize       False
Pstatus       False
Medu          False
Fedu          False
Mjob          False
Fjob          False
reason        False
guardian      False
traveltime    False
studytime     False
failures      False
schoolsup     False
famsup        False
paid          False
activities    False
nursery       False
higher        False
internet      False
romantic      False
famrel        False
freetime      False
goout         False
Dalc          False
Walc          False
health        False
passed        False
dtype: bool

In [11]:
# Display the (rows, columns) size of the table after outlier and null filtering.
samples.shape

(347, 30)

In [12]:
# The last column is used as the label; every column before it is a feature.
train, target = samples.iloc[:, :-1], samples.iloc[:, -1]

In [13]:
from sklearn.preprocessing import Normalizer, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split

In [14]:
# Build three differently scaled versions of the feature table.
# NOTE(review): each scaler is fitted on the complete table, i.e. before any
# train/test split is made further down, so test rows influence the scaling.
n_train = Normalizer().fit(train).transform(train)
m_train = MinMaxScaler().fit(train).transform(train)
s_train = StandardScaler().fit(train).transform(train)

In [18]:
# Use the same hold-out fraction and seed for every scaled variant so the
# three splits are made with identical settings.
split_options = {"test_size": 0.25, "random_state": 1}
X_train1, X_test1, y_train1, y_test1 = train_test_split(n_train, target, **split_options)
X_train2, X_test2, y_train2, y_test2 = train_test_split(m_train, target, **split_options)
X_train3, X_test3, y_train3, y_test3 = train_test_split(s_train, target, **split_options)

In [19]:
from sklearn.neighbors import KNeighborsClassifier

# One k-NN estimator is refitted in turn on each scaled variant of the data.
knn = KNeighborsClassifier()

knn.fit(X_train1, y_train1)
print("Normalizer score is {}".format(knn.score(X_test1, y_test1)))

knn.fit(X_train2, y_train2)
print("MInMaxScaler score is {}".format(knn.score(X_test2, y_test2)))

knn.fit(X_train3, y_train3)
print("StandardScaler score is {}".format(knn.score(X_test3, y_test3)))

Normalizer score is 0.5632183908045977
MInMaxScaler score is 0.5632183908045977
StandardScaler score is 0.6091954022988506


In [21]:
from sklearn.linear_model import LogisticRegression
# One logistic-regression estimator is refitted in turn on each scaled variant.
logistic = LogisticRegression()

logistic.fit(X_train1, y_train1)
print("Normalizer score is {}".format(logistic.score(X_test1, y_test1)))

logistic.fit(X_train2, y_train2)
print("MinMaxScaler score is {}".format(logistic.score(X_test2, y_test2)))

logistic.fit(X_train3, y_train3)
print("StandardScaler score is {}".format(logistic.score(X_test3, y_test3)))

Normalizer score is 0.6206896551724138
MinMaxScaler score is 0.6781609195402298
StandardScaler score is 0.632183908045977


In [22]:
def select_good_model(model, train, target, test_sizes=(0.1, 0.2, 0.3), random_state=None):
    """Fit and score ``model`` on one hold-out split per test-set fraction.

    For every fraction in ``test_sizes`` the data is split with
    ``train_test_split``, ``model`` is fitted on the training part and scored
    on the held-out part, and one line with the result is printed.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` (returning the estimator) and
        ``score(X, y)``. It is refitted in place on every split.
    train : feature matrix passed to ``train_test_split``.
    target : labels passed to ``train_test_split``.
    test_sizes : iterable of test-set fractions to evaluate; defaults to the
        three fractions 0.1, 0.2 and 0.3.
    random_state : forwarded to ``train_test_split``; the default ``None``
        leaves the splits unseeded, pass an int for repeatable splits.

    Returns
    -------
    The standard deviation of the collected scores (a measure of how much the
    score moves between the different splits).

    Raises
    ------
    ValueError
        If ``test_sizes`` is empty.
    """
    test_sizes = list(test_sizes)
    if not test_sizes:
        raise ValueError("test_sizes must contain at least one fraction")

    # The class name does not change between splits, so look it up once.
    modelname = model.__class__.__name__
    scores = []
    for size in test_sizes:
        X_train, X_test, y_train, y_test = train_test_split(
            train, target, test_size=size, random_state=random_state
        )
        score = model.fit(X_train, y_train).score(X_test, y_test)
        print("{} scale:{} score:{}".format(modelname, size, score))
        scores.append(score)
    return np.array(scores).std()
        

In [23]:
# Score k-NN on the min-max-scaled features at each hold-out fraction; the cell
# displays the returned standard deviation of those scores.
select_good_model(knn, m_train, target)

KNeighborsClassifier scale:0.1 score:0.6
KNeighborsClassifier scale:0.2 score:0.6
KNeighborsClassifier scale:0.3 score:0.638095238095238


0.017958267458705955

In [24]:
# Same evaluation for logistic regression on the min-max-scaled features; the
# cell displays the returned standard deviation of the scores.
select_good_model(logistic, m_train, target)

LogisticRegression scale:0.1 score:0.6857142857142857
LogisticRegression scale:0.2 score:0.7285714285714285
LogisticRegression scale:0.3 score:0.6761904761904762


0.02278206364191636

In [26]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [28]:
# Three further classifiers to compare on the min-max-scaled features.
gaussian = GaussianNB()
svc = SVC(kernel='rbf')
tree = DecisionTreeClassifier()
# Each call prints its own per-split scores. Only the final call is the cell's
# last expression, so only its return value appears as the cell result.
select_good_model(gaussian, m_train, target)
select_good_model(svc, m_train, target)
select_good_model(tree, m_train, target)

GaussianNB scale:0.1 score:0.5142857142857142
GaussianNB scale:0.2 score:0.6142857142857143
GaussianNB scale:0.3 score:0.6857142857142857
SVC scale:0.1 score:0.6
SVC scale:0.2 score:0.7285714285714285
SVC scale:0.3 score:0.6857142857142857
DecisionTreeClassifier scale:0.1 score:0.5714285714285714
DecisionTreeClassifier scale:0.2 score:0.6714285714285714
DecisionTreeClassifier scale:0.3 score:0.6


0.04205600412537069

In [30]:
def select_best_model(model, n_runs=10):
    """Repeat ``select_good_model`` several times and print the average spread.

    Each run calls ``select_good_model(model, m_train, target)`` using the
    notebook-level ``m_train`` and ``target`` variables and collects the
    standard deviation it returns. The mean of those values is then printed
    together with the model's class name.

    Note: the printed (Chinese) label describes the value as a mean of
    variances, whereas the quantity averaged here is the standard deviation
    returned by ``select_good_model``.

    Parameters
    ----------
    model : estimator forwarded unchanged to ``select_good_model``.
    n_runs : number of repetitions to average over; defaults to 10.

    Returns
    -------
    None. The result is only printed.

    Raises
    ------
    ValueError
        If ``n_runs`` is smaller than 1.
    """
    if n_runs < 1:
        raise ValueError("n_runs must be at least 1")

    std_list = [select_good_model(model, m_train, target) for _ in range(n_runs)]

    mean = np.array(std_list).mean()
    model_name = model.__class__.__name__
    print('{}最终的运行评分的方差均值为{}'.format(model_name, mean))
    

In [31]:
# Repeated evaluation of the k-NN classifier; prints every score and the averaged spread.
select_best_model(knn)

KNeighborsClassifier scale:0.1 score:0.6285714285714286
KNeighborsClassifier scale:0.2 score:0.7142857142857143
KNeighborsClassifier scale:0.3 score:0.6476190476190476
KNeighborsClassifier scale:0.1 score:0.7142857142857143
KNeighborsClassifier scale:0.2 score:0.6571428571428571
KNeighborsClassifier scale:0.3 score:0.6476190476190476
KNeighborsClassifier scale:0.1 score:0.7142857142857143
KNeighborsClassifier scale:0.2 score:0.6142857142857143
KNeighborsClassifier scale:0.3 score:0.7142857142857143
KNeighborsClassifier scale:0.1 score:0.6857142857142857
KNeighborsClassifier scale:0.2 score:0.6285714285714286
KNeighborsClassifier scale:0.3 score:0.6
KNeighborsClassifier scale:0.1 score:0.7142857142857143
KNeighborsClassifier scale:0.2 score:0.6285714285714286
KNeighborsClassifier scale:0.3 score:0.6095238095238096
KNeighborsClassifier scale:0.1 score:0.6285714285714286
KNeighborsClassifier scale:0.2 score:0.6571428571428571
KNeighborsClassifier scale:0.3 score:0.6666666666666666
KNeighb

In [33]:
# Repeated evaluation of logistic regression; prints every score and the averaged spread.
select_best_model(logistic)

LogisticRegression scale:0.1 score:0.8
LogisticRegression scale:0.2 score:0.7142857142857143
LogisticRegression scale:0.3 score:0.6190476190476191
LogisticRegression scale:0.1 score:0.6571428571428571
LogisticRegression scale:0.2 score:0.7428571428571429
LogisticRegression scale:0.3 score:0.7619047619047619
LogisticRegression scale:0.1 score:0.6857142857142857
LogisticRegression scale:0.2 score:0.7571428571428571
LogisticRegression scale:0.3 score:0.7619047619047619
LogisticRegression scale:0.1 score:0.6285714285714286
LogisticRegression scale:0.2 score:0.6714285714285714
LogisticRegression scale:0.3 score:0.7047619047619048
LogisticRegression scale:0.1 score:0.7428571428571429
LogisticRegression scale:0.2 score:0.7428571428571429
LogisticRegression scale:0.3 score:0.7142857142857143
LogisticRegression scale:0.1 score:0.6857142857142857
LogisticRegression scale:0.2 score:0.6857142857142857
LogisticRegression scale:0.3 score:0.6857142857142857
LogisticRegression scale:0.1 score:0.828571

In [34]:
# Repeated evaluation of the RBF-kernel SVC; prints every score and the averaged spread.
select_best_model(svc)

SVC scale:0.1 score:0.6
SVC scale:0.2 score:0.7
SVC scale:0.3 score:0.7619047619047619
SVC scale:0.1 score:0.6857142857142857
SVC scale:0.2 score:0.7
SVC scale:0.3 score:0.6095238095238096
SVC scale:0.1 score:0.6285714285714286
SVC scale:0.2 score:0.6142857142857143
SVC scale:0.3 score:0.7047619047619048
SVC scale:0.1 score:0.7428571428571429
SVC scale:0.2 score:0.6714285714285714
SVC scale:0.3 score:0.7047619047619048
SVC scale:0.1 score:0.6857142857142857
SVC scale:0.2 score:0.7571428571428571
SVC scale:0.3 score:0.6952380952380952
SVC scale:0.1 score:0.5428571428571428
SVC scale:0.2 score:0.6142857142857143
SVC scale:0.3 score:0.6666666666666666
SVC scale:0.1 score:0.4857142857142857
SVC scale:0.2 score:0.7142857142857143
SVC scale:0.3 score:0.7238095238095238
SVC scale:0.1 score:0.6857142857142857
SVC scale:0.2 score:0.7285714285714285
SVC scale:0.3 score:0.7238095238095238
SVC scale:0.1 score:0.6285714285714286
SVC scale:0.2 score:0.6142857142857143
SVC scale:0.3 score:0.714285714

In [35]:
# Repeated evaluation of Gaussian naive Bayes; prints every score and the averaged spread.
select_best_model(gaussian)

GaussianNB scale:0.1 score:0.7142857142857143
GaussianNB scale:0.2 score:0.6142857142857143
GaussianNB scale:0.3 score:0.7523809523809524
GaussianNB scale:0.1 score:0.7428571428571429
GaussianNB scale:0.2 score:0.6857142857142857
GaussianNB scale:0.3 score:0.6190476190476191
GaussianNB scale:0.1 score:0.6571428571428571
GaussianNB scale:0.2 score:0.6714285714285714
GaussianNB scale:0.3 score:0.6285714285714286
GaussianNB scale:0.1 score:0.7142857142857143
GaussianNB scale:0.2 score:0.7142857142857143
GaussianNB scale:0.3 score:0.6476190476190476
GaussianNB scale:0.1 score:0.7428571428571429
GaussianNB scale:0.2 score:0.6142857142857143
GaussianNB scale:0.3 score:0.7142857142857143
GaussianNB scale:0.1 score:0.6285714285714286
GaussianNB scale:0.2 score:0.7142857142857143
GaussianNB scale:0.3 score:0.6666666666666666
GaussianNB scale:0.1 score:0.7142857142857143
GaussianNB scale:0.2 score:0.6857142857142857
GaussianNB scale:0.3 score:0.6666666666666666
GaussianNB scale:0.1 score:0.62857

In [36]:
# Repeated evaluation of the decision tree; prints every score and the averaged spread.
select_best_model(tree)

DecisionTreeClassifier scale:0.1 score:0.6
DecisionTreeClassifier scale:0.2 score:0.6
DecisionTreeClassifier scale:0.3 score:0.5523809523809524
DecisionTreeClassifier scale:0.1 score:0.5142857142857142
DecisionTreeClassifier scale:0.2 score:0.6571428571428571
DecisionTreeClassifier scale:0.3 score:0.6476190476190476
DecisionTreeClassifier scale:0.1 score:0.5714285714285714
DecisionTreeClassifier scale:0.2 score:0.5857142857142857
DecisionTreeClassifier scale:0.3 score:0.580952380952381
DecisionTreeClassifier scale:0.1 score:0.6571428571428571
DecisionTreeClassifier scale:0.2 score:0.6
DecisionTreeClassifier scale:0.3 score:0.5904761904761905
DecisionTreeClassifier scale:0.1 score:0.5142857142857142
DecisionTreeClassifier scale:0.2 score:0.5
DecisionTreeClassifier scale:0.3 score:0.6285714285714286
DecisionTreeClassifier scale:0.1 score:0.6571428571428571
DecisionTreeClassifier scale:0.2 score:0.7
DecisionTreeClassifier scale:0.3 score:0.6
DecisionTreeClassifier scale:0.1 score:0.628571

In [37]:
# Compare several regularisation strengths C for logistic regression on a
# single hold-out split of the min-max-scaled features.
# The split is unseeded, so the printed scores vary from run to run.
X_train, X_test, y_train, y_test = train_test_split(m_train, target)
for i in [0.01, 0.1, 0.5, 1, 5, 10]:
    # Fit a fresh estimator for each C instead of calling set_params on the
    # shared `logistic` object, which would otherwise be left at C=10 and
    # silently change the result of re-running earlier cells.
    candidate = LogisticRegression(C=i)
    score = candidate.fit(X_train, y_train).score(X_test, y_test)
    print("C:{} score:{}".format(i, score))

C:0.01 score:0.6436781609195402
C:0.1 score:0.6436781609195402
C:0.5 score:0.6206896551724138
C:1 score:0.6091954022988506
C:5 score:0.632183908045977
C:10 score:0.632183908045977
