In [67]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn import preprocessing as p

data_source = "https://archive.ics.uci.edu/ml/machine-learning-databases/nursery/nursery.data"

names =  ["parents", "has_nurs", "form", "children" ,"housing", "finance", "social", "health", "class"]
dataframe = pd.read_csv(data_source, names=names)

for column in range(len(names)):
    dataframe[names[column]] = dataframe[names[column]].astype("category")

y_data = dataframe.pop("class")

real_x_data = dataframe
real_x_data.insert(0, "bias", 1)

In [68]:
def coefficient(column):
    feature = real_x_data[column].unique()
    feature_val = real_x_data[column]
    y_val = y_data.unique()

    count = [[0 for j in range(len(y_val))] for i in range(len(feature))]

    for (x,y) in zip(feature_val,y_data):
        count[np.where(feature==x)[0][0]][np.where(y_val==y)[0][0]] += 1
    
    return np.array(count)


for i in ["parents", "has_nurs", "form", "children" ,"housing", "finance", "social", "health"]:
    print(i, ": \n", coefficient(i))

parents : 
 [[   2 1924 1440  196  758]
 [   0 1484 1440  132 1264]
 [   0  858 1440    0 2022]]
has_nurs : 
 [[   2 1344  864  130  252]
 [   0 1344  864  132  252]
 [   0  904  864   66  758]
 [   0  464  864    0 1264]
 [   0  210  864    0 1518]]
form : 
 [[   2 1152 1080  118  888]
 [   0 1092 1080  100  968]
 [   0 1038 1080   70 1052]
 [   0  984 1080   40 1136]]
children : 
 [[   2 1206 1080  148  804]
 [   0 1092 1080  100  968]
 [   0  984 1080   40 1136]
 [   0  984 1080   40 1136]]
housing : 
 [[   2 1618 1440  208 1052]
 [   0 1396 1440  100 1384]
 [   0 1252 1440   20 1608]]
finance : 
 [[   2 2244 2160  218 1856]
 [   0 2022 2160  110 2188]]
social : 
 [[   1 1515 1440  164 1200]
 [   1 1515 1440  164 1200]
 [   0 1236 1440    0 1644]]
health : 
 [[   2 2412    0  328 1578]
 [   0 1854    0    0 2466]
 [   0    0 4320    0    0]]


In [69]:
##변수내의 catagory를 합친다. 

change_x_data = dataframe.copy(deep=True)

change_x_data["has_nurs"][change_x_data["has_nurs"]=='proper']='less_proper'
change_x_data["children"][change_x_data["children"]=='3']='more'
change_x_data["social"][change_x_data["social"]=='slightly_prob']='nonprob'

In [70]:
real_x_data = pd.get_dummies(real_x_data).as_matrix()
change_x_data = pd.get_dummies(change_x_data).as_matrix()

In [71]:
from sklearn.cross_validation import KFold,ShuffleSplit
from sklearn import linear_model

cv  = ShuffleSplit(len(y_data), n_iter=10, test_size=0.4, random_state=0)

real_total=0
change_total=0

for train_index, test_index in cv :
    real_x_train, real_x_test = real_x_data[train_index], real_x_data[test_index]
    change_x_train, change_x_test = change_x_data[train_index], change_x_data[test_index]
    y_train, y_test = y_data[train_index], y_data[test_index]
    
    logreg = linear_model.LogisticRegression(multi_class='multinomial', fit_intercept=True, solver="lbfgs")

    logreg.fit(real_x_train, y_train)
    logreg.fit(change_x_train, y_train)

    real_total +=(sum(logreg.predict(real_x_test) == y_test.ravel()) / len(y_test))
    change_total += (sum(logreg.predict(change_x_test) == y_test.ravel()) / len(y_test))
     
print("real_average : " , real_total/10)
print("change_average : ",change_total/10)



real_average :  0.867226080247
change_average :  0.927662037037


In [72]:
from sklearn import tree

clf = tree.DecisionTreeClassifier()
clf = clf.fit(change_x_train, y_train)

In [66]:
print("DecisionTree : " ,sum(clf.predict(change_x_test) == y_test.ravel()) / len(y_test))

DecisionTree :  0.996720679012
