In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer


In [2]:
train_data = pd.read_csv("./train.csv")
test_data = pd.read_csv("./test.csv")
test_label = pd.read_csv("./gender_submission.csv")


In [3]:
#分布
print(train_data.info())
print("="*30)
print(train_data.describe())
print("="*30)
print(train_data.describe(include=['O']))
print("="*30)
print(train_data.head())
print("="*30)
print(train_data.tail())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
None
       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   20.125000    0.

In [4]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [5]:
train_data.Age.fillna(train_data.Age.mean(),inplace=True)
test_data.Age.fillna(test_data.Age.mean(),inplace=True)

train_data.Fare.fillna(train_data.Fare.mean(),inplace=True)
test_data.Fare.fillna(test_data.Fare.mean(),inplace=True)

train_data.Embarked.fillna('S',inplace=True)
test_data.Embarked.fillna('S',inplace=True)

In [6]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            891 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       891 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [7]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            418 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           418 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [31]:
#特征选择
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
# features = ['Pclass', 'Sex', 'Age']
train_label = train_data.Survived
feature_train = train_data[features]

In [32]:
#对字符oneHot编码
def index2str(dataset):
    p_dict={1:'one',2:'two',3:'three'}
    
    t = dataset.drop("Pclass",axis=1)
    t['Pclass']=dataset['Pclass'].apply(lambda x:p_dict[x])
    return t
# feature_train = index2str(feature_train)
dvec = DictVectorizer(sparse=False)
features_t = dvec.fit_transform(feature_train.to_dict(orient='record'))
feature_train.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.925,S
3,1,female,35.0,1,0,53.1,S
4,3,male,35.0,0,0,8.05,S


In [33]:
features_t.shape

(891, 10)

In [34]:
dvec.feature_names_

['Age',
 'Embarked=C',
 'Embarked=Q',
 'Embarked=S',
 'Fare',
 'Parch',
 'Pclass',
 'Sex=female',
 'Sex=male',
 'SibSp']

In [35]:
clf = DecisionTreeClassifier(criterion='entropy')
# clf = DecisionTreeClassifier()

In [36]:
clf.fit(features_t,train_label)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [37]:
print(test_data.Pclass.value_counts())
# test_data_str = index2str(test_data[features])
test_data_str = test_data[features]
test_data_str.head()

3    218
1    107
2     93
Name: Pclass, dtype: int64


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,34.5,0,0,7.8292,Q
1,3,female,47.0,1,0,7.0,S
2,2,male,62.0,0,0,9.6875,Q
3,3,male,27.0,0,0,8.6625,S
4,3,female,22.0,1,1,12.2875,S


In [38]:

print(test_data_str.shape)
print(test_data[features].shape)
test_features = dvec.transform(test_data_str.to_dict(orient='record'))
print(test_features.shape)
print(test_features)
test_predict = clf.predict(test_features)

(418, 7)
(418, 7)
(418, 10)
[[34.5         0.          1.         ...  0.          1.
   0.        ]
 [47.          0.          0.         ...  1.          0.
   1.        ]
 [62.          0.          1.         ...  0.          1.
   0.        ]
 ...
 [38.5         0.          0.         ...  0.          1.
   0.        ]
 [30.27259036  0.          0.         ...  0.          1.
   0.        ]
 [30.27259036  1.          0.         ...  0.          1.
   1.        ]]


In [39]:
test_label_do = test_label.Survived

test_label_do = np.array(test_label_do)
# clf.score(test_predict,test_label_do)
np.sum(test_predict.reshape(1,-1)==test_label_do.reshape(1,-1))/len(test_predict)

0.7679425837320574

In [40]:
clf.score(features_t,train_label)

0.9820426487093153

In [41]:
clf.score(test_features,test_label_do)

0.7679425837320574

In [42]:
np.sum(test_label.PassengerId==test_data.PassengerId)

418

In [43]:
from sklearn.model_selection import cross_val_score
cvs = cross_val_score(clf,features_t,train_label,cv=10)
print(np.mean(cvs))


0.781281920326864


In [44]:
from sklearn.metrics import classification_report
print(test_label.shape,test_predict.shape)
print(classification_report(test_label.Survived,test_predict,target_names=['died','survived']))

(418, 2) (418,)
              precision    recall  f1-score   support

        died       0.81      0.83      0.82       266
    survived       0.69      0.66      0.68       152

   micro avg       0.77      0.77      0.77       418
   macro avg       0.75      0.75      0.75       418
weighted avg       0.77      0.77      0.77       418



In [45]:
from sklearn.model_selection import GridSearchCV
param =  [{'criterion':['gini'],'max_depth': np.arange(20,50,10),'min_samples_leaf':np.arange(2,8,2),'min_impurity_decrease':np.linspace(0.1,0.9,10)},
             {'criterion':['gini','entropy']},
             {'min_impurity_decrease':np.linspace(0.1,0.9,10)}]
gscv = GridSearchCV(DecisionTreeClassifier(),param_grid=param)
gscv.fit(features_t,train_label)
print(gscv.best_params_,gscv.best_score_)



{'criterion': 'gini', 'max_depth': 20, 'min_impurity_decrease': 0.1, 'min_samples_leaf': 2} 0.7867564534231201


In [46]:
#使用KNN试一试
from sklearn.neighbors import KNeighborsClassifier
knc = KNeighborsClassifier(5)
knc.fit(features_t,train_label)


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [47]:
knc_pred = knc.predict(test_features)

In [48]:
np.sum(knc_pred==test_label_do)/len(test_label_do)

0.7177033492822966

In [50]:
knc_param = [{"n_neighbors":range(3,10),"weights":["uniform","distance"]}]
kncgscv = GridSearchCV(KNeighborsClassifier(3),param_grid=knc_param)
kncgscv.fit(features_t,train_label)
print(kncgscv.best_params_,kncgscv.best_score_)



{'n_neighbors': 6, 'weights': 'distance'} 0.7182940516273849
