In [58]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, LabelBinarizer, OrdinalEncoder
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score
from sklearn.base import TransformerMixin, BaseEstimator        

In [5]:
# Load the training set into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the cell only runs
# on a machine where this exact file exists.
df_train = pd.read_csv(r'C:/Users/18315/Desktop/模式识别/data/train.csv')


In [8]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [10]:
# Summary statistics (count, mean, std, quartiles) of the training frame.
df_train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [11]:
# Count the missing values in each column.
df_train.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

### 特征选择：先排除 Cabin（缺失值太多，891 条中缺失 687 条）以及 PassengerId、Name、Ticket（对分类意义不大）

In [16]:
# Columns kept as candidate model inputs; 'Survived' is the prediction target.
feature_list = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare','Embarked']

# Split the training frame into the feature columns and the label column.
df_train_label = df_train['Survived']
df_train_data = df_train[feature_list]

In [22]:
class MyLabelBinarizer(TransformerMixin):
    """Transformer wrapper that delegates to an internal ``LabelBinarizer``.

    Every constructor argument is forwarded unchanged to ``LabelBinarizer``.
    ``fit`` and ``transform`` accept an extra ``y`` argument, which is ignored.
    """

    def __init__(self, *args, **kwargs):
        # The wrapped encoder does the actual work in fit/transform.
        self.encoder = LabelBinarizer(*args, **kwargs)

    def fit(self, x, y=0):
        """Fit the wrapped encoder on ``x`` and return this wrapper."""
        wrapped = self.encoder
        wrapped.fit(x)
        return self

    def transform(self, x, y=0):
        """Return ``x`` as transformed by the wrapped encoder."""
        binarized = self.encoder.transform(x)
        return binarized


class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that picks columns out of a DataFrame.

    Parameters
    ----------
    attribute_name
        Key used to index the incoming frame (``x[attribute_name]``), e.g. a
        list of column names.
    """

    def __init__(self, attribute_name):
        self.attribute_name = attribute_name

    def fit(self, x, y=None):
        """Nothing is learned from the data; return the selector itself."""
        return self

    def transform(self, x):
        """Return the ``.values`` of the selected part of ``x``."""
        selected = x[self.attribute_name]
        return selected.values


### 数据预处理

In [66]:
# Only the categorical columns feed the model. The discrete (SibSp, Parch) and
# continuous (Age, Fare) pipelines previously kept here as commented-out code
# were never part of full_pipeline and have been removed.
cat_attribs = ['Pclass', 'Sex']

# Select the categorical columns, fill missing entries with the most frequent
# value of each column, then one-hot encode them.
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # handle_unknown='ignore': a category not seen during fit is encoded as
    # all zeros at transform time instead of raising an error.
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# FeatureUnion is kept so further feature pipelines can be added side by side.
full_pipeline = FeatureUnion(
    transformer_list=[
        ('cat_pipeline', cat_pipeline),
    ]
)

# Fit the preprocessing on the training features and encode them.
train_x_cleaned = full_pipeline.fit_transform(df_train_data)
train_y = df_train_label

### 多项式朴素贝叶斯（MultinomialNB）

In [67]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB
# Fit a multinomial naive Bayes classifier on the preprocessed training features.
clf = MultinomialNB()
clf.fit(train_x_cleaned, train_y)

MultinomialNB()

In [69]:
# Accuracy of the classifier's predictions for train_x_cleaned against train_y.
# NOTE(review): these are the training rows, so this is training accuracy and
# not an estimate of performance on unseen data.
accuracy_score(train_y, clf.predict(train_x_cleaned))

0.7867564534231201

In [75]:
# Load the test set (no 'Survived' column; it is predicted below).
test_data = pd.read_csv(r'C:/Users/18315/Desktop/模式识别/data/test.csv')

# Apply the preprocessing that was already fitted on the training data.
# Calling fit_transform here would re-learn the imputer fill values and the
# one-hot categories from the test set, so the encoded columns could stop
# matching the ones the classifier was trained on.
test_x_cleaned = full_pipeline.transform(test_data)

In [76]:
# Predict a label for every test row and store it as an integer column.
predicted = clf.predict(test_x_cleaned)
test_data['Survived'] = predicted.astype(int)

# Write only the id and the predicted label, comma-separated, without the index.
submission = test_data[['PassengerId', 'Survived']]
submission.to_csv('C:/Users/18315/Desktop/模式识别/data/submission.csv', sep=',', index=False)

### 相关性分析（各数值特征与 Survived 的相关系数，参考价值有限）

In [19]:
# Keep only the numeric columns and compute their pairwise correlations.
numeric_features = df_train.select_dtypes(include=[np.number])
corr = numeric_features.corr()

# Correlation of every numeric column with 'Survived', highest first.
survived_corr = corr['Survived'].sort_values(ascending=False)

# Five highest correlations (includes 'Survived' with itself).
print (survived_corr.head(5), '\n')

# Five lowest correlations.
print (survived_corr.tail(5))


Survived       1.000000
Fare           0.257307
Parch          0.081629
PassengerId   -0.005007
SibSp         -0.035322
Name: Survived, dtype: float64 

Parch          0.081629
PassengerId   -0.005007
SibSp         -0.035322
Age           -0.077221
Pclass        -0.338481
Name: Survived, dtype: float64
