# Titanic: Machine Learning from Disaster

# 案例介绍
    泰坦尼克号的沉没是历史上伤亡人数最多、影响最深远的沉船事件之一。1912年4月15日，泰坦尼克号与冰山相撞，2224名乘客和船员中有1502人丧生。这场轰动性的悲剧震惊了国际社会，但也因此产生了更好的船舶安全法规。这次海难造成人员伤亡的原因之一是没有足够的救生艇供乘客和船员使用。
    
    虽然在沉船中幸存下来有一些运气因素，但有些人比其他人更可能存活下来。在本次学习中，我们将基于给定的部分乘客信息（包含其是否存活），应用Python来分析哪些乘客更可能在这场悲剧中存活。此次，棕榈学院将携手Yiyu导师，为各位想学习Python、想要在数据行业继续发展的同学讲授如何完成这样一个project，相信会对你们数据分析技能的提升大有裨益。

# 第四节课

In [332]:
#从sklearn中调用逻辑回归、决策树，随机森林包
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [333]:
# Instantiate one estimator of each type with default settings, store them in
# logr, dtree and rf, and collect the three in the `models` list.
logr = LogisticRegression()
dtree = DecisionTreeClassifier()
rf = RandomForestClassifier()
models = [logr,dtree,rf]

In [334]:
# Fit the logistic regression model on the training data (xTrain, yTrain).
# Only the training set is used here; the test set is not part of this step.
logr.fit(xTrain,yTrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [335]:
# Use the fitted model to predict the target from the test features (xTest)
# and from the training features (xTrain).
y_pred_test = logr.predict(xTest)
y_pred_train = logr.predict(xTrain)

In [336]:
# Test-set accuracy: the mean of the element-wise "prediction == true label"
# comparison, i.e. the fraction of test rows predicted correctly.
np.mean(y_pred_test == yTest)

0.7821229050279329

In [337]:
# Training-set accuracy, computed the same way on the training predictions.
np.mean(y_pred_train == yTrain)

0.824438202247191

In [338]:
# For every estimator in `models`: print it, fit it on the training data, then
# print its accuracy on the training set and on the test set.
for model in models:
    print ('\nThe current model is', model)
    model.fit(xTrain, yTrain)
    print ('\nTraining accuracy is',np.mean(model.predict(xTrain) == yTrain))
    print ('\nTesting accuracy is',np.mean(model.predict(xTest) == yTest))


The current model is LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

Training accuracy is 0.824438202247191

Testing accuracy is 0.7821229050279329

The current model is DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

Training accuracy is 0.9873595505617978

Testing accuracy is 0.7877094972067039

The current model is RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            m

In [339]:
# Repeat the comparison on the second dataset (x2Train/y2Train, x2Test/y2Test).
# The same estimator objects from `models` are fitted again here, this time on
# the second training set, before their train/test accuracy is printed.
for model in models:
    print ('\nThe current model is', model)
    model.fit(x2Train, y2Train)
    print ('\nTraining accuracy is',np.mean(model.predict(x2Train) == y2Train))
    print ('\nTesting accuracy is',np.mean(model.predict(x2Test) == y2Test))


The current model is LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

Training accuracy is 0.8300561797752809

Testing accuracy is 0.8156424581005587

The current model is DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

Training accuracy is 0.9873595505617978

Testing accuracy is 0.7821229050279329

The current model is RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            

## 交叉验证

In [340]:
#交叉验证是机器学习领域常用的验证模型是否优秀的方法；简而言之就是把数据切分成几个部分然后在训练集和测试集中交换使用
#比如这次在训练集中用到的数据，下一次会放进测试集来使用，因此被称为 交叉
#简单的交叉验证会直接按百分比来划分训练集和测试集，更难一些的方法是K-Fold，我们会使用这个方法来进行交叉验证
#K-Fold其实就是把数据集切成K份，每一份轮流做一次验证集，其余的K-1份数据作为训练集，这样一共会训练得到K个模型
#
#从sklearn中调用k次交叉验证包
from sklearn.model_selection import KFold

In [359]:
# Define the k-fold cross-validation helper.
def CVKFold(k, X, y, Model):
    """Run shuffled k-fold cross-validation and report per-fold accuracy.

    For each of the ``k`` splits a fresh classifier is fitted on the training
    part and scored (plain accuracy) on both the training part and the
    held-out part.

    Parameters
    ----------
    k : int
        Number of folds, passed to ``KFold(n_splits=k)``.
    X, y :
        Features and target. Both are sliced with ``.iloc`` using the
        positional indices produced by ``KFold.split``, so they must support
        ``.iloc`` (e.g. pandas objects).
    Model : str
        ``"Logit"`` for ``LogisticRegression(random_state=0)`` or
        ``"RForest"`` for ``RandomForestClassifier(random_state=0)``.

    Returns
    -------
    (train_accuracy, test_accuracy) : tuple of list
        Two lists with one accuracy value per fold, in split order.
        Both lists are also printed before returning.

    Raises
    ------
    ValueError
        If ``Model`` is not one of the supported names. (Previously this
        surfaced as an unbound-variable error inside the fold loop.)
    """
    # Reject unsupported model names before doing any work.
    if Model not in ("Logit", "RForest"):
        raise ValueError(
            "Unknown Model %r; expected 'Logit' or 'RForest'" % (Model,)
        )

    # Seed NumPy's global RNG so the shuffled fold assignment is reproducible
    # from call to call. NOTE: this resets global RNG state for the caller too.
    np.random.seed(1)

    # The estimator type does not change between folds, so choose it once.
    if Model == "Logit":
        estimator_cls = LogisticRegression
    else:
        estimator_cls = RandomForestClassifier

    # Per-fold accuracy scores, appended in split order.
    train_accuracy = []
    test_accuracy = []

    kf = KFold(n_splits=k, shuffle=True)

    for train_index, test_index in kf.split(X):
        # KFold yields positional indices, hence .iloc.
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # A new, unfitted estimator for every fold.
        clf = estimator_cls(random_state=0)
        clf = clf.fit(X_train, y_train)

        y_train_pred = clf.predict(X_train)
        y_test_pred = clf.predict(X_test)

        # Accuracy = fraction of predictions equal to the true label.
        train_accuracy.append(np.mean(y_train_pred == y_train))
        test_accuracy.append(np.mean(y_test_pred == y_test))

    print(train_accuracy)
    print(test_accuracy)
    return train_accuracy, test_accuracy

In [362]:
# 10-fold cross-validation with the logistic regression model: the data is
# split into 10 folds; each fold is held out once for evaluation while the
# remaining 9 folds are used for training.
train_acc,test_acc = CVKFold(10,all_x,y,"Logit")

[0.8202247191011236, 0.8017456359102244, 0.8154613466334164, 0.8216957605985037, 0.8054862842892768, 0.8104738154613467, 0.8042394014962594, 0.8104738154613467, 0.8092269326683291, 0.816708229426434]
[0.7555555555555555, 0.8539325842696629, 0.7752808988764045, 0.7415730337078652, 0.8539325842696629, 0.7865168539325843, 0.8876404494382022, 0.797752808988764, 0.8314606741573034, 0.797752808988764]


In [363]:
# Average the per-fold accuracies: (mean training accuracy, mean held-out accuracy).
np.mean(train_acc),np.mean(test_acc)

(0.811573594104626, 0.8081398252184769)

In [364]:
# 10-fold cross-validation with the random forest model: each of the 10 folds
# is held out once for evaluation while the remaining 9 folds are used for
# training.
train_acc,test_acc = CVKFold(10,all_x,y,"RForest")

[0.9650436953807741, 0.9688279301745636, 0.9763092269326683, 0.9763092269326683, 0.972568578553616, 0.9738154613466334, 0.9713216957605985, 0.972568578553616, 0.9750623441396509, 0.9738154613466334]
[0.7555555555555555, 0.7865168539325843, 0.7191011235955056, 0.7865168539325843, 0.8426966292134831, 0.8089887640449438, 0.8314606741573034, 0.8202247191011236, 0.8876404494382022, 0.797752808988764]
