In [1]:
import re
import numpy as np
import pandas as pd
import random as rd
from sklearn import preprocessing
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA

#打印设置，原始的threshold=10000,linewidth=160
np.set_printoptions(precision=4, threshold=10000, linewidth=160, edgeitems=999, suppress=True)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 160)
pd.set_option('expand_frame_repr', False)
pd.set_option('precision', 4)


#构建二元特征
def process_embarked():
    global df_titanic_data

    #用众数代替缺失值
    df_titanic_data.Embarked[df_titanic_data.Embarked.isnull()] = df_titanic_data.Embarked.dropna().mode().values

    #将值转换为数字
    df_titanic_data['Embarked'] = pd.factorize(df_titanic_data['Embarked'])[0]

    #对构造的特征进行二值化
    if keep_binary:
        df_titanic_data = pd.concat([df_titanic_data, pd.get_dummies(df_titanic_data['Embarked']).rename(
            columns=lambda x: 'Embarked_' + str(x))], axis=1)



#定义一个辅助函数，可以使用RandomForestClassifier（随机森林分类器）来处理age变量的缺失值
def set_missing_ages():
    global df_titanic_data

    age_data = df_titanic_data[
        ['Age', 'Embarked', 'Fare', 'Parch', 'SibSp', 'Title_id', 'Pclass', 'Names', 'CabinLetter']]
    input_values_RF = age_data.loc[(df_titanic_data.Age.notnull())].values[:, 1::]
    target_values_RF = age_data.loc[(df_titanic_data.Age.notnull())].values[:, 0]

    #从sklearn的随机森林回归函数创建对象
    regressor = RandomForestRegressor(n_estimators=2000, n_jobs=-1)

    #根据上面的输入值和目标值构建模型
    regressor.fit(input_values_RF, target_values_RF)

    #使用训练的模型来预测缺失值
    predicted_ages = regressor.predict(age_data.loc[(df_titanic_data.Age.isnull())].values[:, 1::])

    #在原始的泰坦数据帧中填充预测的年龄
    #(此处源代码age_data.loc[(age_data.Age.isnull()), 'Age'] = predicted_ages存在错误，导致运行结果出现空值，现在已经改正））
    
    df_titanic_data.loc[(df_titanic_data.Age.isnull()), 'Age'] = predicted_ages
    

#辅助函数用于从年龄变量构造特征
def process_age():
    global df_titanic_data

    #调用set_missing_ages辅助函数以使用随机森林回归来预测缺失的年龄值
    set_missing_ages()

    #通过以单位方差为中心围绕均值来缩放年龄变量
    #(下面这段是执行命令，源代码将此处设为注释，此处已经改动)
    if keep_scaled:
        scaler_preprocessing = preprocessing.StandardScaler()
        df_titanic_data['Age_scaled'] = scaler_preprocessing.fit_transform(df_titanic_data.Age.reshape(-1, 1))

    #为儿童构建一个特征
    df_titanic_data['isChild'] = np.where(df_titanic_data.Age < 13, 1, 0)

    #分成四分位数并创建二进制特征
    df_titanic_data['Age_bin'] = pd.qcut(df_titanic_data['Age'], 4)

    if keep_binary:
        df_titanic_data = pd.concat(
            [df_titanic_data, pd.get_dummies(df_titanic_data['Age_bin']).rename(columns=lambda y: 'Age_' + str(y))],
            axis=1)

    if keep_bins:
        df_titanic_data['Age_bin_id'] = pd.factorize(df_titanic_data['Age_bin'])[0] + 1

    if keep_bins and keep_scaled:
        scaler_processing = preprocessing.StandardScaler()
        df_titanic_data['Age_bin_id_scaled'] = scaler_processing.fit_transform(
            df_titanic_data.Age_bin_id.reshape(-1, 1))
      
    
    if not keep_strings:
        df_titanic_data.drop('Age_bin', axis=1, inplace=True)


#辅助函数，用于构建乘客/机组人员姓名的特征
def process_name():
    global df_titanic_data

    #在names变量中获取不同的名称
    df_titanic_data['Names'] = df_titanic_data['Name'].map(lambda y: len(re.split(' ', y)))

    #获得每个人的头衔
    df_titanic_data['Title'] = df_titanic_data['Name'].map(lambda y: re.compile(", (.*?)\.").findall(y)[0])

    #处理出现次数少的头衔
    df_titanic_data['Title'][df_titanic_data.Title == 'Jonkheer'] = 'Master'
    df_titanic_data['Title'][df_titanic_data.Title.isin(['Ms', 'Mlle'])] = 'Miss'
    df_titanic_data['Title'][df_titanic_data.Title == 'Mme'] = 'Mrs'
    df_titanic_data['Title'][df_titanic_data.Title.isin(['Capt', 'Don', 'Major', 'Col', 'Sir'])] = 'Sir'
    df_titanic_data['Title'][df_titanic_data.Title.isin(['Dona', 'Lady', 'the Countess'])] = 'Lady'

    #使所有特征二值化
    if keep_binary:
        df_titanic_data = pd.concat(
            [df_titanic_data, pd.get_dummies(df_titanic_data['Title']).rename(columns=lambda x: 'Title_' + str(x))],
            axis=1)

    #缩放
    if keep_scaled:
        scaler_preprocessing = preprocessing.StandardScaler()
        df_titanic_data['Names_scaled'] = scaler_preprocessing.fit_transform(df_titanic_data.Names.reshape(-1, 1))

    #分级
    if keep_bins:
        df_titanic_data['Title_id'] = pd.factorize(df_titanic_data['Title'])[0] + 1

    if keep_bins and keep_scaled:
        scaler = preprocessing.StandardScaler()
        df_titanic_data['Title_id_scaled'] = scaler.fit_transform(df_titanic_data.Title_id.reshape(-1, 1))



#从客舱输入变量产生特征
def process_cabin():
#指的是包含泰坦尼克例子的全局变量
    global df_titanic_data

    #用U0来替换客舱变量的缺失值
    df_titanic_data['Cabin'][df_titanic_data.Cabin.isnull()] = 'U0'

    #客舱编号是一系列字母数字，因此我们将创建一些特征
    #来自客舱编号字母的一部分
    df_titanic_data['CabinLetter'] = df_titanic_data['Cabin'].map(lambda l: get_cabin_letter(l))
    df_titanic_data['CabinLetter'] = pd.factorize(df_titanic_data['CabinLetter'])[0]

    #将客舱字母特征二值化
    if keep_binary:
        cletters = pd.get_dummies(df_titanic_data['CabinLetter']).rename(columns=lambda x: 'CabinLetter_' + str(x))
        df_titanic_data = pd.concat([df_titanic_data, cletters], axis=1)

    #从客舱的数字侧创建特征
    df_titanic_data['CabinNumber'] = df_titanic_data['Cabin'].map(lambda x: get_cabin_num(x)).astype(int) + 1

    #缩放特征
    if keep_scaled:
        scaler_processing = preprocessing.StandardScaler()
        df_titanic_data['CabinNumber_scaled'] = scaler_processing.fit_transform(df_titanic_data.CabinNumber.reshape(-1, 1))


def get_cabin_letter(cabin_value):
#搜索客舱字母数字值中的字母
    letter_match = re.compile("([a-zA-Z]+)").search(cabin_value)
    
    if letter_match:
        return letter_match.group()
    else:
        return 'U'


def get_cabin_num(cabin_value):
#搜索客舱字母数字值中的数字
    number_match = re.compile("([0-9]+)").search(cabin_value)

    if number_match:
        return number_match.group()
    else:
        return 0


#用于从票价变量构造特征的辅助函数
def process_fare():
    global df_titanic_data

    #用票价的中位数来替换缺失值
    df_titanic_data['Fare'][np.isnan(df_titanic_data['Fare'])] = df_titanic_data['Fare'].median()

    #票价中的0会导致一些分区问题，因此我们将它们设置为最低票价的十分之一
    df_titanic_data['Fare'][np.where(df_titanic_data['Fare'] == 0)[0]] = df_titanic_data['Fare'][
                                                                             df_titanic_data['Fare'].nonzero()[
                                                                                 0]].min() / 10

    #通过将特征分类为分位数来对特征进行二值化
    df_titanic_data['Fare_bin'] = pd.qcut(df_titanic_data['Fare'], 4)

    if keep_binary:
        df_titanic_data = pd.concat(
            [df_titanic_data, pd.get_dummies(df_titanic_data['Fare_bin']).rename(columns=lambda x: 'Fare_' + str(x))],
            axis=1)

    #分级
    if keep_bins:
        df_titanic_data['Fare_bin_id'] = pd.factorize(df_titanic_data['Fare_bin'])[0] + 1

    #缩放值
    if keep_scaled:
        scaler_processing = preprocessing.StandardScaler()
        df_titanic_data['Fare_scaled'] = scaler_processing.fit_transform(df_titanic_data.Fare.reshape(-1, 1))

    if keep_bins and keep_scaled:
        scaler_processing = preprocessing.StandardScaler()
        df_titanic_data['Fare_bin_id_scaled'] = scaler_processing.fit_transform(
            df_titanic_data.Fare_bin_id.reshape(-1, 1))

    if not keep_strings:
        df_titanic_data.drop('Fare_bin', axis=1, inplace=True)


#辅助函数用于从票证变量中构建特征
def process_ticket():
    global df_titanic_data

    df_titanic_data['TicketPrefix'] = df_titanic_data['Ticket'].map(lambda y: get_ticket_prefix(y.upper()))
    df_titanic_data['TicketPrefix'] = df_titanic_data['TicketPrefix'].map(lambda y: re.sub('[\.?\/?]', '', y))
    df_titanic_data['TicketPrefix'] = df_titanic_data['TicketPrefix'].map(lambda y: re.sub('STON', 'SOTON', y))

    df_titanic_data['TicketPrefixId'] = pd.factorize(df_titanic_data['TicketPrefix'])[0]

    #每个票证层进行二值化
    if keep_binary:
        prefixes = pd.get_dummies(df_titanic_data['TicketPrefix']).rename(columns=lambda y: 'TicketPrefix_' + str(y))
        df_titanic_data = pd.concat([df_titanic_data, prefixes], axis=1)

    df_titanic_data.drop(['TicketPrefix'], axis=1, inplace=True)

    df_titanic_data['TicketNumber'] = df_titanic_data['Ticket'].map(lambda y: get_ticket_num(y))
    df_titanic_data['TicketNumberDigits'] = df_titanic_data['TicketNumber'].map(lambda y: len(y)).astype(np.int)
    df_titanic_data['TicketNumberStart'] = df_titanic_data['TicketNumber'].map(lambda y: y[0:1]).astype(np.int)

    df_titanic_data['TicketNumber'] = df_titanic_data.TicketNumber.astype(np.int)

    if keep_scaled:
        scaler_processing = preprocessing.StandardScaler()
        df_titanic_data['TicketNumber_scaled'] = scaler_processing.fit_transform(
            df_titanic_data.TicketNumber.reshape(-1, 1))


def get_ticket_prefix(ticket_value):
#搜索票证字母数字值中的字母
    match_letter = re.compile("([a-zA-Z\.\/]+)").search(ticket_value)
    if match_letter:
        return match_letter.group()
    else:
        return 'U'


def get_ticket_num(ticket_value):
#搜索票证字母数字值中的数字

    match_number = re.compile("([\d]+$)").search(ticket_value)
    if match_number:
        return match_number.group()
    else:
        return '0'


    #从乘客类变量中提取特征
def process_PClass():
    global df_titanic_data

    #用众数来替换缺失值
    df_titanic_data.Pclass[df_titanic_data.Pclass.isnull()] = df_titanic_data.Pclass.dropna().mode().values

    #对特征进行二值化
    if keep_binary:
        df_titanic_data = pd.concat(
            [df_titanic_data, pd.get_dummies(df_titanic_data['Pclass']).rename(columns=lambda y: 'Pclass_' + str(y))],
            axis=1)

    if keep_scaled:
        scaler_preprocessing = preprocessing.StandardScaler()
        df_titanic_data['Pclass_scaled'] = scaler_preprocessing.fit_transform(df_titanic_data.Pclass.reshape(-1, 1))


    #基于SibSp和Parch等家庭变量构建特征
def process_family():
    global df_titanic_data

    #确保在使用交互变量中没有零
    df_titanic_data['SibSp'] = df_titanic_data['SibSp'] + 1
    df_titanic_data['Parch'] = df_titanic_data['Parch'] + 1

    #缩放
    if keep_scaled:
        scaler_preprocessing = preprocessing.StandardScaler()
        df_titanic_data['SibSp_scaled'] = scaler_preprocessing.fit_transform(df_titanic_data.SibSp.reshape(-1, 1))
        df_titanic_data['Parch_scaled'] = scaler_preprocessing.fit_transform(df_titanic_data.Parch.reshape(-1, 1))

    #对所有特征进行二值化
    if keep_binary:
        sibsps_var = pd.get_dummies(df_titanic_data['SibSp']).rename(columns=lambda y: 'SibSp_' + str(y))
        parchs_var = pd.get_dummies(df_titanic_data['Parch']).rename(columns=lambda y: 'Parch_' + str(y))
        df_titanic_data = pd.concat([df_titanic_data, sibsps_var, parchs_var], axis=1)


   #将性别变量二值化
def process_sex():
    global df_titanic_data
    df_titanic_data['Gender'] = np.where(df_titanic_data['Sex'] == 'male', 1, 0)


   #删除掉原始的变量
def process_drops():
    global df_titanic_data
    drops = ['Name', 'Names', 'Title', 'Sex', 'SibSp', 'Parch', 'Pclass', 'Embarked', \
             'Cabin', 'CabinLetter', 'CabinNumber', 'Age', 'Fare', 'Ticket', 'TicketNumber']
    string_drops = ['Title', 'Name', 'Cabin', 'Ticket', 'Sex', 'Ticket', 'TicketNumber']
    if not keep_raw:
        df_titanic_data.drop(drops, axis=1, inplace=True)
    elif not keep_strings:
        df_titanic_data.drop(string_drops, axis=1, inplace=True)


    #处理所有特征工程
def get_titanic_dataset(binary=False, bins=False, scaled=False, strings=False, raw=True, pca=False, balanced=False):
    global keep_binary, keep_bins, keep_scaled, keep_raw, keep_strings, df_titanic_data
    keep_binary = binary
    keep_bins = bins
    keep_scaled = scaled
    keep_raw = raw
    keep_strings = strings

    #用Pandas读取训练集与测试集
    train_data = pd.read_csv('data/train.csv', header=0)
    test_data = pd.read_csv('data/test.csv', header=0)

    #将训练集和测试集连接在一起，以完成整体特征工程
    df_titanic_data = pd.concat([train_data, test_data])

    #通过重新索引数据来移除训练集和测试集所导致的重复索引
    df_titanic_data.reset_index(inplace=True)

    #删除reset_index（）函数生成的索引列
    df_titanic_data.drop('index', axis=1, inplace=True)

    #将列索引为从1开始的索引
    df_titanic_data = df_titanic_data.reindex_axis(train_data.columns, axis=1)

    #使用我们在上面定义的辅助函数处理titanic原始变量
    process_cabin()
    process_ticket()
    process_name()
    process_fare()
    process_embarked()
    process_family()
    process_sex()
    process_PClass()
    process_age()
    process_drops()

    #将servived列移动到第一个列
    columns_list = list(df_titanic_data.columns.values)
    columns_list.remove('Survived')
    new_col_list = list(['Survived'])
    new_col_list.extend(columns_list)
    df_titanic_data = df_titanic_data.reindex(columns=new_col_list)
    

    print("Starting with", df_titanic_data.columns.size,
          "manually constructing features based on the interaction between them...\n", df_titanic_data.columns.values)

    #基于个体特征相互作用的特征构造
    numeric_features = df_titanic_data.loc[:,
                       ['Age_scaled', 'Fare_scaled', 'Pclass_scaled', 'Parch_scaled', 'SibSp_scaled',
                        'Names_scaled', 'CabinNumber_scaled', 'Age_bin_id_scaled', 'Fare_bin_id_scaled']]
    print("\nUsing only numeric features for automated feature generation:\n", numeric_features.head(10))

    new_fields_count = 0
    for i in range(0, numeric_features.columns.size - 1):
        for j in range(0, numeric_features.columns.size - 1):
            if i <= j:
                name = str(numeric_features.columns.values[i]) + "*" + str(numeric_features.columns.values[j])
                df_titanic_data = pd.concat(
                    [df_titanic_data, pd.Series(numeric_features.iloc[:, i] * numeric_features.iloc[:, j], name=name)],
                    axis=1)
                new_fields_count += 1
            if i < j:
                name = str(numeric_features.columns.values[i]) + "+" + str(numeric_features.columns.values[j])
                df_titanic_data = pd.concat(
                    [df_titanic_data, pd.Series(numeric_features.iloc[:, i] + numeric_features.iloc[:, j], name=name)],
                    axis=1)
                new_fields_count += 1
            if not i == j:
                name = str(numeric_features.columns.values[i]) + "/" + str(numeric_features.columns.values[j])
                df_titanic_data = pd.concat(
                    [df_titanic_data, pd.Series(numeric_features.iloc[:, i] / numeric_features.iloc[:, j], name=name)],
                    axis=1)
                name = str(numeric_features.columns.values[i]) + "-" + str(numeric_features.columns.values[j])
                df_titanic_data = pd.concat(
                    [df_titanic_data, pd.Series(numeric_features.iloc[:, i] - numeric_features.iloc[:, j], name=name)],
                    axis=1)
                new_fields_count += 2

    print("\n", new_fields_count, "new features constructed")

    #使用Spearman相关方法去除具有高相关性的特征

    #计算相关矩阵
    df_titanic_data_cor = df_titanic_data.drop(['Survived', 'PassengerId'], axis=1).corr(method='spearman')

    #创建一个能忽略相关的掩码
    mask_ignore = np.ones(df_titanic_data_cor.columns.size) - np.eye(df_titanic_data_cor.columns.size)
    df_titanic_data_cor = mask_ignore * df_titanic_data_cor

    features_to_drop = []

    #删除相关的特征
    for column in df_titanic_data_cor.columns.values:

        #检查我们是否已决定删除此变量
        if np.in1d([column], features_to_drop):
            continue

        #找到高度相关的变量
        corr_vars = df_titanic_data_cor[abs(df_titanic_data_cor[column]) > 0.98].index
        features_to_drop = np.union1d(features_to_drop, corr_vars)

    print("\nWe are going to drop", features_to_drop.shape[0], " which are highly correlated features...\n")
    df_titanic_data.drop(features_to_drop, axis=1, inplace=True)

    #拆分数据集以进行训练和测试并进行PCA
    #(原始代码)train_data = df_titanic_data[:train_data.shape[0]]
    #（原始代码）test_data = df_titanic_data[test_data.shape[0]:]
    train_data = df_titanic_data[:train_data.shape[0]]
    test_data = df_titanic_data[train_data.shape [0]:]


    if pca:
        print("reducing number of variables...")
        train_data, test_data = reduce(train_data, test_data)
    else:
        #删除在集合连接期间创建的测试集的空“Survived”列
        test_data.drop('Survived', axis=1, inplace=True)

    print("\n", train_data.columns.size, "initial features generated...\n")  # , input_df.columns.values

    return train_data, test_data

   #将训练集和测试集降维
    
def reduce(train_data, test_data):
    #将全部数据加到一起
    df_titanic_data = pd.concat([train_data, test_data])
    df_titanic_data.reset_index(inplace=True)
    df_titanic_data.drop('index', axis=1, inplace=True)
    df_titanic_data = df_titanic_data.reindex_axis(train_data.columns, axis=1)

    #将survived 这一列转化格式为series
    #(此处 survived_series = pd.Series(df['Survived'], name='Survived')存在错误，已经改正）
    survived_series = pd.Series(df_titanic_data['Survived'], name='Survived')

    print(df_titanic_data.head())

    #获取输入和目标值
    input_values = df_titanic_data.values[:, 1::]
    target_values = df_titanic_data.values[:, 0]

    print(input_values[0:10])

    #减少到的变量数量应涵盖的最小方差百分比（即为主成分的累积贡献率）
    variance_percentage = .99

    #创建PCA对象
    pca_object = PCA(n_components=variance_percentage)

    #特征转
    input_values_transformed = pca_object.fit_transform(input_values, target_values)

    #用PCA为变换后的变量创建数据帧
    pca_df = pd.DataFrame(input_values_transformed)

    print(pca_df.shape[1], " reduced components which describe ", str(variance_percentage)[1:], "% of the variance")

    #构建一个包含新减少的PCA变量的新数据帧
    df_titanic_data = pd.concat([survived_series, pca_df], axis=1)

    #再次分成单独的输入和测试集
    train_data = df_titanic_data[:train_data.shape[0]]
    test_data = df_titanic_data[train_data.shape[0]:]
    test_data.reset_index(inplace=True)
    test_data.drop('index', axis=1, inplace=True)
    test_data.drop('Survived', axis=1, inplace=True)

    return train_data, test_data


#调用辅助函数
if __name__ == '__main__':
    train,test = get_titanic_dataset(bins=True, scaled=True, binary=True)
    initial_drops = ['PassengerId']
    train.drop(initial_drops, axis=1, inplace=True)
    test.drop(initial_drops, axis=1, inplace=True)

    train, test = reduce(train, test)

    print(train.columns.values)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a

Starting with 105 manually constructing features based on the interaction between them...
 ['Survived' 'PassengerId' 'Pclass' 'Age' 'SibSp' 'Parch' 'Fare' 'Embarked' 'CabinLetter' 'CabinLetter_0' 'CabinLetter_1' 'CabinLetter_2' 'CabinLetter_3'
 'CabinLetter_4' 'CabinLetter_5' 'CabinLetter_6' 'CabinLetter_7' 'CabinLetter_8' 'CabinNumber' 'CabinNumber_scaled' 'TicketPrefixId' 'TicketPrefix_A'
 'TicketPrefix_AQ' 'TicketPrefix_AS' 'TicketPrefix_C' 'TicketPrefix_CA' 'TicketPrefix_CASOTON' 'TicketPrefix_FA' 'TicketPrefix_FC' 'TicketPrefix_FCC'
 'TicketPrefix_LINE' 'TicketPrefix_LP' 'TicketPrefix_PC' 'TicketPrefix_PP' 'TicketPrefix_PPP' 'TicketPrefix_SC' 'TicketPrefix_SCA' 'TicketPrefix_SCAH'
 'TicketPrefix_SCOW' 'TicketPrefix_SCPARIS' 'TicketPrefix_SOC' 'TicketPrefix_SOP' 'TicketPrefix_SOPP' 'TicketPrefix_SOTONO' 'TicketPrefix_SOTONOQ'
 'TicketPrefix_SP' 'TicketPrefix_SWPP' 'TicketPrefix_U' 'TicketPrefix_WC' 'TicketPrefix_WEP' 'TicketNumberDigits' 'TicketNumberStart' 'TicketNumber_scaled'
 '

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


   Survived  Pclass   Age  SibSp  Parch     Fare  Embarked  CabinLetter  CabinLetter_1  CabinLetter_2  CabinLetter_3  CabinLetter_4  CabinLetter_5  CabinLetter_6  CabinLetter_7  CabinLetter_8  CabinNumber  TicketPrefixId  TicketPrefix_A  TicketPrefix_AQ  TicketPrefix_AS  TicketPrefix_C  TicketPrefix_CA  TicketPrefix_CASOTON  TicketPrefix_FA  TicketPrefix_FC  TicketPrefix_FCC  TicketPrefix_LINE  TicketPrefix_LP  TicketPrefix_PC  TicketPrefix_PP  TicketPrefix_PPP  TicketPrefix_SC  TicketPrefix_SCA  TicketPrefix_SCAH  TicketPrefix_SCOW  TicketPrefix_SCPARIS  TicketPrefix_SOC  TicketPrefix_SOP  TicketPrefix_SOPP  TicketPrefix_SOTONO  TicketPrefix_SOTONOQ  TicketPrefix_SP  TicketPrefix_SWPP  TicketPrefix_U  TicketPrefix_WC  TicketPrefix_WEP  TicketNumberDigits  TicketNumberStart  TicketNumber_scaled  Names  Title_Dr  Title_Lady  Title_Master  Title_Miss  Title_Mr  Title_Mrs  Title_Rev  Title_Sir  Title_id  Fare_(0.316, 7.896]  Fare_(7.896, 14.454]  Fare_(14.454, 31.275]  Fare_(31.275, 512.3

31  reduced components which describe  .99 % of the variance
['Survived' 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
