In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from mylib import dispgraph
from mylib import prepare
from mylib import features

In [3]:
#特徴量作成
def make_feature(data):
    data = pd.get_dummies(data, columns=["Sex"])
    
    data["Title"] = data["Name"].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]
    data["Title"] = data["Title"].replace({"Mlle": "Miss", "Ms": "Miss", "Mme":"Mrs"})
    fixed_title = ["Mr", "Miss", "Mrs", "Master"]
    data.loc[~(data["Title"].isin(fixed_title)), "Title"] = "Rare"
    
    data["Age"].fillna(data["Age"].mean(), inplace=True)

    data = pd.get_dummies(data, columns=["Title"])
    data["LastName"] = data["Name"].map(lambda x: x.split(",")[0])
        
    data["FamilySize"] = data["SibSp"] + data["Parch"] + 1
    data["IsAlone"] = 0
    data["IsAlone"].loc[data["FamilySize"] == 1] = 1

    data["FamilySize_small"] = 0
    data["FamilySize_small"].loc[data["FamilySize"] < 5] = 1
    data["FamilySize_large"] = 0
    data["FamilySize_large"].loc[data["FamilySize"] >= 5] = 1

    data["Embarked"].fillna("S", inplace=True) #by most value
    data = pd.get_dummies(data, columns=["Embarked"])

    data["Fare"].fillna(data["Fare"].median(), inplace=True)

    data["CabinIsNull"] = 0
    data.loc[data["Cabin"].isnull(), "CabinIsNull"] = 1
    
#    data["Ticket"] = data["Ticket"].map(lambda x: x.split(" ")[-1])

    #不要な特徴量を削除
    data.drop(['Name', 'Cabin', 'Ticket'], axis=1, inplace=True)
    
    return data

In [4]:
train_data = pd.read_csv("data/train.csv")
train_data = make_feature(train_data)
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 23 columns):
PassengerId         891 non-null int64
Survived            891 non-null int64
Pclass              891 non-null int64
Age                 891 non-null float64
SibSp               891 non-null int64
Parch               891 non-null int64
Fare                891 non-null float64
Sex_female          891 non-null uint8
Sex_male            891 non-null uint8
Title_Master        891 non-null uint8
Title_Miss          891 non-null uint8
Title_Mr            891 non-null uint8
Title_Mrs           891 non-null uint8
Title_Rare          891 non-null uint8
LastName            891 non-null object
FamilySize          891 non-null int64
IsAlone             891 non-null int64
FamilySize_small    891 non-null int64
FamilySize_large    891 non-null int64
Embarked_C          891 non-null uint8
Embarked_Q          891 non-null uint8
Embarked_S          891 non-null uint8
CabinIsNull         891 non-null

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [5]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Title_Master,...,Title_Rare,LastName,FamilySize,IsAlone,FamilySize_small,FamilySize_large,Embarked_C,Embarked_Q,Embarked_S,CabinIsNull
0,1,0,3,22.0,1,0,7.25,0,1,0,...,0,Braund,2,0,1,0,0,0,1,1
1,2,1,1,38.0,1,0,71.2833,1,0,0,...,0,Cumings,2,0,1,0,1,0,0,0
2,3,1,3,26.0,0,0,7.925,1,0,0,...,0,Heikkinen,1,1,1,0,0,0,1,1
3,4,1,1,35.0,1,0,53.1,1,0,0,...,0,Futrelle,2,0,1,0,0,0,1,0
4,5,0,3,35.0,0,0,8.05,0,1,0,...,0,Allen,1,1,1,0,0,0,1,1


In [6]:
test_data = pd.read_csv("data/test.csv")
test_data = make_feature(test_data)
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 22 columns):
PassengerId         418 non-null int64
Pclass              418 non-null int64
Age                 418 non-null float64
SibSp               418 non-null int64
Parch               418 non-null int64
Fare                418 non-null float64
Sex_female          418 non-null uint8
Sex_male            418 non-null uint8
Title_Master        418 non-null uint8
Title_Miss          418 non-null uint8
Title_Mr            418 non-null uint8
Title_Mrs           418 non-null uint8
Title_Rare          418 non-null uint8
LastName            418 non-null object
FamilySize          418 non-null int64
IsAlone             418 non-null int64
FamilySize_small    418 non-null int64
FamilySize_large    418 non-null int64
Embarked_C          418 non-null uint8
Embarked_Q          418 non-null uint8
Embarked_S          418 non-null uint8
CabinIsNull         418 non-null int64
dtypes: float64(2), int64(9), ob

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [7]:
"""
familySurvivedList = train_data[train_data["FamilySize"] > 1].groupby(["LastName", "FamilySize"])["Survived"].mean()
familySurvivedList = familySurvivedList.reset_index()

train_data["FamilySurvivedRate"] = 0.5
test_data["FamilySurvivedRate"] = 0.5
for i in range(len(familySurvivedList)):
    family = familySurvivedList.loc[i, :]
    train_data.loc[(train_data["LastName"] == family["LastName"]) & (train_data["FamilySize"] == family["FamilySize"]),
               "FamilySurvivedRate"] = family["Survived"]
    test_data.loc[(test_data["LastName"] == family["LastName"]) & (test_data["FamilySize"] == family["FamilySize"]),
               "FamilySurvivedRate"] = family["Survived"]
"""

'\nfamilySurvivedList = train_data[train_data["FamilySize"] > 1].groupby(["LastName", "FamilySize"])["Survived"].mean()\nfamilySurvivedList = familySurvivedList.reset_index()\n\ntrain_data["FamilySurvivedRate"] = 0.5\ntest_data["FamilySurvivedRate"] = 0.5\nfor i in range(len(familySurvivedList)):\n    family = familySurvivedList.loc[i, :]\n    train_data.loc[(train_data["LastName"] == family["LastName"]) & (train_data["FamilySize"] == family["FamilySize"]),\n               "FamilySurvivedRate"] = family["Survived"]\n    test_data.loc[(test_data["LastName"] == family["LastName"]) & (test_data["FamilySize"] == family["FamilySize"]),\n               "FamilySurvivedRate"] = family["Survived"]\n'

In [8]:
train_data.to_pickle("work/converted_train.pickle")
test_data.to_pickle("work/converted_test.pickle")