## 1.Loading data

In [1]:
import pandas as pd 
import numpy as np
import warnings
warnings.filterwarnings('ignore')
train_data = pd.read_csv('./train.csv')
test_data = pd.read_csv('./test.csv')
train_y = train_data["Survived"]
train_x = train_data
# Combining the training set and test
all_data = pd.concat((train_x, test_data)).reset_index(drop=True)

## 2.Missing value fill

In [2]:
# According to the calculation of the median sale price of different positions in different ports, the port closest to the fare is filled with Embarked
P1 = all_data[all_data['Pclass']==1][['Fare','Embarked']]
P1.groupby('Embarked')['Fare'].median()
 
all_data.loc[829,'Embarked'] = 'C'
all_data.loc[61,'Embarked'] = 'C'


P3 = all_data[all_data['Pclass']==3][['Fare','Embarked']]
P3.groupby('Embarked')['Fare'].median()
all_data.loc[1043,'Fare'] = 8.05


all_data_Name = all_data['Name']
all_data_Family = all_data_Name.str.split(',',expand=True)[0]
all_data['Family'] = all_data_Family 
only_have_Sibsp = all_data[all_data['Age'].isnull()==True][(all_data['SibSp'] > 0) & (all_data['Parch']==0)]
Sibsp_ticket = only_have_Sibsp['Ticket'].unique()
 
only_have_Sibsp = all_data[all_data['Age'].isnull()==True][(all_data['SibSp'] > 0) & (all_data['Parch']==0)]
Sibsp_ticket = only_have_Sibsp['Ticket'].unique()
 
for f in Sibsp_ticket:
    tmp = all_data[all_data['Ticket'] == f][['Ticket','Family','Age']]
    for i in tmp['Age'][all_data['Age'].isnull() == True].index :
        index_notnull = tmp['Age'][all_data['Age'].isnull() == False].index
        for j in index_notnull:
            if (all_data.loc[i,'Family'] == all_data.loc[j,'Family'] and all_data.loc[i,'SibSp'] == all_data.loc[j,'SibSp']):
                all_data.loc[i,'Age'] =   all_data.loc[j,'Age'] 
            if (all_data.loc[i,'Parch'] == 0 and all_data.loc[j,'Parch']== 0):
                all_data.loc[i,'Age'] = all_data.loc[j,'Age']
# Other age data is populated as median
all_data['Age'] = all_data['Age'].fillna(all_data['Age'].median())

## 3.Adding features

In [3]:
# 1. Family Survival Rate
# 2. Age ranking by class
# 3. Ranking of fares by class
# 4. Number of relatives (simple addition)
# 5. Are there siblings
# 6. Are there children or parents
# 7. Are you alone
 
DEFAULT_SURVIVAL_VALUE = 0.5
all_data['Family_Survival'] = DEFAULT_SURVIVAL_VALUE
 
for grp, grp_df in all_data[['Survived','Name', 'Family', 'Fare', 'Ticket', 'PassengerId',
                           'SibSp', 'Parch', 'Age', 'Cabin']].groupby(['Family', 'Fare']):
    
    if (len(grp_df) != 1):
        for ind, row in grp_df.iterrows():
            smax = grp_df.drop(ind)['Survived'].max()
            smin = grp_df.drop(ind)['Survived'].min()
            passID = row['PassengerId']
            if (smax == 1.0):
                all_data.loc[all_data['PassengerId'] == passID, 'Family_Survival'] = 1
            elif (smin==0.0):
                all_data.loc[all_data['PassengerId'] == passID, 'Family_Survival'] = 0
                
print("Number of passengers with family survival information:", 
      all_data.loc[all_data['Family_Survival']!=0.5].shape[0])

for _, grp_df in all_data.groupby('Ticket'):
    if (len(grp_df) != 1):
        for ind, row in grp_df.iterrows():
            if (row['Family_Survival'] == 0) | (row['Family_Survival']== 0.5):
                smax = grp_df.drop(ind)['Survived'].max()
                smin = grp_df.drop(ind)['Survived'].min()
                passID = row['PassengerId']
                if (smax == 1.0):
                    all_data.loc[all_data['PassengerId'] == passID, 'Family_Survival'] = 1
                elif (smin==0.0):
                    all_data.loc[all_data['PassengerId'] == passID, 'Family_Survival'] = 0
                        
print("Number of passenger with family/group survival information: " 
      +str(all_data[all_data['Family_Survival']!=0.5].shape[0]))

for _, grp_df in all_data.groupby('Ticket'):
    if (len(grp_df) != 1):
        for ind, row in grp_df.iterrows():
            if (row['Family_Survival'] == 0) | (row['Family_Survival']== 0.5):
                smax = grp_df.drop(ind)['Survived'].max()
                smin = grp_df.drop(ind)['Survived'].min()
                passID = row['PassengerId']
                if (smax == 1.0):
                    all_data.loc[all_data['PassengerId'] == passID, 'Family_Survival'] = 1
                elif (smin==0.0):
                    all_data.loc[all_data['PassengerId'] == passID, 'Family_Survival'] = 0
                        
print("Number of passenger with family/group survival information: " 
      +str(all_data[all_data['Family_Survival']!=0.5].shape[0]))
#2
tmp  = all_data[all_data['Pclass'] == 1]
r = 0
pre = 0 
for i in tmp.sort_values('Age').index:
    if pre < all_data.loc[i,'Age']: r = r + 1
    all_data.loc[i,'Pclass1_Age_Rank'] = r
    pre = all_data.loc[i,'Age']

tmp  = all_data[all_data['Pclass'] == 2]
r = 0
pre = 0
for i in tmp.sort_values('Age').index:
    if pre < all_data.loc[i,'Age']: r = r + 1
    all_data.loc[i,'Pclass2_Age_Rank'] = r

tmp  = all_data[all_data['Pclass'] == 3]
r = 0
pre = 0
for i in tmp.sort_values('Age').index:
    if pre < all_data.loc[i,'Age']:  r = r + 1
    all_data.loc[i,'Pclass3_Age_Rank'] = r

all_data['Pclass1_Age_Rank'] = all_data['Pclass1_Age_Rank'].fillna(0)
all_data['Pclass2_Age_Rank'] = all_data['Pclass2_Age_Rank'].fillna(0)
all_data['Pclass3_Age_Rank'] = all_data['Pclass3_Age_Rank'].fillna(0)
tmp = all_data[(all_data['Pclass'] == 1)]
r = 0
pre = 0 
for i in tmp.sort_values('Fare').index:
    if pre < all_data.loc[i,'Fare'] : r = r + 1
    all_data.loc[i,'Pclass1_Fare_Rank'] = r

tmp = all_data[(all_data['Pclass'] == 2)]
r = 0
pre = 0
for i in tmp.sort_values('Fare').index:
    if pre < all_data.loc[i,'Fare'] : r = r + 1
    all_data.loc[i,'Pclass2_Fare_Rank'] = r

tmp = all_data[(all_data['Pclass'] == 3)]
r = 0
pre = 0
for i in tmp.sort_values('Fare').index:
    if pre < all_data.loc[i,'Fare'] : r = r + 1
    all_data.loc[i,'Pclass3_Fare_Rank'] = r
    
all_data['Pclass1_Fare_Rank'] = all_data['Pclass1_Fare_Rank'].fillna(0)
all_data['Pclass2_Fare_Rank'] = all_data['Pclass2_Fare_Rank'].fillna(0)
all_data['Pclass3_Fare_Rank'] = all_data['Pclass3_Fare_Rank'].fillna(0)
all_data['Family_Size'] = all_data['Parch'] + all_data['SibSp'] + 1
all_data['Have_SibSp'] = all_data['SibSp'].apply(lambda x:1 if x>0 else 0)
all_data['Have_Parch'] = all_data['Parch'].apply(lambda x:1 if x>0 else 0)
all_data['Is_Alone'] = 1
all_data['Is_Alone'].loc[all_data['Family_Size'] > 1] = 0

Number of passengers with family survival information: 420
Number of passenger with family/group survival information: 546
Number of passenger with family/group survival information: 546


## 4.Remove features that do not participate in model training

In [4]:
all_data.drop("PassengerId",axis = 1, inplace = True)
all_data.drop("Cabin",axis = 1, inplace = True)
all_data.drop("Ticket",axis = 1,inplace = True)
all_data.drop("Name",axis = 1,inplace = True)
all_data.drop("Family",axis = 1, inplace = True)
all_data.drop("Survived",axis = 1, inplace = True)

## 5. Coding categorical variables

In [5]:
def encoder(x):
    if x == 'C':
        return 1
    elif x=='Q':
        return 2
    else:
        return 3
# Encode feature Embarked
all_data['Embarked'] = all_data['Embarked'].apply(lambda x: encoder(x))

all_data['Sex'] = all_data['Sex'].astype(str)
# One-hot encoding of category data
all_data = pd.get_dummies(all_data)

## 6.Divide the data into training and test sets

In [6]:
n_train = len(train_x)
train_x = all_data[:n_train] 
test_x  = all_data[n_train:]

## 7.Model training

In [7]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn import svm,feature_selection
from sklearn.model_selection import StratifiedKFold, cross_val_score
import warnings
warnings.filterwarnings('ignore')
def acc_cv(model):
    n_folds = 5
    skf = StratifiedKFold(n_folds, shuffle=True, random_state= 8)#for shuffle data
    acc = cross_val_score(model, train_x.values,train_y, scoring="accuracy", cv = skf)
    return(acc.mean(),acc.std())
rbf_svc = make_pipeline(StandardScaler(), svm.SVC(C=1.0,random_state=2))
acc_cv(rbf_svc)

rbf_svc.fit(train_x.values,train_y.values)
test_y = rbf_svc.predict(test_x.values)

sub = pd.DataFrame()
sub['PassengerId'] = test_data['PassengerId']
sub['Survived'] = test_y
sub.to_csv('yongli_submission_0302.csv',index=False)

## 8.Report
### (1)Title link
[Titanic: Machine Learning from Disaster](https://www.kaggle.com/c/titanic)

### (2)References
[Titanic - Advanced Feature Engineering Tutorial](https://www.kaggle.com/gunesevitan/titanic-advanced-feature-engineering-tutorial/)

[Titanic Tutorial](https://www.kaggle.com/alexisbcook/titanic-tutorial)

[Titanic [0.82] - [0.83]](https://www.kaggle.com/alexisbcook/titanic-tutorial)

### (3)Grades & Ranking
Yong Li

Ranking:482

Grades:0.82296

