In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

import seaborn as sns

In [2]:
# Read the training sheet and keep four predictor columns followed by the
# Survived column, in exactly this column order.
df = pd.read_excel('train.xlsx')[
    ['Age', 'Pclass', 'SibSp', 'Parch', 'Survived']
]


In [3]:
# Preview the first rows to confirm the selected columns were loaded.
df.head()


Unnamed: 0,Age,Pclass,SibSp,Parch,Survived
0,22.0,3,1,0,0
1,38.0,1,1,0,1
2,26.0,3,0,0,1
3,35.0,1,1,0,1
4,35.0,3,0,0,0


In [4]:
# Remove every row that has a missing value in any of the selected columns.
# Rebinding the result (instead of inplace=True) keeps the step explicit and
# avoids mutating the frame object in place.
df = df.dropna()


In [5]:
# Preview the frame again after rows with missing values were removed.
df.head()


Unnamed: 0,Age,Pclass,SibSp,Parch,Survived
0,22.0,3,1,0,0
1,38.0,1,1,0,1
2,26.0,3,0,0,1
3,35.0,1,1,0,1
4,35.0,3,0,0,0


In [6]:
# Feature matrix: the first four columns (Age, Pclass, SibSp, Parch).
# .copy() gives X its own data, so adding or dropping columns on X later
# does not raise SettingWithCopyWarning and cannot affect df.
X = df.iloc[:, 0:4].copy()
# Target vector: the last column (Survived).
y = df.iloc[:, -1]

In [7]:
# Preview the feature matrix (the target column is no longer present).
X.head()


Unnamed: 0,Age,Pclass,SibSp,Parch
0,22.0,3,1,0
1,38.0,1,1,0
2,26.0,3,0,0
3,35.0,1,1,0
4,35.0,3,0,0


In [8]:
# Baseline: mean accuracy of a default LogisticRegression across 20 CV folds
# on the current feature columns.
cross_val_score(
    LogisticRegression(),
    X,
    y,
    scoring='accuracy',
    cv=20,
).mean()




0.6970559845559845

In [9]:
# Applying Feature Construction


In [10]:
# Family_size is SibSp + Parch plus 1, so a value of 1 means both counts are 0.
X['Family_size'] = X['SibSp'].add(X['Parch']).add(1)


In [11]:
# Preview X with the new Family_size column appended.
X.head()


Unnamed: 0,Age,Pclass,SibSp,Parch,Family_size
0,22.0,3,1,0,2
1,38.0,1,1,0,2
2,26.0,3,0,0,1
3,35.0,1,1,0,2
4,35.0,3,0,0,1


In [12]:
def myfunc(num):
    """Map a family size to a coarse family-type code.

    Returns:
        0 when ``num`` equals 1 (alone),
        1 when ``num`` is greater than 1 and at most 4 (small family),
        2 for every other value (large family; this is also the fallback
        for anything outside the two ranges above).
    """
    if num == 1:
        return 0  # alone
    if 1 < num <= 4:
        return 1  # small family
    return 2  # large family

In [13]:
# Spot-check the upper boundary of the small-family range: 4 should map to 1.
myfunc(4)


1

In [14]:
# Encode each Family_size value as its family-type code (0, 1 or 2) via myfunc.
X['Family_type'] = X['Family_size'].map(myfunc)


In [15]:
# Preview X with both Family_size and the derived Family_type column.
X.head()


Unnamed: 0,Age,Pclass,SibSp,Parch,Family_size,Family_type
0,22.0,3,1,0,2,1
1,38.0,1,1,0,2,1
2,26.0,3,0,0,1,0
3,35.0,1,1,0,2,1
4,35.0,3,0,0,1,0


In [16]:
# Keep only Age, Pclass and Family_type: the columns Family_type was built
# from are removed. Rebinding the result (instead of inplace=True) avoids
# mutating the existing frame object.
X = X.drop(columns=['SibSp', 'Parch', 'Family_size'])


In [17]:
# Preview the reduced feature matrix used for the second model evaluation.
X.head()


Unnamed: 0,Age,Pclass,Family_type
0,22.0,3,1
1,38.0,1,1
2,26.0,3,0
3,35.0,1,1
4,35.0,3,0


In [18]:
# Same evaluation as the baseline (default LogisticRegression, 20 CV folds,
# mean accuracy), now run on whatever columns X currently holds.
cross_val_score(
    LogisticRegression(),
    X,
    y,
    scoring='accuracy',
    cv=20,
).mean()




0.7139628914628915

In [19]:
# Feature Splitting

In [21]:
# Re-read the training sheet with all columns. This rebinds `df`, replacing
# the column subset loaded at the top of the notebook.
df = pd.read_excel('train.xlsx')


In [22]:
# Preview the full frame, including the text columns such as Name.
df.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [23]:
# Inspect the raw Name strings; the displayed values follow a
# "Surname, Title. Given names" pattern.
df['Name']


0                                Braund, Mr. Owen Harris
1      Cumings, Mrs. John Bradley (Florence Briggs Th...
2                                 Heikkinen, Miss. Laina
3           Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                               Allen, Mr. William Henry
5                                       Moran, Mr. James
6                                McCarthy, Mr. Timothy J
7                         Palsson, Master. Gosta Leonard
8      Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)
9                    Nasser, Mrs. Nicholas (Adele Achem)
10                       Sandstrom, Miss. Marguerite Rut
11                              Bonnell, Miss. Elizabeth
12                        Saundercock, Mr. William Henry
13                           Andersson, Mr. Anders Johan
14                  Vestrom, Miss. Hulda Amanda Adolfina
15                       Hewlett, Mrs. (Mary D Kingcome)
16                                  Rice, Master. Eugene
17                          Wil

In [24]:
# Extract the title from each name in two splits.
df['Title'] = (
    df['Name']
    .str.split(', ', expand=True)[1]   # text after the first ", "
    .str.split('.', expand=True)[0]    # text before the first "."
)


In [25]:
# Re-evaluate the title-extraction expression on its own to display its result.
(
    df['Name']
    .str.split(', ', expand=True)[1]
    .str.split('.', expand=True)[0]
)


0          Mr
1         Mrs
2        Miss
3         Mrs
4          Mr
5          Mr
6          Mr
7      Master
8         Mrs
9         Mrs
10       Miss
11       Miss
12         Mr
13         Mr
14       Miss
15        Mrs
16     Master
17         Mr
18        Mrs
19        Mrs
20         Mr
21         Mr
22       Miss
23         Mr
24       Miss
25        Mrs
26         Mr
27         Mr
28       Miss
29         Mr
        ...  
861        Mr
862       Mrs
863      Miss
864        Mr
865       Mrs
866      Miss
867        Mr
868        Mr
869    Master
870        Mr
871       Mrs
872        Mr
873        Mr
874       Mrs
875      Miss
876        Mr
877        Mr
878        Mr
879       Mrs
880       Mrs
881        Mr
882      Miss
883        Mr
884        Mr
885       Mrs
886       Rev
887      Miss
888      Miss
889        Mr
890        Mr
Name: 0, Length: 891, dtype: object

In [26]:
# Show the extracted Title next to the original Name for a visual check.
df[['Title','Name']]


Unnamed: 0,Title,Name
0,Mr,"Braund, Mr. Owen Harris"
1,Mrs,"Cumings, Mrs. John Bradley (Florence Briggs Th..."
2,Miss,"Heikkinen, Miss. Laina"
3,Mrs,"Futrelle, Mrs. Jacques Heath (Lily May Peel)"
4,Mr,"Allen, Mr. William Henry"
5,Mr,"Moran, Mr. James"
6,Mr,"McCarthy, Mr. Timothy J"
7,Master,"Palsson, Master. Gosta Leonard"
8,Mrs,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)"
9,Mrs,"Nasser, Mrs. Nicholas (Adele Achem)"


In [27]:
# Mean of Survived per title, highest first.
# Survived is selected BEFORE aggregating: calling .mean() on the whole
# grouped frame would also try to average the text columns (Name, Sex,
# Ticket, ...), which raises TypeError in pandas >= 2.0 and is wasted work
# in older versions.
df.groupby('Title')['Survived'].mean().sort_values(ascending=False)


Title
the Countess    1.000000
Mlle            1.000000
Lady            1.000000
Ms              1.000000
Sir             1.000000
Mme             1.000000
Mrs             0.792000
Miss            0.697802
Master          0.575000
Major           0.500000
Col             0.500000
Dr              0.428571
Mr              0.156673
Rev             0.000000
Jonkheer        0.000000
Don             0.000000
Capt            0.000000
Name: Survived, dtype: float64

In [28]:
# Is_Married is 1 where the extracted title is exactly 'Mrs', and 0 otherwise.
# One vectorized assignment replaces the chained `df[col].loc[mask] = 1`,
# which writes through an intermediate Series: that form emits
# SettingWithCopyWarning and leaves df unchanged under pandas Copy-on-Write.
df['Is_Married'] = (df['Title'] == 'Mrs').astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [29]:
# Display the new flag column to confirm that rows titled 'Mrs' were set to 1.
df['Is_Married']

0      0
1      1
2      0
3      1
4      0
5      0
6      0
7      0
8      1
9      1
10     0
11     0
12     0
13     0
14     0
15     1
16     0
17     0
18     1
19     1
20     0
21     0
22     0
23     0
24     0
25     1
26     0
27     0
28     0
29     0
      ..
861    0
862    1
863    0
864    0
865    1
866    0
867    0
868    0
869    0
870    0
871    1
872    0
873    0
874    1
875    0
876    0
877    0
878    0
879    1
880    1
881    0
882    0
883    0
884    0
885    1
886    0
887    0
888    0
889    0
890    0
Name: Is_Married, Length: 891, dtype: int64