In [1]:
import pandas as pd

In [2]:
# Load the training data; the path is relative to the notebook's working directory.
df = pd.read_csv("./dataset/titanic/train.csv")

In [3]:
# Preview the first five rows to check the columns and their raw values.
print(df.head())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  


In [4]:
# Summary statistics of the numeric columns; a count below the row total
# indicates missing values in that column.
print(df.describe())

       PassengerId    Survived      Pclass         Age       SibSp  \
count   891.000000  891.000000  891.000000  714.000000  891.000000   
mean    446.000000    0.383838    2.308642   29.699118    0.523008   
std     257.353842    0.486592    0.836071   14.526497    1.102743   
min       1.000000    0.000000    1.000000    0.420000    0.000000   
25%     223.500000    0.000000    2.000000   20.125000    0.000000   
50%     446.000000    0.000000    3.000000   28.000000    0.000000   
75%     668.500000    1.000000    3.000000   38.000000    1.000000   
max     891.000000    1.000000    3.000000   80.000000    8.000000   

            Parch        Fare  
count  891.000000  891.000000  
mean     0.381594   32.204208  
std      0.806057   49.693429  
min      0.000000    0.000000  
25%      0.000000    7.910400  
50%      0.000000   14.454200  
75%      0.000000   31.000000  
max      6.000000  512.329200  


Age has missing values (only 714 of the 891 rows have an age), so fill them with the median age.

In [5]:
# Impute missing ages with the median of the observed ages.
df["Age"] = df["Age"].fillna(
    df["Age"].median()
)

In [6]:
# Re-check the summary: the Age count should now match the other columns.
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.361582,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,13.019697,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


The Ticket, Cabin and Name columns are not used as features; the remaining non-numeric columns are converted to numeric codes.

Encode Sex: male = 0, female = 1.

In [7]:
# Encode Sex as an integer code: male -> 0, female -> 1.
# Both masks are taken from the original string labels before any write.
is_male = df["Sex"] == "male"
is_female = df["Sex"] == "female"
df.loc[is_male, "Sex"] = 0
df.loc[is_female, "Sex"] = 1

In [8]:
# List the distinct Embarked values (including NaN) before encoding them.
print(df["Embarked"].unique())

['S' 'C' 'Q' nan]


Encode Embarked: S = 0, C = 1, Q = 2. Missing values are filled with 0, i.e. treated as S.

In [9]:
# Encode the port of embarkation as an integer code: S -> 0, C -> 1, Q -> 2.
df.loc[df.Embarked == 'S', 'Embarked'] = 0
df.loc[df.Embarked == 'C', 'Embarked'] = 1
df.loc[df.Embarked == 'Q', 'Embarked'] = 2
# Rows with no recorded port get code 0 (the code used for 'S').
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the attribute accessor: that form is chained
# assignment, which warns on pandas 2.x and leaves `df` unchanged when
# Copy-on-Write is enabled.
df["Embarked"] = df["Embarked"].fillna(0)

In [10]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

In [11]:
# Columns fed to every model in this notebook, in a fixed order.
features = [
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Embarked",
]

In [12]:
# Linear regression used as a classifier: its continuous predictions are
# thresholded at 0.5 in a later cell to obtain 0/1 labels.
clf = LinearRegression()

In [13]:
# Three contiguous, unshuffled folds. The fold predictions are later
# concatenated in fold order and compared with df.Survived row by row, so the
# original row order must be kept.
# random_state is intentionally not passed: it has no effect without
# shuffle=True, and newer scikit-learn releases raise ValueError for that
# combination.
cv = KFold(n_splits=3)

In [14]:
# Collect out-of-fold predictions: fit on the training rows of each split and
# predict the held-out rows. One array per fold is appended, in fold order.
predictions = []
feature_frame = df[features]
target = df["Survived"]
for train_idx, test_idx in cv.split(df):
    clf.fit(feature_frame.iloc[train_idx, :], target.iloc[train_idx])
    fold_pred = clf.predict(feature_frame.iloc[test_idx, :])
    predictions.append(fold_pred)

In [15]:
import numpy as np

In [16]:
# Join the per-fold arrays end to end into a single 1-D array
# (concatenation along the first axis, which is numpy's default).
predictions = np.concatenate(predictions)

In [17]:
# Turn the continuous regression outputs into class labels in place:
# strictly above 0.5 -> 1, otherwise -> 0.
above_half = predictions > .5
at_or_below_half = predictions <= .5
predictions[above_half] = 1
predictions[at_or_below_half] = 0

In [18]:
# Accuracy = share of rows whose thresholded prediction equals the true label.
is_correct = predictions == df.Survived
accuracy = len(predictions[is_correct]) / len(predictions)

In [19]:
# Report accuracy as a whole-number percentage. Scale to percent first and
# round afterwards: multiplying an already-rounded fraction by 100 can expose
# binary floating-point artifacts (e.g. 0.57 * 100 -> 56.99999999999999).
print("Accuracy =", round(accuracy * 100, 0), "%")

Accuracy = 78.0 %


Logistic Regression

In [20]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

In [21]:
# Logistic regression with a fixed seed. The solver is pinned to liblinear,
# the solver shown in the recorded estimator repr further down in this
# notebook, so results do not shift on scikit-learn releases whose default
# solver differs.
log_clf = LogisticRegression(random_state=1, solver="liblinear")

In [22]:
# 3-fold cross-validated scores for the logistic regression model.
scores = cross_val_score(
    log_clf,
    df[features],
    df["Survived"],
    cv=3,
)

In [23]:
# Report the mean fold score as a whole-number percentage. Scale to percent
# first and round afterwards: multiplying an already-rounded fraction by 100
# can expose binary floating-point artifacts (e.g. 0.57 * 100 ->
# 56.99999999999999).
print("Accuracy =", round(scores.mean() * 100, 0), "%")

Accuracy = 79.0 %


Test set

In [24]:
# Load the test data; the path is relative to the notebook's working directory.
df_test = pd.read_csv("./dataset/titanic/test.csv")

In [25]:
# Apply the same preprocessing to the test frame as was applied to `df`.

# Missing ages are filled with the median age of the training frame.
df_test["Age"] = df_test["Age"].fillna(df["Age"].median())

# Sex: male -> 0, female -> 1.
df_test.loc[df_test.Sex == 'male', 'Sex'] = 0
df_test.loc[df_test.Sex == 'female', 'Sex'] = 1

# Embarked: missing -> 'S', then S -> 0, C -> 1, Q -> 2.
# The filled column is assigned back rather than using fillna(inplace=True)
# on the attribute accessor: that form is chained assignment, which warns on
# pandas 2.x and leaves `df_test` unchanged when Copy-on-Write is enabled.
df_test["Embarked"] = df_test["Embarked"].fillna('S')
df_test.loc[df_test.Embarked == 'S', 'Embarked'] = 0
df_test.loc[df_test.Embarked == 'C', 'Embarked'] = 1
df_test.loc[df_test.Embarked == 'Q', 'Embarked'] = 2

# Missing fares are filled with the median fare of the test frame itself
# (unlike Age, which uses the training frame).
df_test["Fare"] = df_test["Fare"].fillna(df_test["Fare"].median())

In [26]:
# Final model: logistic regression fitted on the full training frame.
# The solver is pinned to liblinear, the solver shown in the recorded
# estimator repr below this cell, so results do not shift on scikit-learn
# releases whose default solver differs.
clf_test = LogisticRegression(random_state=1, solver="liblinear")
# fit() is the last expression, so the fitted estimator is echoed as output.
clf_test.fit(df[features], df.Survived)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=1, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [27]:
# Predict labels for the test rows. This rebinds `predictions`, which earlier
# held the thresholded linear-regression outputs.
predictions = clf_test.predict(df_test[features])

In [28]:
# Two-column submission table: passenger id and its predicted label.
submission_columns = {
    "PassengerId": df_test["PassengerId"],
    "Survived": predictions,
}
submission = pd.DataFrame(submission_columns)

In [30]:
# Write the submission as CSV without the index column.
# NOTE(review): an existing file at this path is overwritten - confirm the
# target name is intended.
submission.to_csv("./dataset/titanic/gender_submission.csv", index=False)

Three further techniques to improve the model:
- better machine learning algos
- create better features
- combination of algos

Age and Survived are not linearly related, and decision trees are better at picking up such relationships. However, a single tree tends to overfit the training data. Hence, we will use a random forest, which reduces overfitting through **randomization**.

In [31]:
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier



In [32]:
# Random forest with a fixed seed: 10 trees, splitting allowed down to
# 2 samples per node and 1 sample per leaf.
rfc = RandomForestClassifier(
    n_estimators=10,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=1,
)

In [33]:
# Three contiguous, unshuffled folds.
# random_state is intentionally not passed: it has no effect without
# shuffle=True, and newer scikit-learn releases raise ValueError for that
# combination.
# NOTE(review): the cross_val_score call for the random forest passes cv=3
# rather than this object - confirm which splitter is intended.
cv = KFold(n_splits=3)

In [35]:
# 3-fold cross-validated scores for the random forest.
scores = cross_val_score(
    rfc,
    df[features],
    df["Survived"],
    cv=3,
)

In [36]:
# Mean score across the three folds, printed as a raw fraction.
print(scores.mean())

0.801346801347
