In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing

In [2]:
df = pd.read_csv("data/titanic-train.csv")
df["Age"] = preprocessing.Imputer(strategy="median").fit_transform(df[["Age"]])

In [5]:
label_encoder = preprocessing.LabelEncoder()
df["Sex"] = label_encoder.fit_transform(df["Sex"])
df["Embarked"] = label_encoder.fit_transform(df["Embarked"])

  flag = np.concatenate(([True], aux[1:] != aux[:-1]))


In [7]:
features = ["Sex", "Pclass", "SibSp", "Embarked", "Age", "Fare"]

In [8]:
X = df[features]
y = df["Survived"]

In [10]:
np.random.seed(12)
rf = RandomForestClassifier(
    n_estimators=1000,
    max_features=2,
    oob_score=True
)
rf.fit(X=X, y=y)
rf.oob_score_

0.82154882154882158

In [19]:
pd.DataFrame(
    zip(features, rf.feature_importances_), 
    columns=["Feature", "Importance"]
).sort_values("Importance", ascending=False)

Unnamed: 0,Feature,Importance
5,Fare,0.287708
4,Age,0.272643
0,Sex,0.26822
1,Pclass,0.088428
2,SibSp,0.050777
3,Embarked,0.032224


In [21]:
df_test = pd.read_csv("data/titanic-test.csv")
df_test["Age"] = preprocessing.Imputer(strategy="median").fit_transform(df_test[["Age"]])
df_test["Sex"] = label_encoder.fit_transform(df_test["Sex"])
df_test["Embarked"] = label_encoder.fit_transform(df_test["Embarked"])

In [23]:
df_test[features].isnull().sum() / len(df_test)

Sex         0.000000
Pclass      0.000000
SibSp       0.000000
Embarked    0.000000
Age         0.000000
Fare        0.002392
dtype: float64

In [24]:
df_test["Fare"] = preprocessing.Imputer(strategy="median").fit_transform(df_test[["Fare"]])

In [25]:
preds = rf.predict(X=df_test[features])

In [27]:
%ls data

h1b_kaggle.csv                        titanic-test.csv
subm_titanic_dtree-2.csv              titanic-train.csv
subm_titanic_dtree.csv                twolves-comments.csv
subm_titanic_logistic_regression.csv  [1m[36muber[m[m/
titanic-gender-submission.csv


In [28]:
subm = pd.DataFrame({
    "PassengerId": df_test["PassengerId"],
    "Survived": preds
})
subm.to_csv("data/subm_titanic_rf.csv", index=False)