## Titanic Competition Tutorial

In [81]:
import pandas as pd
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [82]:
# Labelled training set. A forward slash keeps the relative path valid on
# Windows, Linux and macOS alike (a backslash only works on Windows).
train_data = pd.read_csv("data/train.csv")

In [83]:
# Competition test set. A forward slash keeps the relative path valid on
# Windows, Linux and macOS alike (a backslash only works on Windows).
test_data = pd.read_csv("data/test.csv")

In [84]:
# Preview the first five rows of the training table.
train_data.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [85]:
# Survival flags (0/1) of the female passengers, selected in one .loc step
# instead of chained indexing.
women = train_data.loc[train_data["Sex"] == "female", "Survived"]
# The mean of a 0/1 column is the fraction of ones; computed vectorized
# rather than by iterating the Series with the builtin sum().
rate_women = women.mean()
print(f"Percent of women survived: {rate_women}")

Percent of women survived: 0.7420382165605095


In [86]:
# Survival flags (0/1) of the male passengers, selected in one .loc step
# instead of chained indexing.
men = train_data.loc[train_data["Sex"] == "male", "Survived"]
# The mean of a 0/1 column is the fraction of ones; computed vectorized
# rather than by iterating the Series with the builtin sum().
rate_men = men.mean()
print(f"Percent of men survived: {rate_men}")

Percent of men survived: 0.18890814558058924


In [87]:
def pre_process_data(data: pd.DataFrame, train_size=0.6, val_size=0.4, random_state=42):
    """Split a labelled table into train / validation / test partitions.

    The "Survived" column is used as the target. "Name" is excluded from the
    features, and the remaining columns are passed through ``pd.get_dummies``
    so that object/category columns are one-hot encoded.

    Args:
        data: Frame containing a "Survived" column plus the feature columns.
        train_size: Forwarded to ``train_test_split`` as ``train_size`` for
            the first split; the rows left over form the test partition.
        val_size: Forwarded to ``train_test_split`` as ``test_size`` for the
            second split, i.e. the share of the training side that is held
            out as the validation partition.
        random_state: Seed used for both shuffles.

    Returns:
        Tuple ``(X_train, Y_train, X_val, Y_val, X_test, Y_test)``.

    Raises:
        KeyError: If ``data`` has no "Survived" column.
    """
    Y = data["Survived"]
    feature_columns = [name for name in data.columns if name not in ("Survived", "Name")]
    X = pd.get_dummies(data[feature_columns])
    # First split: training side vs. held-out test partition.
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, train_size=train_size, random_state=random_state
    )
    # Second split: carve the validation partition out of the training side.
    X_train, X_val, Y_train, Y_val = train_test_split(
        X_train, Y_train, test_size=val_size, random_state=random_state
    )
    return X_train, Y_train, X_val, Y_val, X_test, Y_test

In [88]:
def build_tree(X_train, Y_train):
    """Fit a decision tree (max depth 5, seed 42) and return the fitted model."""
    classifier = DecisionTreeClassifier(max_depth=5, random_state=42)
    # fit() returns the estimator itself, so the fitted tree is returned directly.
    return classifier.fit(X_train, Y_train)

In [89]:
# Partition the labelled data into train / validation / hold-out test sets.
(
    X_train, Y_train,
    X_val, Y_val,
    X_test, Y_test,
) = pre_process_data(train_data)


In [90]:
# Fit the decision tree on the training partition only.
tree = build_tree(X_train=X_train, Y_train=Y_train)

In [None]:
# Evaluate the fitted tree on the validation partition.
predictions = tree.predict(X_val)
accuracy_score_metric = accuracy_score(y_true=Y_val, y_pred=predictions)
print(f"Accuracy Score: {accuracy_score_metric}")

Accuracy Score: 0.7476635514018691


In [92]:
# Target and feature matrices for the final model, trained on the full
# labelled set and applied to the competition test set.
Y = train_data["Survived"]

features = ["Pclass", "Sex", "SibSp", "Parch"]
X = pd.get_dummies(train_data[features])
# The two frames are one-hot encoded independently, so align the test columns
# to the training columns (same set, same order); a category absent from the
# test set becomes an all-zero column instead of breaking predict().
# NOTE: this rebinds X_test, which earlier held the hold-out split returned by
# pre_process_data.
X_test = pd.get_dummies(test_data[features]).reindex(columns=X.columns, fill_value=0)

In [93]:
# Random forest trained on every labelled row, then applied to the
# competition test features.
model = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=42,
)
model.fit(X, Y)
predictions = model.predict(X_test)

In [94]:
# Collect the predictions next to each passenger's id and sex.
output = pd.DataFrame(
    {
        "PassengerId": test_data["PassengerId"],
        "Sex": test_data["Sex"],
        "Survived": predictions,
    }
)
# Passenger counts per sex in the test set.
print(output["Sex"].value_counts())


male      266
female    152
Name: Sex, dtype: int64


In [95]:
# Write only the two submission columns, without the index.
output.to_csv("submission.csv", columns=["PassengerId", "Survived"], index=False)