In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Load the Kaggle Titanic train and test splits into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('/kaggle/input/titanic/train.csv',
                     '/kaggle/input/titanic/test.csv')
)

In [2]:
# Preview the first five rows of the training split.
train.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Preview the first five rows of the test split.
test.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
# Survived values of the training rows whose Sex is 'female'.
is_female = train['Sex'] == 'female'
female = train.loc[is_female, 'Survived']
# Sum of those values divided by their count; the printed number is a
# fraction, not a percentage.
rate_female = sum(female) / len(female)
print("% of women who survived:", rate_female)

% of women who survived: 0.7420382165605095


In [5]:
# Survived values of the training rows whose Sex is 'male'.
is_male = train['Sex'] == 'male'
male = train.loc[is_male, 'Survived']
# Sum of those values divided by their count; the printed number is a
# fraction, not a percentage.
rate_male = sum(male) / len(male)
print("% of men who survived:", rate_male)

% of men who survived: 0.18890814558058924


In [6]:
# Count nulls per training column and report only the columns that have any.
missing_val_count_by_column = train.isnull().sum()
has_missing = missing_val_count_by_column > 0
print('Columns with missing values:')
print(missing_val_count_by_column[has_missing])

Columns with missing values:
Age         177
Cabin       687
Embarked      2
dtype: int64


In [7]:
# Drop Ticket, Name and Cabin from both splits.
unused_cols = ['Ticket', 'Name', 'Cabin']
train = train.drop(columns=unused_cols)
test = test.drop(columns=unused_cols)

# Separate the target: discard rows without a Survived value, keep the
# column as y, then remove it from the feature frame.
train = train.dropna(subset=['Survived'])
y = train['Survived']
train = train.drop(columns=['Survived'])
train.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,3,male,22.0,1,0,7.25,S
1,2,1,female,38.0,1,0,71.2833,C
2,3,3,female,26.0,0,0,7.925,S
3,4,1,female,35.0,1,0,53.1,S
4,5,3,male,35.0,0,0,8.05,S


In [8]:
# Non-numerical (object-dtype) columns of the training frame and the number
# of distinct values each one holds.
object_cols = [col for col in train.columns if train[col].dtype == "object"]
object_nunique = [train[col].nunique() for col in object_cols]

# sorted() returns a new list and leaves its input untouched, so the result
# has to be captured: rebuild the dict from the pairs ordered by cardinality.
d = dict(sorted(zip(object_cols, object_nunique), key=lambda item: item[1]))
print(d)

{'Sex': 2, 'Embarked': 3}


In [9]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the non-numerical columns whose cardinality is below 10.
low_cardinality_cols = [col for col in object_cols if train[col].nunique() < 10]
print(low_cardinality_cols)

# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old spelling, so try the current name first and fall back to
# the legacy one on older releases.
try:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Fit on the training split only; the test split is transformed with the
# categories learned there. The arrays come back without an index, so the
# original row labels are attached to keep the concat below aligned.
encoded_train_cols = pd.DataFrame(
    encoder.fit_transform(train[low_cardinality_cols]), index=train.index)
encoded_test_cols = pd.DataFrame(
    encoder.transform(test[low_cardinality_cols]), index=test.index)

# Every object column is removed from the numerical part.
numerical_train = train.drop(object_cols, axis=1)
numerical_test = test.drop(object_cols, axis=1)

encoded_train = pd.concat([numerical_train, encoded_train_cols], axis=1)
encoded_test = pd.concat([numerical_test, encoded_test_cols], axis=1)

# The encoded columns are labelled with integers; cast all labels to str so
# the frame does not mix string and integer column names.
encoded_train.columns = encoded_train.columns.astype(str)
encoded_test.columns = encoded_test.columns.astype(str)

# Only the final expression of the cell is rendered, i.e. the test preview.
encoded_train.head()
encoded_test.head()

['Sex', 'Embarked']




Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,0,1,2,3,4,5
0,892,3,34.5,0,0,7.8292,0.0,1.0,0.0,1.0,0.0,0.0
1,893,3,47.0,1,0,7.0,1.0,0.0,0.0,0.0,1.0,0.0
2,894,2,62.0,0,0,9.6875,0.0,1.0,0.0,1.0,0.0,0.0
3,895,3,27.0,0,0,8.6625,0.0,1.0,0.0,0.0,1.0,0.0
4,896,3,22.0,1,1,12.2875,1.0,0.0,0.0,0.0,1.0,0.0


In [10]:
from sklearn.impute import SimpleImputer

# Report which encoded training columns still contain nulls.
missing_val_count_by_column = encoded_train.isnull().sum()
print('Columns with missing values BEFORE IMPUTER:')
print(missing_val_count_by_column[missing_val_count_by_column > 0])

# Fill missing values with a default-configured SimpleImputer, fitted on the
# training frame and then applied to the test frame. The transformed arrays
# are wrapped in DataFrames that reuse the column labels of their inputs.
imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(imputer.fit_transform(encoded_train),
                               columns=encoded_train.columns)
imputed_X_test = pd.DataFrame(imputer.transform(encoded_test),
                              columns=encoded_test.columns)

# Evaluated but not rendered: it is not the last expression of the cell.
imputed_X_train.head()

# Re-run the null count on the imputed training frame.
missing_val_count_by_column = imputed_X_train.isnull().sum()
print('Columns with missing values AFTER IMPUTER:')
print(missing_val_count_by_column[missing_val_count_by_column > 0])

Columns with missing values BEFORE IMPUTER:
Age    177
dtype: int64
Columns with missing values AFTER IMPUTER:
Series([], dtype: int64)


In [11]:
from sklearn.ensemble import RandomForestClassifier

# Fit a depth-limited random forest (fixed seed) on the imputed training
# features.
# NOTE(review): PassengerId is still one of the feature columns at this
# point; confirm that feeding a row identifier to the model is intended.
model_1 = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=1,
)
model_1.fit(imputed_X_train, y)

predictions = model_1.predict(imputed_X_test)

# Pair each test PassengerId with its predicted label.
submission = test['PassengerId']
output = pd.DataFrame({
    'PassengerId': submission,
    'Survived': predictions.squeeze(),
})

output.head(10)

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
5,897,0
6,898,1
7,899,0
8,900,1
9,901,0


In [12]:
# Rebuild the submission frame from the ids and predictions, then write it
# to submission.csv without the index column.
submission_columns = {'PassengerId': submission, 'Survived': predictions}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

Your submission was successfully saved!
