In [1]:
import numpy as np
import pandas as pd

In [2]:
data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
ground_truth = pd.read_csv('gender_submission.csv')


In [3]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
from sklearn.preprocessing import LabelEncoder

# Encode Sex and Embarked values to numerical form
encoder = LabelEncoder()
encoder.fit(data['Sex']) 

embark_encoder = LabelEncoder()
embark_encoder.fit(data['Embarked'])

In [6]:

# Preprocessing
def preprocessing(data):
    # Remove NaN value in Embarked 
    data = data.dropna(subset=["Embarked"], axis=0)

    data['Sex'] = encoder.transform(data['Sex'])

    data['Embarked'] = embark_encoder.transform(data['Embarked'])

    # Fill missing Age
    data['Age'] = data["Age"].fillna(data.Age.median())

    discard_feature = ['Name', "Ticket", "Fare", "Cabin", "PassengerId", "Survived"]

    # Remove unnecesary feature and dependent variable (y)
    x_train = data.drop(discard_feature, axis=1)
    y_train = data['Survived']

    return x_train, y_train

In [7]:
x_train, y_train = preprocessing(data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Sex'] = encoder.transform(data['Sex'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Embarked'] = embark_encoder.transform(data['Embarked'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Age'] = data["Age"].fillna(data.Age.median())


In [8]:
#Train Model
from sklearn.linear_model import LogisticRegression

#1. Logistic Regression
logRes = LogisticRegression()
logRes.fit(x_train, y_train)

In [9]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [10]:
def test_preprocess(data):
    # Fill empty Age values with median of age
    data["Age"] = data["Age"].fillna(data.Age.median())

    # encode "Sex" and "Embarked"
    data["Sex"] = encoder.transform(data["Sex"])
    data["Embarked"] = embark_encoder.transform(data["Embarked"])

    # Remove unnecessary features
    discard_features = ["PassengerId", "Name", "Ticket", "Fare", "Cabin"]
    x_test = data.drop(discard_features, axis=1)
    return x_test
    

In [11]:
x_test = test_preprocess(test_data)

In [12]:
y_test = logRes.predict(x_test)
y_true = np.array(ground_truth.Survived)

In [13]:
from sklearn.metrics import accuracy_score

def evaluate(y_true, y_test):
    score = accuracy_score(y_true, y_test)
    return score

logRes_score = evaluate(y_true, y_test)
logRes_score

0.9282296650717703

In [14]:
from sklearn.tree import DecisionTreeClassifier

dec_tree = DecisionTreeClassifier(random_state=1)
dec_tree.fit(x_train, y_train)

In [15]:
dec_tree_pred = dec_tree.predict(x_test)

In [16]:
dec_tree_score = evaluate(y_true, dec_tree_pred)
dec_tree_score

0.7033492822966507

In [22]:
from sklearn.ensemble import RandomForestClassifier

rdn_for = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
rdn_for.fit(x_train, y_train)

In [23]:
rdn_for_pred = rdn_for.predict(x_test)
rdn_for_pred

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [24]:
rdn_for_score = evaluate(y_true, rdn_for_pred)
rdn_for_score

0.9282296650717703

In [20]:
def generate_output(data, y_pred):

    output_df = pd.DataFrame({
        'PassengerId' : data.PassengerId,
        'Survived' : y_pred
    })

    return output_df

In [21]:
output = generate_output(test_data, y_test)

output

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0
