## Imports

`pandas` for dataframe manipulation

`numpy` for array functionalities

`LogisticRegression` for the prediction model

`matplotlib.pyplot` for data visualization

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

## Load the training dataset

In [2]:
df_train = pd.read_csv('train.csv')
df_train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [21]:
df = df_train[['Survived','Sex','Pclass','Age','SibSp','Parch','Fare','Embarked']]

df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Embarked'] = df['Embarked'].fillna('S')

df[60:65]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Age'] = df['Age'].fillna(df['Age'].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Embarked'] = df['Embarked'].fillna('S')


Unnamed: 0,Survived,Sex,Pclass,Age,SibSp,Parch,Fare,Embarked
60,0,male,3,22.0,0,0,7.2292,C
61,1,female,1,38.0,0,0,80.0,S
62,0,male,1,45.0,1,0,83.475,S
63,0,male,3,4.0,3,2,27.9,S
64,0,male,1,29.699118,0,0,27.7208,C


## Select features

In [22]:
X = df[['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']]
y = df['Survived']

X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.0,1,0,7.25,S
1,1,female,38.0,1,0,71.2833,C
2,3,female,26.0,0,0,7.925,S
3,1,female,35.0,1,0,53.1,S
4,3,male,35.0,0,0,8.05,S


## Separate data into training and testing

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

## Change columns to hold numeric values and Scale down values

In [24]:
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

column_transformer = make_column_transformer(
    (OneHotEncoder(), ['Sex','Embarked']),
    (MinMaxScaler(), ['Pclass','Age','SibSp','Parch','Fare']),
    remainder='passthrough')

X_train = column_transformer.fit_transform(X_train)
X_train = pd.DataFrame(data=X_train, columns=column_transformer.get_feature_names_out())

X_train['onehotencoder__Sex_female'] *= 5
X_train['onehotencoder__Sex_male'] *= 5
X_train['minmaxscaler__Age'] *= 3


X_test = column_transformer.transform(X_test)
X_test = pd.DataFrame(data=X_test, columns=column_transformer.get_feature_names_out())

X_test['onehotencoder__Sex_female'] *= 5
X_test['onehotencoder__Sex_male'] *= 5
X_test['minmaxscaler__Age'] *= 3


## Create and fit a model

In [25]:
clf = LogisticRegression(random_state=1)
clf.fit(X_train,y_train)

## Make predictions

In [26]:
predictions = clf.predict(X_test)

print("Predictions:\n",predictions[:5])

print('\nTrue values:\n',y_test.head())

Predictions:
 [1 0 1 1 1]

True values:
 862    1
223    0
84     1
680    0
535    1
Name: Survived, dtype: int64


## Evaluate accuracy

In [27]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test,predictions)

accuracy

0.7937219730941704

## Make predictions on test file and export

In [35]:
df_test = pd.read_csv('test.csv')

df = df_test[['Sex','Pclass','Age','SibSp','Parch','Fare','Embarked']]

df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Fare'] = df['Fare'].fillna(df['Fare'].mean())

X = df[['Pclass','Sex','Age','SibSp','Parch','Fare','Embarked']]

X = column_transformer.fit_transform(X)
X = pd.DataFrame(data=X, columns=column_transformer.get_feature_names_out())

X['onehotencoder__Sex_female'] *= 5
X['onehotencoder__Sex_male'] *= 5
X['minmaxscaler__Age'] *= 3

predict_test = clf.predict(X)

predictions_df = pd.DataFrame(data={'Survived':predict_test},
                              index=df_test['PassengerId'])

predictions_df.to_csv('My_Predictions.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Age'] = df['Age'].fillna(df['Age'].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Fare'] = df['Fare'].fillna(df['Fare'].mean())
