In [35]:
#importing important libraries and modules and defining dataframe
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
#adding the latin encoding for compatibility with the type of data set
df =pd.read_csv('tested.csv',encoding='latin-1')
print(df)

     PassengerId  Survived  Pclass  \
0            892         0       3   
1            893         1       3   
2            894         0       2   
3            895         0       3   
4            896         1       3   
..           ...       ...     ...   
413         1305         0       3   
414         1306         1       1   
415         1307         0       3   
416         1308         0       3   
417         1309         0       3   

                                             Name     Sex   Age  SibSp  Parch  \
0                                Kelly, Mr. James    male  34.5      0      0   
1                Wilkes, Mrs. James (Ellen Needs)  female  47.0      1      0   
2                       Myles, Mr. Thomas Francis    male  62.0      0      0   
3                                Wirz, Mr. Albert    male  27.0      0      0   
4    Hirvonen, Mrs. Alexander (Helga E Lindqvist)  female  22.0      1      1   
..                                            ...     ...

In [37]:
#checking for null_values
null_values = df.isnull().sum()
#count null values for eachn column
print(null_values)
#as shown there are 24 null values in the Income Column

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [39]:
#typically we would fill in these null values but since we're not using them in anyway we can just omit
#these columns when defining the data set

#encoding columns in dataframe because machine learning models only work with numbers
mappings = {
    'Sex': {'male': 1, 'female': 2},
    'Embarked': {'S': 1, 'Q': 2, 'C': 3}
}
for column, mapping in mappings.items():
    df[column] = df[column].map(mapping)

print(df)




     PassengerId  Survived  Pclass  \
0            892         0       3   
1            893         1       3   
2            894         0       2   
3            895         0       3   
4            896         1       3   
..           ...       ...     ...   
413         1305         0       3   
414         1306         1       1   
415         1307         0       3   
416         1308         0       3   
417         1309         0       3   

                                             Name  Sex   Age  SibSp  Parch  \
0                                Kelly, Mr. James    1  34.5      0      0   
1                Wilkes, Mrs. James (Ellen Needs)    2  47.0      1      0   
2                       Myles, Mr. Thomas Francis    1  62.0      0      0   
3                                Wirz, Mr. Albert    1  27.0      0      0   
4    Hirvonen, Mrs. Alexander (Helga E Lindqvist)    2  22.0      1      1   
..                                            ...  ...   ...    ...    ... 

In [53]:
#checking for null_values
null_values = df.isnull().sum()
#count null values for eachn column
print(null_values)
#as shown there are 24 null values in the Income Column

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [55]:
#Define training Set
#to avoid overfitting, the columns in the training set were carefully selected and determined by the correlation of these inputs and 
#survival
X =  df[['Pclass', 'Sex', 'SibSp', 'Parch', 'Embarked']].values  # the second column
y = df.iloc[:, 1].values   # the first column


# # Displaying the shapes of X and y to verify
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

print(X[0])
print(y[0])

Shape of X: (418, 5)
Shape of y: (418,)
[3 1 0 0 2]
0


In [57]:
#we see that the features of X are in different ranges so we scale the features of X so they can be in the same range
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x = scaler.fit_transform(X)


In [59]:
#splitting training set for model evaluation after fitting is done to check for bias(underfitting) or variance(overfitting)
from sklearn.model_selection import train_test_split

# Get 60% of the dataset as the training set. Put the remaining 40% in temporary variables: x_ and y_.
x_train, x_, y_train, y_ = train_test_split(x, y, test_size=0.40, random_state=1)

# Split the 40% subset above into two: one half for cross validation and the other for the test set
x_cv, x_test, y_cv, y_test = train_test_split(x_, y_, test_size=0.50, random_state=1)

print(f"the shape of the training set (input) is: {x_train.shape}")
print(f"the shape of the training set (target) is: {y_train.shape}\n")
print(f"the shape of the cross validation set (input) is: {x_cv.shape}")
print(f"the shape of the cross validation set (target) is: {y_cv.shape}\n")
print(f"the shape of the test set (input) is: {x_test.shape}")
print(f"the shape of the test set (target) is: {y_test.shape}")


the shape of the training set (input) is: (250, 5)
the shape of the training set (target) is: (250,)

the shape of the cross validation set (input) is: (84, 5)
the shape of the cross validation set (target) is: (84,)

the shape of the test set (input) is: (84, 5)
the shape of the test set (target) is: (84,)


In [61]:
#increasing the training set size because the more training examples we have, the better our algorithm learns the training set
#so we duplicate the input and target array by a 1000 times
Xt = np.tile(x_train,(100,1))
Yt= np.tile(y_train,(100))   
print(Xt.shape, Yt.shape)  

(25000, 5) (25000,)


In [63]:
from sklearn.linear_model import LogisticRegression
#creating the regression model
lr_model = LogisticRegression()
#fitting the logisitic regression model to the data
lr_model.fit(Xt, Yt)

In [65]:
#for model evaluation,we use this model paramters to predict the y labels for the training and cross validation set and checking for
#the percentage of errors(how close the prediction is to the target). this will help us identify if the model has overfit or underfit i.e if the cross validation
#has a high rate of error (much greater than the training set) then this means the model has high variance and has overfit the 
#set
#however if the train set has very high rate of error then the model has high bias and has underfit

#we then calculate the error for the train, cross-validation and test set with the fit model to check for 
#high bias(underfitting) and high variance(overfitting)
err_train = 1 -(lr_model.score(x_train, y_train))
err_cv = 1 -(lr_model.score(x_cv, y_cv))
err_test = 1 -(lr_model.score(x_test, y_test))

print("train error:", err_train)
print("cross validation error", err_cv)
print("test error", err_test)

train error: 0.0
cross validation error 0.0
test error 0.0


In [None]:
#this shows the model does very well on all the sets as it has an accuracy of 100% and thus an error of 0
#this means all the predictions were correct