In [1]:
# importing the libraries 
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Read the training set and the prediction set from CSV, then preview the
# first rows of each frame.
train_data = pd.read_csv('train.csv')
pred_data = pd.read_csv('test.csv')
print(train_data.head(), pred_data.head(), sep='\n')

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
  

In [3]:
# Keep an independent copy of the passenger ids of the prediction set; they are
# needed later to build the submission table.
passengerId = pred_data['PassengerId'].to_numpy(copy=True)

In [4]:
# Count the NaN values per column, first for the training set and then for the
# prediction set (a whitespace-only line separates the two reports).
print(train_data.isna().sum(), "    ", pred_data.isna().sum(), sep="\n")
# 'Cabin' is missing in most rows, so this column is dropped before the model
# is created.

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64
    
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [5]:
# Drop the columns that are not used as features or that contain a large
# number of NaN values; the same columns are removed from both datasets.
unused_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin']
train_data = train_data.drop(columns=unused_columns)
print(train_data.head())
pred_data = pred_data.drop(columns=unused_columns)
print(pred_data.head())

   Survived  Pclass     Sex   Age  SibSp  Parch     Fare Embarked
0         0       3    male  22.0      1      0   7.2500        S
1         1       1  female  38.0      1      0  71.2833        C
2         1       3  female  26.0      0      0   7.9250        S
3         1       1  female  35.0      1      0  53.1000        S
4         0       3    male  35.0      0      0   8.0500        S
   Pclass     Sex   Age  SibSp  Parch     Fare Embarked
0       3    male  34.5      0      0   7.8292        Q
1       3  female  47.0      1      0   7.0000        S
2       2    male  62.0      0      0   9.6875        Q
3       3    male  27.0      0      0   8.6625        S
4       3  female  22.0      1      1  12.2875        S


In [6]:
# The 'Age' column cannot be dropped, and removing every row with a missing age
# would lose a lot of data, so the gaps are filled with the column mean of the
# respective dataset.
# The filled column is assigned back to the frame instead of calling
# fillna(inplace=True) on the selected column: that chained in-place form acts
# on an intermediate object and does not update the frame when pandas
# Copy-on-Write is active.
# for training set
mean_value = train_data['Age'].mean()
train_data['Age'] = train_data['Age'].fillna(mean_value)
# for prediction set
mean_value = pred_data['Age'].mean()
pred_data['Age'] = pred_data['Age'].fillna(mean_value)

In [7]:
# Fill the single missing 'Fare' value of the prediction set with the column
# mean. The result is assigned back to the frame because a chained
# fillna(inplace=True) on the selected column does not update the frame when
# pandas Copy-on-Write is active.
mean_value = pred_data['Fare'].mean()
pred_data['Fare'] = pred_data['Fare'].fillna(mean_value)

In [8]:
# Re-count the NaN values of both datasets after the imputation, then display
# the prediction frame as the cell result.
print(train_data.isna().sum(), "       ", pred_data.isna().sum(), sep="\n")
pred_data

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    2
dtype: int64
       
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,34.50000,0,0,7.8292,Q
1,3,female,47.00000,1,0,7.0000,S
2,2,male,62.00000,0,0,9.6875,Q
3,3,male,27.00000,0,0,8.6625,S
4,3,female,22.00000,1,1,12.2875,S
...,...,...,...,...,...,...,...
413,3,male,30.27259,0,0,8.0500,S
414,1,female,39.00000,0,0,108.9000,C
415,3,male,38.50000,0,0,7.2500,S
416,3,male,30.27259,0,0,8.0500,S


In [9]:
# Only 'Embarked' still has NaN values in the training set (2 rows), so
# removing every row that contains any NaN loses just those two rows.
train_data = train_data.dropna(axis='index', how='any')
train_data

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.000000,1,0,7.2500,S
1,1,1,female,38.000000,1,0,71.2833,C
2,1,3,female,26.000000,0,0,7.9250,S
3,1,1,female,35.000000,1,0,53.1000,S
4,0,3,male,35.000000,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.000000,0,0,13.0000,S
887,1,1,female,19.000000,0,0,30.0000,S
888,0,3,female,29.699118,1,2,23.4500,S
889,1,1,male,26.000000,0,0,30.0000,C


In [10]:
# Separate the label vector from the features: the first column ('Survived')
# becomes y, every column after it stays in the feature frame that is encoded
# in the next step.
feature_columns = train_data.columns[1:]
train_X_data = train_data[feature_columns]
y = train_data['Survived'].to_numpy(copy=True)
train_X_data

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,22.000000,1,0,7.2500,S
1,1,female,38.000000,1,0,71.2833,C
2,3,female,26.000000,0,0,7.9250,S
3,1,female,35.000000,1,0,53.1000,S
4,3,male,35.000000,0,0,8.0500,S
...,...,...,...,...,...,...,...
886,2,male,27.000000,0,0,13.0000,S
887,1,female,19.000000,0,0,30.0000,S
888,3,female,29.699118,1,2,23.4500,S
889,1,male,26.000000,0,0,30.0000,C


In [11]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns at positions 0, 1 and -1 (Pclass, Sex
# and Embarked) and pass the remaining numeric columns through unchanged.
# The transformer is fitted on the training features only and then re-used for
# the prediction set. Fitting a second encoder on the prediction set would
# derive its own category list, so the encoded columns could silently end up in
# a different layout than the one the model was trained on.
ct_train = ColumnTransformer(transformers=[('encoder', OneHotEncoder(), [0, 1, -1])], remainder='passthrough')
train_X_data = np.array(ct_train.fit_transform(train_X_data))
# ct_pred is kept as an alias of the fitted transformer so that any code
# referring to that name keeps working.
ct_pred = ct_train
pred_data = np.array(ct_pred.transform(pred_data))

In [12]:
# Wrap the encoded arrays in DataFrames again so their first rows can be
# previewed.
train_X_data = pd.DataFrame(data=train_X_data)
pred_data = pd.DataFrame(data=pred_data)
print(train_X_data.head(), "     ")
print(pred_data.head())

     0    1    2    3    4    5    6    7     8    9   10       11
0  0.0  0.0  1.0  0.0  1.0  0.0  0.0  1.0  22.0  1.0  0.0   7.2500
1  1.0  0.0  0.0  1.0  0.0  1.0  0.0  0.0  38.0  1.0  0.0  71.2833
2  0.0  0.0  1.0  1.0  0.0  0.0  0.0  1.0  26.0  0.0  0.0   7.9250
3  1.0  0.0  0.0  1.0  0.0  0.0  0.0  1.0  35.0  1.0  0.0  53.1000
4  0.0  0.0  1.0  0.0  1.0  0.0  0.0  1.0  35.0  0.0  0.0   8.0500      
     0    1    2    3    4    5    6    7     8    9   10       11
0  0.0  0.0  1.0  0.0  1.0  0.0  1.0  0.0  34.5  0.0  0.0   7.8292
1  0.0  0.0  1.0  1.0  0.0  0.0  0.0  1.0  47.0  1.0  0.0   7.0000
2  0.0  1.0  0.0  0.0  1.0  0.0  1.0  0.0  62.0  0.0  0.0   9.6875
3  0.0  0.0  1.0  0.0  1.0  0.0  0.0  1.0  27.0  0.0  0.0   8.6625
4  0.0  0.0  1.0  1.0  0.0  0.0  0.0  1.0  22.0  1.0  1.0  12.2875


In [13]:
# Convert both feature tables to plain NumPy arrays; the prediction matrix is
# displayed as the cell result.
train_X_data, X_pred = np.array(train_X_data), np.array(pred_data)
X_pred

array([[ 0.    ,  0.    ,  1.    , ...,  0.    ,  0.    ,  7.8292],
       [ 0.    ,  0.    ,  1.    , ...,  1.    ,  0.    ,  7.    ],
       [ 0.    ,  1.    ,  0.    , ...,  0.    ,  0.    ,  9.6875],
       ...,
       [ 0.    ,  0.    ,  1.    , ...,  0.    ,  0.    ,  7.25  ],
       [ 0.    ,  0.    ,  1.    , ...,  0.    ,  0.    ,  8.05  ],
       [ 0.    ,  0.    ,  1.    , ...,  1.    ,  1.    , 22.3583]])

In [14]:
from sklearn.model_selection import train_test_split

# Hold out 1% of the labelled rows as a test split; the fixed random_state
# makes the split reproducible.
split = train_test_split(
    train_X_data,
    y,
    test_size=0.01,
    random_state=1,
)
X_train, X_test, y_train, y_test = split

In [15]:
# Shapes of the train/test splits and of the prediction table, one per line.
print(
    X_train.shape,
    y_train.shape,
    X_test.shape,
    y_test.shape,
    pred_data.shape,
    sep='\n',
)

(880, 12)
(880,)
(9, 12)
(9,)
(418, 12)


X_TRAIN AND Y_TRAIN ARE READY. NOW WE HAVE TO APPLY AN ALGORITHM TO CREATE A MODEL

In [16]:
# Prepend a bias column of ones to every feature matrix and transpose the
# result, so that each column is one sample and each row one feature.
# The row counts are read from the arrays themselves instead of being
# hard-coded (880 / 9), so this cell keeps working when the dataset size or the
# split ratio changes.
X_train = np.concatenate((np.ones((X_train.shape[0], 1)), X_train), axis=1).T
X_test = np.concatenate((np.ones((X_test.shape[0], 1)), X_test), axis=1).T
X_pred = np.concatenate((np.ones((X_pred.shape[0], 1)), X_pred), axis=1).T
# The labels become a (1, m) row vector to line up with the (1, m) output of
# the hypothesis function.
y_train = y_train.reshape((1, -1))

In [17]:
# Shapes after adding the bias row and transposing, one per line.
print(X_train.shape, X_test.shape, X_pred.shape, sep='\n')

(13, 880)
(13, 9)
(13, 418)


In [None]:
def hypothesis(W,X):
    """Return the logistic (sigmoid) activation of the linear scores W.T @ X.

    W is a column vector of weights and X holds one sample per column, so the
    result has one row with one value per sample.
    """
    exponent = np.matmul(-W.T, X)
    return 1/(1+np.exp(exponent))

In [None]:
def compute_cost(X,y,W):
    """Return the mean binary cross-entropy of the predictions for (X, y).

    X holds one sample per column (the same layout `hypothesis` and `grad`
    work with), y holds the 0/1 labels and W is the weight column vector.
    The sum of the per-sample losses is averaged over the number of samples,
    i.e. the number of columns of X; dividing by X.shape[0] would divide by
    the number of features instead.
    """
    m = X.shape[1]
    # Evaluate the sigmoid once and reuse it for both terms of the loss.
    h = hypothesis(W,X)
    return -(1/m)*np.sum(y*np.log(h) + (1-y)*np.log(1-h))

In [None]:
def grad(X,W,y):
    """Return the gradient of the summed cross-entropy loss w.r.t. W.

    X holds one sample per column, y holds the 0/1 labels and W is the weight
    column vector. The result is a 1-D array with one entry per row of X.

    The per-sample contributions are summed, not averaged, so the step size
    used with this gradient has to account for the number of samples.
    """
    # (prediction - label) is broadcast over the rows of X, then the
    # per-sample contributions are summed along the sample axis.
    residual = hypothesis(W,X)-y
    return np.sum(residual*X,axis=1)

In [None]:
def update(W,step_size,X,y):
    """Return the weights after one gradient-descent step of size step_size.

    The gradient from `grad` is reshaped to the shape of W before it is
    subtracted; the W passed in is not modified.
    """
    n_rows, n_cols = W.shape[0], W.shape[1]
    direction = grad(X,W,y).reshape((n_rows, n_cols))
    return W - step_size*direction

In [None]:
# Train the weights with batch gradient descent on the training split.
np.random.seed(0)
# One small random weight per row of X_train (bias row included); the count is
# read from the matrix instead of being hard-coded to 13.
W = np.random.rand(X_train.shape[0],1)*0.001
for i in range(0,100001):
    W = update(W,0.000005,X_train,y_train)
    # Report the current weights and cost every 10000 iterations.
    if(i%10000==0):
        print("W = ",W)
        print("cost after ",i," iterations is ",compute_cost(X_train,y_train,W))

In [None]:
# predictions on test set: the first row holds the predicted probabilities, the
# second row the true labels. The label row is reshaped with -1 so the cell
# does not depend on a hard-coded test-set size of 9.
np.concatenate((hypothesis(W,X_test),y_test.reshape((1,-1))),axis=0)

In [None]:
# predictions on training set
pred_train = hypothesis(W,X_train)
pred_train = pred_train.reshape((pred_train.shape[1],))
# Threshold the predicted probabilities at 0.5 to obtain 0/1 class labels.
ans_train = [1 if p >= 0.5 else 0 for p in pred_train]
# Count the samples whose predicted label differs from the true label.
# The labels are compared with y_train, not with the probabilities they were
# derived from: the latter only measures how far the probabilities are from
# 0/1 and says nothing about wrong predictions.
wrong_train = int(np.sum(np.array(ans_train) != np.asarray(y_train).reshape(-1)))
print(wrong_train,"predictions are wrong in training set")

In [None]:
# Predicted probabilities for the prediction set, flattened to one value per
# passenger and thresholded at 0.5 into 0/1 labels.
pred = hypothesis(W,X_pred)
pred = pred.reshape((pred.shape[1],))
ans = [1 if probability >= 0.5 else 0 for probability in pred]

In [None]:
# Turn the passenger ids and the predicted labels into column vectors of equal
# length so they can be placed side by side.
s = len(ans)
column_shape = (s, 1)
passengerId = np.array(passengerId).reshape(column_shape)
ans = np.array(ans).reshape(column_shape)

In [None]:
# Place the id column and the prediction column side by side.
submit = np.concatenate([passengerId, ans], axis=1)

In [None]:
# Build the submission table with its two named columns and display it.
submission = pd.DataFrame(submit, columns=['PassengerId','Survived'])
submission

In [None]:
# Write only the PassengerId and Survived columns; without index=False pandas
# would add the row index as an extra unnamed first column to the file.
submission.to_csv("submission1.csv", index=False)