# Dataset and Library Loading

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('ggplot')            #popular R style of plots
#print(plt.style.available)   To view all available styles
from collections import Counter
#Sci-Kit Library
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
#Pytorch
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.autograd import Variable


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# Read the three competition CSVs (training set, test set, sample submission)
# from the read-only Kaggle input directory in a single pass.
df_train, df_test, df_sub = map(
    pd.read_csv,
    (
        '../input/titanic/train.csv',
        '../input/titanic/test.csv',
        '../input/titanic/gender_submission.csv',
    ),
)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Preprocessing dataset for model

* Drop redundant columns
* One hot encoding categorical variables
* Impute (fill in missing data with a suitable value) the columns that still contain nulls
* Feature Engineering
    * Combine SibSp and Parch into a single Family variable
    * Drop the now-redundant SibSp and Parch
* Scale both train and test data for linear models
* Randomly shuffle the training set and separate the features from the target

In [None]:
# df_train.info()
# Columns not used as features: PassengerId (similar to the index), Name,
# Ticket, and Cabin (too many null values).
UNUSED_COLUMNS = ['PassengerId', 'Name', 'Ticket', 'Cabin']


def _drop_unused_and_encode(df):
    """Return a new frame without the unused columns and with Sex/Embarked one-hot encoded.

    Sex and Embarked are categorical and must be one-hot encoded before the
    model can consume them. ``drop_first=True`` omits the first category of
    each variable to keep the encoding compact. Once encoded, the original
    Sex and Embarked columns are redundant and are dropped; the dummy columns
    are appended after the remaining original columns (Sex dummies first,
    then Embarked dummies).

    The same function is applied to the train and test frames so the two
    cannot drift apart. The input frame is not modified.
    """
    sex = pd.get_dummies(df['Sex'], drop_first=True)
    embark = pd.get_dummies(df['Embarked'], drop_first=True)
    kept = df.drop(columns=UNUSED_COLUMNS + ['Sex', 'Embarked'])
    return pd.concat([kept, sex, embark], axis=1)


df_train = _drop_unused_and_encode(df_train)
df_test = _drop_unused_and_encode(df_test)

In [None]:
# df_train.info() shows Age still has nulls; df_test.info() shows Age and Fare do.
# Both frames are imputed with the column means of the TRAINING set, so that
# the test set goes through exactly the same preprocessing the model was fit
# on (no statistics are taken from the test data). Extra entries in the means
# (e.g. the target column) are simply ignored for frames lacking that column.
train_means = df_train.mean(numeric_only=True)
df_train = df_train.fillna(train_means)
df_test = df_test.fillna(train_means)

In [None]:
def _add_family_flag(df):
    """Return a new frame where SibSp and Parch are replaced by a binary Family column.

    Family is 1 when the passenger has at least one sibling/spouse or
    parent/child aboard (SibSp + Parch > 0) and 0 otherwise. The new column is
    appended last and the input frame is not modified.
    """
    has_family = (df['SibSp'] + df['Parch']) > 0
    return df.drop(columns=['SibSp', 'Parch']).assign(Family=has_family.astype('int64'))


# Same transformation for both frames so their feature columns stay aligned.
df_train = _add_family_flag(df_train)
df_test = _add_family_flag(df_test)

In [None]:
# Standardise the FEATURES only.
# * The scaler is fit on the training features and then re-used for the test
#   features; fitting a second scaler on the test set would put train and test
#   on different scales and leak test statistics into preprocessing.
# * The 'Survived' target is left untouched so it stays an integer class label.
Scaler1 = StandardScaler()   # instantiate StandardScaler object
Scaler2 = Scaler1            # name kept for compatibility: test data uses the train-fitted scaler

train_columns = df_train.columns
test_columns = df_test.columns
feature_columns = [col for col in train_columns if col != 'Survived']

# fit_transform/transform return bare arrays, so the column headers (and the
# index) are re-attached when rebuilding the frames.
train_features = pd.DataFrame(
    Scaler1.fit_transform(df_train[feature_columns]),
    columns=feature_columns,
    index=df_train.index,
)
# Target first, features after: later cells rely on 'Survived' being column 0.
df_train = pd.concat([df_train[['Survived']], train_features], axis=1)

# Selecting by feature_columns also guarantees the test columns are in the
# same order as the training features.
df_test = pd.DataFrame(
    Scaler1.transform(df_test[feature_columns]),
    columns=feature_columns,
    index=df_test.index,
)

In [None]:
# `target` is the name of the label column (the string "Survived");
# `features` is the list of every other column name. Selecting by name rather
# than by position keeps this correct even if the column order changes.
target = 'Survived'
features = [col for col in df_train.columns if col != target]

# Shuffle the rows once (a second shuffle adds no extra randomness) and drop
# the old index. The fixed seed makes the row order reproducible across runs.
df_train = df_train.sample(frac=1, axis=0, random_state=42).reset_index(drop=True)

X_train = df_train[features].values
y_train = df_train[target].values

# After preprocessing, the 7 features trained on are expected to be
# ['Pclass', 'Age', 'Fare', 'male', 'Q', 'S', 'Family'];
# the y values are the values of Survived.

In [None]:
# Preview the first rows of the processed training frame instead of rendering the whole table
df_train.head()

# PyTorch Logistic Regression 


* Model Definition and Instantiation 
* Loss Function
* Optimizer Function
* Training
* Prediction



In [None]:
# Define the neural-network model
class Net(nn.Module):
    """Feed-forward classifier: two ReLU hidden layers with dropout, then a linear output layer.

    Architecture (3 fully connected layers, 2 of them hidden):
        input (n_features) -> n_hidden -> n_hidden -> output (n_classes)
    Dropout is applied after each hidden layer.

    Args:
        n_features: number of input features per sample (default 7).
        n_hidden: width of both hidden layers (default 512).
        n_classes: number of output classes (default 2).
        p_drop: dropout probability applied after each hidden layer (default 0.5).

    The defaults reproduce the original fixed 7 -> 512 -> 512 -> 2 network, so
    ``Net()`` behaves exactly as before.
    """

    def __init__(self, n_features=7, n_hidden=512, n_classes=2, p_drop=0.5):
        super().__init__()
        self.fc1 = nn.Linear(n_features, n_hidden)
        self.fc2 = nn.Linear(n_hidden, n_hidden)
        self.fc3 = nn.Linear(n_hidden, n_classes)
        self.dropout = nn.Dropout(p_drop)

    def forward(self, x):
        """Forward pass; returns the raw (un-normalised) class scores."""
        x = F.relu(self.fc1(x))      # Input --> Hidden 1
        x = self.dropout(x)          # Dropout on Hidden 1
        x = F.relu(self.fc2(x))      # Hidden 1 --> Hidden 2
        x = self.dropout(x)          # Dropout on Hidden 2
        x = self.fc3(x)              # Hidden 2 --> Output (no softmax: the loss applies it)
        return x


# Instantiate the network with its default sizes and display its layers
model = Net()
print(model)

In [None]:
# Plain SGD over every model parameter; cross-entropy loss on the two-class output scores.
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.06)
criterion = nn.CrossEntropyLoss()

In [None]:
# Mini-batch gradient descent over the training set.
batch_size = 9                                            # rows per mini-batch
n_epochs = 500
n_samples = len(X_train)                                  # 891 according to the original note
batch_no = (n_samples + batch_size - 1) // batch_size     # ceiling division: a trailing partial batch is kept

# Convert the arrays to tensors once instead of re-wrapping every slice.
# (torch.autograd.Variable is deprecated; tensors are used directly.)
X_tensor = torch.FloatTensor(X_train)
y_tensor = torch.LongTensor(y_train)

losses = []
train_loss = 0
train_loss_min = np.inf                                   # np.Inf was removed in NumPy 2.0

model.train()                                             # make sure dropout is active while training

for epoch in range(n_epochs):                             # one epoch = one pass over all of X_train
    epoch_loss = 0.0                                      # reset every epoch so averages do not carry over
    num_right = 0                                         # correct predictions accumulated over the epoch

    for i in range(batch_no):

        # Define the mini-batch (slicing clamps `end` for a short final batch)
        start = i * batch_size
        end = start + batch_size
        x_var = X_tensor[start:end]
        y_var = y_tensor[start:end]

        # Forward pass, back-propagation and parameter update
        optimizer.zero_grad()
        output = model(x_var)
        loss = criterion(output, y_var)
        loss.backward()
        optimizer.step()

        # Running statistics. The criterion returns the batch mean, so it is
        # weighted by the actual batch length before summing.
        values, labels = torch.max(output, 1)
        num_right += (labels == y_var).sum().item()
        epoch_loss += loss.item() * len(y_var)

    # Per-sample averages for this epoch. The accuracy is a running figure
    # measured with dropout active while the weights are still being updated.
    train_loss = epoch_loss / n_samples
    train_acc = num_right / n_samples

    if train_loss <= train_loss_min:
        # This is the TRAINING loss; no validation set is used here.
        print("Training loss decreased ({:6f} ===> {:6f}) at epoch {}".format(train_loss_min, train_loss, epoch + 1))
        train_loss_min = train_loss

    if epoch % 200 == 0:
        print('')
        print("Epoch: {} \tTrain Loss: {} \tTrain Accuracy: {}".format(epoch + 1, train_loss, train_acc))

    losses.append(train_loss)

print('\nTraining Ended!\nFinal training loss is : {} \nFinal Accuracy is : {}'.format(losses[-1], train_acc))

In [None]:
# Training-loss curve: one point per epoch, drawn in red and also saved to disk.
fig, axes = plt.subplots(figsize=(12, 3), dpi=200)

epoch_axis = range(n_epochs)
axes.plot(epoch_axis, losses, color='r')
axes.set(xlabel='Epoch', ylabel='Cross Entropy Loss')

fig.savefig("Training Loss.png", dpi=200)

In [None]:
# Predict the class of every row of the test set.
X_test = df_test.values
X_test_var = torch.FloatTensor(X_test)

# Switch to evaluation mode so the dropout layers are disabled; otherwise
# random units are zeroed during inference and the predictions become noisy
# and non-deterministic. torch.no_grad() only stops gradient tracking, it
# does not change dropout behaviour.
model.eval()
with torch.no_grad():
    test_result = model(X_test_var)
    values, labels = torch.max(test_result, 1)   # labels = index of the highest score per row
survived = labels.numpy()

# Submission

In [None]:
# Pair the PassengerId column of the sample submission with the predicted
# labels and write the result without the index column.
submission = df_sub[['PassengerId']].assign(Survived=survived)
submission.to_csv('submission.csv', index=False)

In [None]:
# Scratch check: first five rows of the training feature matrix
X_train[0:5]

In [None]:
# Scratch check: first five rows of the processed training frame
df_train.head()

In [None]:
# Scratch check: predicted class per row of the last training batch, compared
# element-wise with that batch's integer labels. `numpy` is a method and must
# be called; comparing the bound method itself with an array is meaningless.
# `y_var` (the label tensor of the same batch) is used as in the training loop.
values, label = torch.max(output, 1)
label.data.numpy() == y_var.numpy()

In [None]:
# Scratch check: raw model outputs for the last training batch
output

In [None]:
# Scratch check: predicted class indices as a NumPy array
# (the call parentheses are required; without them only the bound method is shown)
label.data.numpy()

In [None]:
# Scratch check: integer label tensor for the first 64 training rows, viewed as a NumPy array
first_labels = y_train[:64]
y_var = torch.LongTensor(first_labels)
y_var.numpy()

In [None]:
# Scratch check: number of correct predictions in the last training batch.
# `labels` is reassigned to the test-set predictions by the prediction cell,
# so the batch predictions are recomputed from `output` to keep both sides
# the same length. Labels are converted to integers the same way the training
# loop does it.
batch_pred = torch.max(output, 1)[1].numpy()
batch_true = torch.LongTensor(y_train[start:end]).numpy()
num_right = np.sum(batch_pred == batch_true)
# Show both lengths and the count together (only the last expression of a cell is displayed)
len(batch_pred), len(batch_true), num_right

In [None]:
# Small toy table (weekday, colour, number) used to demonstrate row shuffling
data = dict(
    day=['Mon', 'Tues', 'Wed', 'Thurs', 'Fri'],
    color=['Blue', 'Red', 'Green', 'Yellow', 'Black'],
    Number=[11, 8, 10, 15, 11],
)

dataframe = pd.DataFrame.from_dict(data)

In [None]:
# Display the toy frame before shuffling
dataframe

In [None]:
# frac=1 samples every row, i.e. a full shuffle; the old index is then
# discarded so the rows are renumbered from 0.
shuffled_rows = dataframe.sample(frac=1, axis=0)
dataframe = shuffled_rows.reset_index(drop=True)
dataframe

In [None]:
# Display again: the name was rebound to the shuffled frame in the previous cell
dataframe