# In [None]:  (Jupyter cell marker, kept as a comment so the file parses as Python)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# filter warnings that can be ignored
import warnings
warnings.filterwarnings('ignore')


# Load the Titanic passenger data. `data` keeps the frame exactly as read from
# disk; `training_set` is an independent working copy that the preprocessing
# steps below modify. (Previously `training_set` was used without ever being
# defined, which raised NameError.)
data = pd.read_csv('titanic.csv')
training_set = data.copy()
data.head()
# Number of rows and columns in the dataset.
training_set.shape
# Column headings.
training_set.columns
# Number of null values in each column.
training_set.isnull().sum()

# Dropping of columns
# In this step we drop the columns with the least priority for prediction:
# 'PassengerId' and 'Ticket'.

# The columns are named through the `columns=` keyword: pandas 2.0 removed the
# ability to pass `axis` positionally (the old `drop(labels, 1)` form).
training_set.drop(columns=['Ticket', 'PassengerId'], inplace=True)

training_set.info()

# 'Cabin': Although the Cabin column has 687 missing values, each non-missing value begins with a
# letter that denotes the deck. We therefore create a column named Deck to capture this
# information, which may be useful later in our prediction.

def assignDeckValue(CabinCode):
    """Return the deck label for a single cabin code.

    A missing cabin code (anything ``pd.isnull`` reports as null) yields
    ``'Unknown'``; otherwise the first character of the code is returned.
    """
    return 'Unknown' if pd.isnull(CabinCode) else CabinCode[0]
  
# Compute one deck label per row by applying assignDeckValue to each entry of
# the 'Cabin' column, then attach the result as a new 'Deck' column.
Deck = np.array(list(map(assignDeckValue, training_set['Cabin'].values)))

training_set = training_set.assign(Deck=Deck)

training_set

# 'Parch' and 'SibSp' both describe family members aboard, so combine them into
# a single 'FamilySize' column (the +1 presumably counts the passenger).
training_set['FamilySize'] = training_set['SibSp'] + training_set['Parch'] + 1

# 'Name': instead of dropping it right away, keep only the passenger's title.
# The pattern captures the run of letters that follows a space and is
# terminated by a period (e.g. " Mr."). It is written as a raw string so that
# `\.` reaches the regex engine as-is rather than being an invalid string
# escape, which newer Python versions warn about.
training_set['Title'] = training_set.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)

# Collapse the rare titles into a single 'Others' category ...
training_set['Title'] = training_set['Title'].replace(
    ['Dr', 'Rev', 'Col', 'Major', 'Countess', 'Sir', 'Jonkheer', 'Lady', 'Capt', 'Don'],
    'Others',
)
# ... and fold spelling variants into their common equivalent.
training_set['Title'] = training_set['Title'].replace({'Ms': 'Miss', 'Mme': 'Mrs', 'Mlle': 'Miss'})

training_set

# Drop the 'Cabin' and 'Name' columns now that the needed information (Deck and
# Title) has been extracted from them. `columns=` is used because pandas 2.0
# no longer accepts `axis` as a positional argument to drop().
training_set.drop(columns=['Cabin', 'Name'], inplace=True)

training_set

# Handling missing values
# 'Embarked' is categorical: C = Cherbourg; Q = Queenstown; S = Southampton.
# Only two rows are missing a value, so we simply impute them with the most
# commonly occurring value, which is 'S' in this case.

# Count of each category, to check which value is the most frequent.
training_set['Embarked'].value_counts()

# Fill null values with the most common category. The value is named once in
# `common` and reused, so the fill value cannot drift from the variable.
common = 'S'
training_set['Embarked'] = training_set['Embarked'].fillna(common)

# Number of null values remaining in the column.
training_set['Embarked'].isnull().sum()

# Encoding categorical features
# Many machine learning algorithms cannot work with categorical values until
# they are converted to numbers. pandas and scikit-learn offer several ways to
# do this; here each category is mapped to an integer code with Series.map().

# Column name -> {category: integer code}, applied in the order listed.
_CATEGORY_CODES = {
    'Embarked': {'C':0, 'Q':1, 'S':2},
    'Sex': {'male':0, 'female':1},
    'Title': {'Master':0,'Miss':1,'Mr':2,'Mrs':3,'Others':4},
}
for _column, _codes in _CATEGORY_CODES.items():
    training_set[_column] = training_set[_column].map(_codes)

# 'Cabin' and 'Name' are dropped earlier in this script, so by this point they
# are normally already gone and a plain drop() would raise KeyError.
# errors='ignore' makes this step a harmless no-op in that case while still
# removing the columns if they happen to be present. `columns=` replaces the
# positional axis argument that pandas 2.0 removed.
training_set.drop(columns=['Cabin', 'Name'], errors='ignore', inplace=True)

training_set

# Convert the 'Deck' labels to integer codes with scikit-learn's LabelEncoder.
# The encoder is fitted on the column first and then used to transform it, so
# `le` stays available afterwards for inspecting or inverting the encoding.

from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit(training_set['Deck'])
training_set['Deck'] = le.transform(training_set['Deck'])