In [33]:
import pandas as pd

In [34]:
df = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

In [35]:
print(df.shape)
print(df.size)

(891, 12)
10692


In [36]:
df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [37]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [39]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [40]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [41]:
def get_title(name):
    if '.' in name:
        return name.split(",")[1].split(".")[0].strip()
    else:
        return "No titles in name"

titles = sorted(set([x for x in df.Name.map(lambda x: get_title(x))]))
print("Different titles found on the dataset:")
print(len(titles), ':', titles)

Different titles found on the dataset:
17 : ['Capt', 'Col', 'Don', 'Dr', 'Jonkheer', 'Lady', 'Major', 'Master', 'Miss', 'Mlle', 'Mme', 'Mr', 'Mrs', 'Ms', 'Rev', 'Sir', 'the Countess']


In [42]:
def shorter_titles(x):
    title = x['Title']
    if title in ['Capt', 'Col', 'Major']:
        return 'Officer'
    elif title in ['Jonkheer', 'Don',  "the Countess", "Done", "Lady", "Sir"]:
        return "Royality"
    elif title == "Mme":
        return "Mrs"
    elif title in ["Mlle", "Ms"]:
        return "Miss"
    else:
        return title

In [43]:
# Creating
df['Title'] = df.Name.map(lambda x: get_title(x))

df['Title'] = df.apply(shorter_titles, axis=1)

In [44]:
df.sample()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
451,452,0,3,"Hagland, Mr. Ingvald Olai Olsen",male,,1,0,65303,19.9667,,S,Mr


In [45]:
df.drop("Name", axis=1, inplace=True)

In [46]:
# Completing - Remove null values
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
Title            0
dtype: int64

In [47]:
# Cabin more null values so drop
df.drop("Cabin", axis=1, inplace=True)

In [48]:
# Age - 177 null values so calculate the median and set

df['Age'].fillna(df['Age'].median(), inplace=True)
df['Fare'].fillna(df['Fare'].median(), inplace=True)


In [49]:
# Embarked - 2 replace null values with most occurence characters
df.Embarked.value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [50]:
df['Embarked'].fillna('S', inplace=True)

In [51]:
# Converting - all the values in the dataset should be in binary or numbers so change to 0, 1 - just replace
df.sample(10)



Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Title
299,300,1,1,female,50.0,0,1,PC 17558,247.5208,C,Mrs
68,69,1,3,female,17.0,4,2,3101281,7.925,S,Miss
174,175,0,1,male,56.0,0,0,17764,30.6958,C,Mr
326,327,0,3,male,61.0,0,0,345364,6.2375,S,Mr
798,799,0,3,male,30.0,0,0,2685,7.2292,C,Mr
420,421,0,3,male,28.0,0,0,349254,7.8958,C,Mr
862,863,1,1,female,48.0,0,0,17466,25.9292,S,Mrs
572,573,1,1,male,36.0,0,0,PC 17474,26.3875,S,Mr
6,7,0,1,male,54.0,0,0,17463,51.8625,S,Mr
330,331,1,3,female,28.0,2,0,367226,23.25,Q,Miss


In [52]:
# Sex - replace M-0 , F- 1
df.Sex.replace(('male', 'female'), (0, 1), inplace=True)
df.Embarked.replace(('S', 'C', 'Q'), (0, 1, 2), inplace=True)
df.Title.replace(("Mr", "Miss", "Mrs", "Master", "Dr", "Rev", "Royality", "Officer"), (0, 1, 2, 3, 4, 5, 6, 7), inplace=True)

# looks not in 0,1 not able to replace so drop
df.drop("Ticket", axis=1, inplace=True)

In [53]:
# Survived - Dependent Variable - Y
# Pclass	Sex	Age	SibSp	Parch	Fare	Embarked	Title  - Independent Variables - X

In [54]:
corr = df.corr()
corr.Survived.sort_values(ascending=False)

Survived       1.000000
Sex            0.543351
Title          0.359365
Fare           0.257307
Embarked       0.106811
Parch          0.081629
PassengerId   -0.005007
SibSp         -0.035322
Age           -0.064910
Pclass        -0.338481
Name: Survived, dtype: float64

In [55]:
from sklearn.model_selection import train_test_split

x = df.drop(['Survived', 'PassengerId'], axis=1)
# print(x)
y = df["Survived"]
# print(y)
# splitting 10% OF THE TRAINING data into (y_val,x_val) and 90% into training data(x_train, y_train)
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.1)
print(x_train)
print(y_train)

     Pclass  Sex    Age  SibSp  Parch      Fare  Embarked  Title
181       2    0  28.00      0      0   15.0500         1      0
322       2    1  30.00      0      0   12.3500         2      1
428       3    0  28.00      0      0    7.7500         2      0
721       3    0  17.00      1      0    7.0542         0      0
249       2    0  54.00      1      0   26.0000         0      5
..      ...  ...    ...    ...    ...       ...       ...    ...
134       2    0  25.00      0      0   13.0000         0      0
527       1    0  28.00      0      0  221.7792         0      0
874       2    1  28.00      1      0   24.0000         1      2
78        2    0   0.83      0      2   29.0000         0      3
223       3    0  28.00      0      0    7.8958         0      0

[801 rows x 8 columns]
181    0
322    1
428    0
721    0
249    0
      ..
134    0
527    0
874    1
78     1
223    0
Name: Survived, Length: 801, dtype: int64


In [56]:
# for Saving the model
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

randomforest = RandomForestClassifier()
# train my model
randomforest.fit(x_train, y_train)
# Predict using my x validation my y validation is accurate or not
y_pred = randomforest.predict(x_val)
print(y_pred)
# I'm just calculating the accurary myself using accuracy_score by taking y_pred and actual groupond truth - y_val * 100
acc_randomforest = round(accuracy_score(y_pred, y_val) * 100, 2)
print("Accuracy : {} ".format(acc_randomforest))
# wb - write binary
pickle.dump(randomforest, open('titanic_model.sav', 'wb'))

[0 1 0 1 0 1 1 0 0 1 1 0 1 1 0 1 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 1 1
 1 1 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 1 0 1 0 0 0 0 1 0 0 0 1 1 0 0 0]
Accuracy : 83.33 


In [57]:
df_test = pd.read_csv("test.csv")
df_test['Title'] = df_test['Name'].map(lambda x: get_title(x))

df_test['Title'] = df_test.apply(shorter_titles, axis=1)
ids = df_test['PassengerId']

df_test['Age'].fillna(df_test['Age'].median(), inplace=True)
df_test['Fare'].fillna(df_test['Fare'].median(), inplace=True)
df_test['Embarked'].fillna('S', inplace=True)

df_test.drop("Cabin", axis=1, inplace=True)
df_test.drop("Ticket", axis=1, inplace=True)
df_test.drop("Name", axis=1, inplace=True)
df_test.drop("PassengerId", axis=1, inplace=True)
df_test.Sex.replace(('male', 'female'), (0,1), inplace=True)
df_test.Embarked.replace(('S', 'C', 'Q'), (0, 1, 2), inplace=True)
df_test.Title.replace(("Mr", "Miss", "Mrs", "Master", "Dr", "Rev", "Royality", "Officer", "Dona"), (0, 1, 2, 3, 4, 5, 6, 7, 8), inplace=True)

df_test


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Title
0,3,0,34.5,0,0,7.8292,2,0
1,3,1,47.0,1,0,7.0000,0,2
2,2,0,62.0,0,0,9.6875,2,0
3,3,0,27.0,0,0,8.6625,0,0
4,3,1,22.0,1,1,12.2875,0,2
...,...,...,...,...,...,...,...,...
413,3,0,27.0,0,0,8.0500,0,0
414,1,1,39.0,0,0,108.9000,1,8
415,3,0,38.5,0,0,7.2500,0,0
416,3,0,27.0,0,0,8.0500,0,0


In [70]:
# with the given data frame test, I got predictions of y - survivalIds
predications = randomforest.predict(df_test)
# print(predications)
# df_test
# created a dataFrame with passengerIds and survivalIds
output = pd.DataFrame({'PassengerId': ids, 'Survived': predications})
output.to_csv('submissions.csv', index=False)

[0 0 0 1 1 0 0 0 1 0 0 0 1 0 1 1 0 1 0 0 0 1 1 0 1 0 1 1 1 0 0 0 1 0 0 1 1
 0 0 1 0 1 0 1 1 0 1 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 0 0 0
 1 0 0 1 0 1 1 0 0 0 0 0 1 0 1 1 0 1 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0
 1 1 0 1 0 0 1 0 1 1 0 1 0 0 1 0 1 1 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0
 1 0 1 0 0 1 0 0 1 1 1 1 1 1 1 1 0 1 0 0 1 0 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1
 0 1 0 0 0 0 0 1 0 1 0 1 0 0 0 1 1 1 1 0 0 0 0 1 0 0 0 0 1 1 0 1 0 1 0 1 0
 1 1 1 0 0 1 0 0 0 1 0 0 1 0 0 1 1 1 1 1 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 1 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 1 0 0
 1 0 0 0 0 0 0 0 1 1 1 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 1 1 0
 1 0 0 1 0 0 1 0 0 0 1 1 1 0 1 0 1 1 0 0 0 1 0 1 0 0 1 0 1 1 1 1 0 0 0 1 1
 0 1 0 1 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 0 1 1 1 1 0 0 1 0 0 1]


In [59]:
print(x_val)
print(y_val)

     Pclass  Sex   Age  SibSp  Parch     Fare  Embarked  Title
519       3    0  32.0      0      0   7.8958         0      0
11        1    1  58.0      0      0  26.5500         0      1
45        3    0  28.0      0      0   8.0500         0      0
151       1    1  22.0      1      0  66.6000         0      2
363       3    0  35.0      0      0   7.0500         0      0
..      ...  ...   ...    ...    ...      ...       ...    ...
585       1    1  18.0      0      2  79.6500         0      1
19        3    1  28.0      0      0   7.2250         1      2
606       3    0  30.0      0      0   7.8958         0      0
623       3    0  21.0      0      0   7.8542         0      0
783       3    0  28.0      1      2  23.4500         0      0

[90 rows x 8 columns]
519    0
11     1
45     0
151    1
363    0
      ..
585    1
19     1
606    0
623    0
783    0
Name: Survived, Length: 90, dtype: int64


In [30]:
from google.colab import files
files.download('submissions.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [64]:
# Correct order in the dataframe
def prediction_model(pclass, sex, age, sibsp, parch, fare, embarked, title):
  import pickle
  x = [[pclass, sex, age, sibsp, parch, fare, embarked, title]]
  print(x)
  randomforest = pickle.load(open('titanic_model.sav', 'rb'))
  predications = randomforest.predict(x)
  print(predications)

In [65]:
prediction_model(1, 1, 19, 2, 1, 2, 1, 2)

[[1, 1, 19, 2, 1, 2, 1, 2]]
[1]


