In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib notebook

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

import warnings
warnings.filterwarnings('ignore')

In [3]:
train = pd.read_csv("C:/Users/Teja/Desktop/train.csv")
print("Training dataset shape:",train.shape)
train.head()

Training dataset shape: (891, 12)


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
test = pd.read_csv("C:/Users/Teja/Desktop/test.csv")
print("Testing Dataset shape:",test.shape)
test.head()

Testing Dataset shape: (418, 11)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [5]:
train["Survived"].value_counts()

0    549
1    342
Name: Survived, dtype: int64

In [6]:
sns.countplot(x="Survived",data = train)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x258d16f8fc8>

In [7]:
sns.countplot(x = "Survived",hue = 'Sex',data = train)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x258d6e51d48>

In [8]:
sns.countplot(x = "Survived",hue = "Pclass",data = train)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x258d71329c8>

In [9]:
sns.boxplot(x="Pclass",y ="Age",data= train)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x258d74303c8>

In [10]:
print("Null in Training Set")
print(train.isnull().sum())

Null in Training Set
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [11]:
print("Null in testing set")
print(test.isnull().sum())

Null in testing set
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [12]:
def add_age(cols):
    Age = cols[0]
    Pclass = cols[1]
    if pd.isnull(Age):
        return int(train[train["Pclass"]== Pclass]["Age"].mean())
    else:
        return Age

In [13]:
train["Age"] = train[["Age","Pclass"]].apply(add_age,axis=1)
train['Age'] = test[["Age","Pclass"]].apply(add_age,axis = 1)

In [14]:
train.drop("Cabin",inplace = True,axis=1)
test.drop("Cabin",inplace = True, axis =1)

In [15]:
train["Embarked"].fillna(train["Embarked"].mode()[0],inplace=True)
test["Embarked"].fillna(test["Embarked"].mode()[0],inplace = True)

In [16]:
test["Fare"].fillna(test["Fare"].mean(),inplace = True)

In [17]:
def combine(df, col1, col2):
    df["Family"] = df[col1] + df[col2]
    df.drop([col1,col2],inplace = True, axis = 1)
    return df

train = combine(train,"SibSp","Parch")
test = combine(test,"SibSp","Parch")

In [18]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,Ticket,Fare,Embarked,Family
0,1,0,3,"Braund, Mr. Owen Harris",male,34.5,A/5 21171,7.25,S,1
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,47.0,PC 17599,71.2833,C,1
2,3,1,3,"Heikkinen, Miss. Laina",female,62.0,STON/O2. 3101282,7.925,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,27.0,113803,53.1,S,1
4,5,0,3,"Allen, Mr. William Henry",male,22.0,373450,8.05,S,0


In [19]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,Ticket,Fare,Embarked,Family
0,892,3,"Kelly, Mr. James",male,34.5,330911,7.8292,Q,0
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,363272,7.0,S,1
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,240276,9.6875,Q,0
3,895,3,"Wirz, Mr. Albert",male,27.0,315154,8.6625,S,0
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,3101298,12.2875,S,2


In [20]:
def process_age(df,cut_points,label_names):
    df["Age"] = df["Age"].fillna(-0.5)
    df["Age_categories"] = pd.cut(df["Age"],cut_points,labels = label_names)
    return df

cut_points = [-1,0,5,12,18,35,60,100]
label_names = ["Missing","Infant","Child","Teenager","Young Adult","Adult","Senior"]
train = process_age(train,cut_points,label_names)
test = process_age(test,cut_points,label_names)

pivot = train.pivot_table(index = "Age_categories",values = "Survived")
pivot.plot.bar()

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x258d7806f08>

In [21]:
def create_dummies(df, column_name):
    dummies = pd.get_dummies(df[column_name],prefix = column_name)
    df = pd.concat([df,dummies],axis = 1)
    return df

for column in ["Pclass","Sex","Age_categories","Embarked"]:
    train = create_dummies(train,column)
    test = create_dummies(test,column)

In [22]:
train.drop(["Name","Sex","Ticket","Pclass","Age_categories","Embarked"],inplace = True, axis = 1)
test.drop(["Name","Sex","Ticket","Pclass","Age_categories","Embarked"],inplace = True, axis = 1)

In [24]:
lr = LogisticRegression()
columns = ["PassengerId","Age","Fare","Family","Pclass_1","Pclass_2","Pclass_3","Sex_female","Sex_male","Age_categories_Missing","Age_categories_Infant","Age_categories_Child","Age_categories_Teenager","Age_categories_Young Adult","Age_categories_Adult","Age_categories_Senior"]
lr.fit(train[columns],train["Survived"])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [25]:
X = train[columns]
y = train["Survived"]

train_X, val_X, train_y,val_y = train_test_split(X,y, test_size = 0.2,random_state = 0)

In [26]:
lr = LogisticRegression()
lr.fit(train_X,train_y)
predictions = lr.predict(val_X)
accuracy = accuracy_score(val_y,predictions)
print(accuracy)
print(classification_report(val_y,predictions))

0.8100558659217877
              precision    recall  f1-score   support

           0       0.83      0.87      0.85       110
           1       0.78      0.71      0.74        69

    accuracy                           0.81       179
   macro avg       0.80      0.79      0.80       179
weighted avg       0.81      0.81      0.81       179



In [27]:
scores = cross_val_score(lr,X,y,cv = 10)
scores.sort()
accuracy = scores.mean()

print(scores)
print(accuracy)

[0.76404494 0.7752809  0.7752809  0.78651685 0.78888889 0.79775281
 0.79775281 0.79775281 0.82022472 0.84269663]
0.7946192259675404


In [28]:
lr.fit(X,y)
predict_test = lr.predict(test[columns])

In [30]:
submission = pd.read_csv("C:/Users/Teja/Desktop/gender_submission.csv")
submission_df = pd.DataFrame({"PassengerId": test["PassengerId"],"Survived":predict_test})
submission_df.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1


In [31]:
submission_df.to_csv("C:/Users/Teja/Desktop/gender_submission.csv")