In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold

In [2]:
train = pd.read_csv('train.csv')

In [3]:
test = pd.read_csv('test.csv')

In [4]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
test.shape

(418, 11)

In [6]:
train1 = train.drop(['Name','Ticket','Cabin'],axis = 1)

In [7]:
train1.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


In [8]:
test1 = test.drop(['Name','Ticket','Cabin'],axis = 1)

In [9]:
test1.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,male,34.5,0,0,7.8292,Q
1,893,3,female,47.0,1,0,7.0,S
2,894,2,male,62.0,0,0,9.6875,Q
3,895,3,male,27.0,0,0,8.6625,S
4,896,3,female,22.0,1,1,12.2875,S


In [10]:
def meta_data(data):
    total = data.isnull().sum()
    percent = (total/len(data))*100
    datatype = data.dtypes
    unique = data.nunique()
    
    return pd.concat([total,percent,datatype,unique],axis = 1, keys = ['total_missing_values','percent','datatype','unique'])

In [11]:
meta_data(train1)

Unnamed: 0,total_missing_values,percent,datatype,unique
PassengerId,0,0.0,int64,891
Survived,0,0.0,int64,2
Pclass,0,0.0,int64,3
Sex,0,0.0,object,2
Age,177,19.86532,float64,88
SibSp,0,0.0,int64,7
Parch,0,0.0,int64,7
Fare,0,0.0,float64,248
Embarked,2,0.224467,object,3


In [12]:
train2 = pd.get_dummies(train1)

In [13]:
train2.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,22.0,1,0,7.25,0,1,0,0,1
1,2,1,1,38.0,1,0,71.2833,1,0,1,0,0
2,3,1,3,26.0,0,0,7.925,1,0,0,0,1
3,4,1,1,35.0,1,0,53.1,1,0,0,0,1
4,5,0,3,35.0,0,0,8.05,0,1,0,0,1


In [14]:
test2 = pd.get_dummies(test1)

In [15]:
test2.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,892,3,34.5,0,0,7.8292,0,1,0,1,0
1,893,3,47.0,1,0,7.0,1,0,0,0,1
2,894,2,62.0,0,0,9.6875,0,1,0,1,0
3,895,3,27.0,0,0,8.6625,0,1,0,0,1
4,896,3,22.0,1,1,12.2875,1,0,0,0,1


In [16]:
cols = list(train2.columns)
cols1 = []
cols1.append(cols[0])
cols1.extend(cols[2:])
cols1.append(cols[1])

In [17]:
cols1

['PassengerId',
 'Pclass',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Sex_female',
 'Sex_male',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S',
 'Survived']

In [18]:
train2 = train2[cols1]

In [19]:
train2.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Survived
0,1,3,22.0,1,0,7.25,0,1,0,0,1,0
1,2,1,38.0,1,0,71.2833,1,0,1,0,0,1
2,3,3,26.0,0,0,7.925,1,0,0,0,1,1
3,4,1,35.0,1,0,53.1,1,0,0,0,1,1
4,5,3,35.0,0,0,8.05,0,1,0,0,1,0


In [20]:
#train2 = train2.dropna(how = 'any')
#train2.reset_index(drop = True)
train2['Age'] = train2['Age'].fillna(train2['Age'].mean())
train2['Fare'] = train2['Fare'].fillna(train2['Fare'].mean())

In [21]:
#test2 = test2.dropna(how = 'any')
#test2.reset_index(drop = True)
test2['Age'] = test2['Age'].fillna(test2['Age'].mean())
test2['Fare'] = test2['Fare'].fillna(test2['Fare'].mean())

In [22]:
scalar = MinMaxScaler()

In [23]:
cols = ['Age','Fare']

In [24]:
train2[cols] = scalar.fit_transform(train2[cols])

In [25]:
train2.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Survived
0,1,3,0.271174,1,0,0.014151,0,1,0,0,1,0
1,2,1,0.472229,1,0,0.139136,1,0,1,0,0,1
2,3,3,0.321438,0,0,0.015469,1,0,0,0,1,1
3,4,1,0.434531,1,0,0.103644,1,0,0,0,1,1
4,5,3,0.434531,0,0,0.015713,0,1,0,0,1,0


In [26]:
test2[cols] = scalar.fit_transform(test2[cols])

In [27]:
test2.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,892,3,0.452723,0,0,0.015282,0,1,0,1,0
1,893,3,0.617566,1,0,0.013663,1,0,0,0,1
2,894,2,0.815377,0,0,0.018909,0,1,0,1,0
3,895,3,0.353818,0,0,0.016908,0,1,0,0,1
4,896,3,0.287881,1,1,0.023984,1,0,0,0,1


In [28]:
x_train = train2.drop(['Survived'],axis = 1)

In [29]:
y_train = train2.Survived

In [30]:
x_test = test2

In [31]:
lr = LogisticRegression(solver='lbfgs', max_iter=1000)

In [32]:
lr.fit(x_train,y_train)

LogisticRegression(max_iter=1000)

In [33]:
y_train_predict = lr.predict(x_train)

In [34]:
accuracy_score(y_train_predict,y_train)

0.8013468013468014

In [35]:
def get_feature_importance(model,features):
    feature_importance =( pd.DataFrame(
        {'variable': features,
         'coefficient': model.coef_[0]
        }).round(3).sort_values('coefficient',ascending = False).style.bar(color = ['red','green'],align = 'zero'))
    return feature_importance

In [36]:
get_feature_importance(lr,x_train.columns)

Unnamed: 0,variable,coefficient
6,Sex_female,1.373
5,Fare,0.475
8,Embarked_C,0.065
0,PassengerId,0.0
9,Embarked_Q,-0.009
4,Parch,-0.069
3,SibSp,-0.282
10,Embarked_S,-0.377
1,Pclass,-1.038
7,Sex_male,-1.274


In [37]:
x_test.to_csv('test1.csv')

In [38]:
y_test_predict = lr.predict(x_test)

In [39]:
len(y_test_predict)

418

In [40]:
y_test_predict

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [41]:
results = pd.DataFrame(pd.np.column_stack([x_test['PassengerId'],y_test_predict]))

  results = pd.DataFrame(pd.np.column_stack([x_test['PassengerId'],y_test_predict]))


In [42]:
results.columns = ['PassengerId','Survived']

In [43]:
results

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [44]:
results.to_csv("predictions1.csv",index = False)

In [45]:
from sklearn.ensemble import RandomForestClassifier

In [46]:
clf = RandomForestClassifier(max_depth=2, random_state=0)

In [47]:
clf.fit(x_train,y_train)

RandomForestClassifier(max_depth=2, random_state=0)

In [48]:
y_train_predict = clf.predict(x_train)

In [49]:
accuracy_score(y_train_predict,y_train)

0.7867564534231201

In [50]:
y_test_predict = clf.predict(x_test)

In [51]:
results = pd.DataFrame(pd.np.column_stack([x_test['PassengerId'],y_test_predict]))

  results = pd.DataFrame(pd.np.column_stack([x_test['PassengerId'],y_test_predict]))


In [52]:
results.columns = ['PassengerId','Survived']

In [53]:
results.to_csv("predictions2.csv",index = False)