In [101]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import mode
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import GridSearchCV

In [2]:
data = pd.read_csv("train.csv")

In [3]:
data.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [4]:
# required columns
data = data[["Pclass", "Sex", "Age", "Fare", "Embarked", "Survived"]]

In [5]:
# check for NA in each of them

In [6]:
any(data["Pclass"].isna())

False

In [7]:
any(data["Sex"].isna())

False

In [8]:
any(data["Age"].isna())

True

In [9]:
any(data["Fare"].isna())

False

In [10]:
any(data["Embarked"].isna())

True

In [11]:
# remove the 2 rows

In [12]:
data = data.drop(list(data[data["Embarked"].isna()].index.values))

In [13]:
X = data[["Pclass", "Sex", "Age", "Fare", "Embarked"]]

In [14]:
X['Pclass'] = X['Pclass'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [15]:
y = data["Survived"]

In [16]:
X = pd.get_dummies(X)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=43)

In [18]:
fareMedian = X_train["Fare"].median()

In [20]:
ageMedian = X_train["Age"].median()

In [21]:
X_train["Age"].fillna(ageMedian, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [22]:
X_test["Age"].fillna(ageMedian, inplace=True)

### Scaling

In [23]:
scaler = StandardScaler().fit(X_train["Age"].values.reshape(-1,1))

In [24]:
X_train["Age"] = scaler.transform(X_train["Age"].values.reshape(-1,1))
X_test["Age"] = scaler.transform(X_test["Age"].values.reshape(-1,1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [26]:
scaler2 = StandardScaler().fit(X_train["Fare"].values.reshape(-1,1))

In [27]:
X_train["Fare"] = scaler2.transform(X_train["Fare"].values.reshape(-1,1))
X_test["Fare"] = scaler2.transform(X_test["Fare"].values.reshape(-1,1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


## Baseline

### Logistic Regression

In [28]:
lr = LogisticRegressionCV(cv=10).fit(X_train, y_train)

In [29]:
lr.score(X_train, y_train)

0.7862165963431786

In [30]:
lr.score(X_test, y_test)

0.8258426966292135

In [31]:
confusion_matrix(y_test, lr.predict(X_test))

array([[98, 12],
       [19, 49]], dtype=int64)

### Linear Discriminant Analysis

In [32]:
lda = LinearDiscriminantAnalysis().fit(X_train, y_train)



In [33]:
lda.score(X_train, y_train)

0.790436005625879

In [34]:
lda.score(X_test, y_test)

0.8146067415730337

In [35]:
confusion_matrix(y_test, lda.predict(X_test))

array([[95, 15],
       [18, 50]], dtype=int64)

### Naive Bayes

In [36]:
nb = GaussianNB().fit(X_train, y_train)

In [37]:
nb.score(X_train, y_train)

0.770745428973277

In [38]:
nb.score(X_test, y_test)

0.8033707865168539

In [39]:
confusion_matrix(y_test, lda.predict(X_test))

array([[95, 15],
       [18, 50]], dtype=int64)

### Random Forest

In [40]:
rf = RandomForestClassifier(n_estimators=10).fit(X_train, y_train)

In [41]:
rf.score(X_train, y_train)

0.9662447257383966

In [42]:
rf.score(X_test, y_test)

0.8146067415730337

In [43]:
confusion_matrix(y_test, lda.predict(X_test))

array([[95, 15],
       [18, 50]], dtype=int64)

### Ensemble Aggregating all 4 libraries

In [44]:
confusion_matrix(y_test, list(mode([lr.predict(X_test), lda.predict(X_test), nb.predict(X_test), rf.predict(X_test)], axis=0)[0][0]))

array([[98, 12],
       [18, 50]], dtype=int64)

In [46]:
148/178.0

0.8314606741573034

## Test data

In [70]:
test = pd.read_csv("test.csv")

In [71]:
test = test[["Pclass", "Sex", "Age", "Fare", "Embarked"]]

In [72]:
any(test["Pclass"].isna())

False

In [73]:
any(test["Sex"].isna())

False

In [74]:
any(test["Age"].isna())

True

In [75]:
any(test["Fare"].isna())

True

In [76]:
any(test["Embarked"].isna())

False

In [77]:
test["Age"].fillna(ageMedian, inplace=True)

In [78]:
test["Fare"].fillna(15.0, inplace=True)

In [79]:
test["Age"] = scaler.transform(test["Age"].values.reshape(-1,1))

In [80]:
test["Fare"] = scaler2.transform(test["Fare"].values.reshape(-1,1))

In [81]:
X_train.head()

Unnamed: 0,Age,Fare,Pclass_1,Pclass_2,Pclass_3,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
630,3.795202,-0.053546,1,0,0,0,1,0,0,1
839,-0.111278,-0.059339,1,0,0,0,1,1,0,0
613,-0.111278,-0.48325,0,0,1,0,1,0,1,0
189,0.489719,-0.480434,0,0,1,0,1,0,0,1
110,1.31609,0.371331,1,0,0,0,1,0,0,1


In [82]:
test['Pclass'] = test['Pclass'].astype(str)

In [96]:
op = pd.DataFrame(list(mode([lr.predict(pd.get_dummies(test)), lda.predict(pd.get_dummies(test)), nb.predict(pd.get_dummies(test)), rf.predict(pd.get_dummies(test))], axis=0)[0][0]))

In [97]:
op.to_csv("02-28.csv", index=False)

In [103]:
ExtC = ExtraTreesClassifier()


## Search grid for optimal parameters
ex_param_grid = {"max_depth": [None],
              "max_features": [1, 3, 10],
              "min_samples_split": [2, 3, 10],
              "min_samples_leaf": [1, 3, 10],
              "bootstrap": [False],
              "n_estimators" :[100,300],
              "criterion": ["gini"]}


gsExtC = GridSearchCV(ExtC,param_grid = ex_param_grid, cv=10, scoring="accuracy", n_jobs= 4, verbose = 1)

In [107]:
ecFit = gsExtC.fit(X_train, y_train)

Fitting 10 folds for each of 54 candidates, totalling 540 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    6.9s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   30.3s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  1.3min
[Parallel(n_jobs=4)]: Done 540 out of 540 | elapsed:  1.6min finished


In [108]:
ecFit.best_score_

0.8270042194092827

In [115]:
pd.DataFrame(list(ecFit.best_estimator_.predict(pd.get_dummies(test)))).to_csv("ex.csv")