In [1]:
import warnings
warnings.filterwarnings('ignore')

In [67]:
import pandas as pd
import matplotlib.pyplot as plt 
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold
# from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV, SGDClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import GridSearchCV, learning_curve, validation_curve
# from sklearn.tree import DecisionTreeClassifier 
from sklearn.preprocessing import PolynomialFeatures, OneHotEncoder, LabelEncoder
# from sklearn.neural_network import MLPClassifier
# from sklearn.svm import SVC
# from sklearn.ensemble import RandomForestClassifier
import re
# from scipy.sparse import csr_matrix, hstack, vstack
%matplotlib inline

In [3]:
RANDOM_STATE = 5
train = pd.read_csv('train.csv', index_col='PassengerId')
test = pd.read_csv('test.csv', index_col='PassengerId')

In [4]:
train.count()

Survived    891
Pclass      891
Name        891
Sex         891
Age         714
SibSp       891
Parch       891
Ticket      891
Fare        891
Cabin       204
Embarked    889
dtype: int64

In [5]:
train = pd.read_csv('train.csv', index_col='PassengerId')
test = pd.read_csv('test.csv', index_col='PassengerId')
y_train = train['Survived']
X_train = train.drop(['Survived'], axis=1)
X_test = test.copy()

X_full = [X_train, X_test]

def get_title(name):
    title_search = re.search('([A-Za-z]+)\.', name)
    if title_search:
        return title_search.group(1)
    else:
        return ""

for df in X_full:
    df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

    df['IsAlone'] = df['FamilySize'].apply(lambda x: 1 if x == 1 else 0)

    df['Embarked'] = df['Embarked'].fillna('S')
    df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

    df['Fare'] = df['Fare'].fillna(X_train['Fare'].median())

    age_mean = X_train['Age'].mean()
    age_std = X_train['Age'].std()
    age_null_count = df['Age'].isnull().sum()
    age_null_random_list = np.random.randint(age_mean - age_std, age_mean + age_std, size=age_null_count)
    df['Age'][np.isnan(df['Age'])] = age_null_random_list
    df['Age'] = df['Age'].astype(int)

    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1}).astype(int)
    
    df['Title'] = df['Name'].apply(get_title)

    df['Title'] = df['Title'].replace(['Lady', 'Countess','Capt', 'Col','Don', 
                                       'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], 'Rare')

    df['Title'] = df['Title'].replace('Mlle', 'Miss')
    df['Title'] = df['Title'].replace('Ms', 'Miss')
    df['Title'] = df['Title'].replace('Mme', 'Mrs')
    
    title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
    df['Title'] = df['Title'].map(title_mapping)
    df['Title'] = df['Title'].fillna(0)
    
    df.loc[ df['Fare'] <= 7.91, 'Fare'] = 0
    df.loc[(df['Fare'] > 7.91) & (df['Fare'] <= 14.454), 'Fare'] = 1
    df.loc[(df['Fare'] > 14.454) & (df['Fare'] <= 31), 'Fare']   = 2
    df.loc[ df['Fare'] > 31, 'Fare'] = 3
    df['Fare'] = df['Fare'].astype(int)
    
    # Mapping Age
    df.loc[ df['Age'] <= 16, 'Age'] = 0
    df.loc[(df['Age'] > 16) & (df['Age'] <= 32), 'Age'] = 1
    df.loc[(df['Age'] > 32) & (df['Age'] <= 48), 'Age'] = 2
    df.loc[(df['Age'] > 48) & (df['Age'] <= 64), 'Age'] = 3
    df.loc[ df['Age'] > 64, 'Age'] = 4
    
    df.drop(['Name', 'Ticket', 'Cabin', 'SibSp'], axis=1, inplace=True)

In [6]:
X, y = X_train, y_train
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.33, random_state=RANDOM_STATE)

In [7]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 596 entries, 241 to 868
Data columns (total 9 columns):
Pclass        596 non-null int64
Sex           596 non-null int64
Age           596 non-null int64
Parch         596 non-null int64
Fare          596 non-null int64
Embarked      596 non-null int64
FamilySize    596 non-null int64
IsAlone       596 non-null int64
Title         596 non-null int64
dtypes: int64(9)
memory usage: 46.6 KB


In [7]:
logit = LogisticRegression(random_state=RANDOM_STATE)
logit.fit(X_train, y_train)
y_preds = logit.predict(X_valid)
print('ROC AUC:', roc_auc_score(y_preds, y_valid))

ROC AUC: 0.8200000000000001


In [42]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

In [9]:
cross_val_score(logit, X, y, cv=skf).mean()

0.8080949376672777

In [19]:
logit = LogisticRegression(penalty='l2', random_state=RANDOM_STATE, n_jobs=-1)
gs = GridSearchCV(estimator=logit, param_grid={'tol': [1e-3, 1e-4, 1e-5], 
                                              'C': np.linspace(1e-3, 10, 10),
                                            'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
                                            'max_iter': [50, 100, 200, 300, 400, 500],
                                            'class_weight': [None, 'balanced']},
                  scoring='accuracy', cv=skf)
gs.fit(X_train, y_train);
gs.best_params_

{'C': 1.1119999999999999,
 'class_weight': None,
 'max_iter': 50,
 'solver': 'liblinear',
 'tol': 0.001}

In [37]:
params = {'C': 1.,
 'class_weight': None,
 'max_iter': 50,
 'penalty': 'l2',
 'solver': 'liblinear',
 'tol': 0.001
         }         
logit = LogisticRegression(**params, random_state=RANDOM_STATE)
logit.fit(X_train, y_train)
y_preds = logit.predict(X_valid)
print('ROC AUC:', roc_auc_score(y_preds, y_valid))

ROC AUC: 0.8200000000000001


In [16]:
params = {'C': 1.1119999999999999,
 'class_weight': None,
 'max_iter': 50,
 'penalty': 'l1',
 'solver': 'saga',
 'tol': 0.001
         }         
logit = LogisticRegression(**params, random_state=RANDOM_STATE, n_jobs=-1)
logit.fit(X_train, y_train)
y_preds = logit.predict(X_valid)
print('ROC AUC:', roc_auc_score(y_preds, y_valid))

ROC AUC: 0.81684661797396


In [40]:
logitCV = LogisticRegressionCV(solver='liblinear', max_iter=50, penalty='l2', tol=0.001,
                               random_state=RANDOM_STATE, n_jobs=-1)
logitCV.fit(X_train, y_train);
y_preds = logitCV.predict(X_valid)
print('ROC AUC:', roc_auc_score(y_preds, y_valid))

ROC AUC: 0.81684661797396


### Submission #001

In [None]:
logit = LogisticRegression(penalty='l2', random_state=RANDOM_STATE)
gs = GridSearchCV(estimator=logit, param_grid={'tol': [1e-2, 1e-3, 1e-4], 
                                              'C': [0.7, 0.8, 0.9, 1., 1.1, 1.2, 1.3],
                                            'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
                                            'max_iter': [200, 300, 400, 500],
                                            'class_weight': [None, 'balanced']},
                  scoring='accuracy', cv=skf)
gs.fit(X, y);
gs.best_params_

In [64]:
params = {'C': 0.7,
 'class_weight': None,
 'max_iter': 200,
 'solver': 'sag',
 'tol': 0.001
         }
logit = LogisticRegression(**params, random_state=RANDOM_STATE, n_jobs=-1)
logit.fit(X, y)
y_preds = logit.predict(X_test)

In [65]:
y_fin = pd.DataFrame(y_preds, columns=['Survived'], index=X_test.index)
y_fin.Survived.value_counts()

0    247
1    171
Name: Survived, dtype: int64

In [66]:
y_fin.to_csv('submissions/logit_001.csv', header=True, index_label='PassengerId') #0.79425

In [None]:
logit = LogisticRegression(penalty='l2', random_state=RANDOM_STATE)
gs = GridSearchCV(estimator=logit, param_grid={'tol': [1e-2, 1e-3, 1e-4], 
                                              'C': [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.],
                                            'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
                                            'max_iter': [200, 300, 400, 500],
                                            'class_weight': [None, 'balanced']},
                  scoring='accuracy', cv=skf)
gs.fit(X, y);
gs.best_params_

### Submission #002

In [78]:
params = {'C': 0.2,
 'class_weight': None,
 'max_iter': 200,
 'solver': 'liblinear',
 'tol': 0.01}
logit = LogisticRegression(**params, random_state=RANDOM_STATE)
logit.fit(X, y)
y_preds = logit.predict(X_test)

In [79]:
y_fin = pd.DataFrame(y_preds, columns=['Survived'], index=X_test.index)
y_fin.Survived.value_counts()

0    251
1    167
Name: Survived, dtype: int64

In [80]:
y_fin.to_csv('submissions/logit_002.csv', header=True, index_label='PassengerId') #0.78...

In [81]:
logit = LogisticRegression(penalty='l2', random_state=RANDOM_STATE)
gs = GridSearchCV(estimator=logit, param_grid={'tol': [1e-2, 1e-3, 1e-4], 
                                              'C': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.],
                                            'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
                                            'max_iter': [200, 300, 400, 500],
                                            'class_weight': [None, 'balanced']},
                  scoring='accuracy', cv=skf)
gs.fit(X, y);
gs.best_params_

{'C': 0.4,
 'class_weight': None,
 'max_iter': 200,
 'solver': 'saga',
 'tol': 0.01}

### Submission #003

In [82]:
params = {'C': 0.4,
 'class_weight': None,
 'max_iter': 200,
 'solver': 'saga',
 'tol': 0.01}
logit = LogisticRegression(**params, random_state=RANDOM_STATE)
logit.fit(X, y)
y_preds = logit.predict(X_test)

In [83]:
y_fin = pd.DataFrame(y_preds, columns=['Survived'], index=X_test.index)
y_fin.Survived.value_counts()

0    253
1    165
Name: Survived, dtype: int64

In [84]:
y_fin.to_csv('submissions/logit_003.csv', header=True, index_label='PassengerId') #0.77033

In [69]:
poly = PolynomialFeatures(2)
X_train_poly = poly.fit_transform(X_train)
X_valid_poly = poly.transform(X_valid)

In [120]:
X_train.shape, X_train_poly.shape

((596, 9), (596, 55))

In [109]:
logit = LogisticRegression(penalty='l2', random_state=RANDOM_STATE)
gs = GridSearchCV(estimator=logit, param_grid={'tol': [1e-3, 1e-4], 
                                              'C': [0.2, 0.3, 0.4, 0.5, 0.7, 0.9, 1.1, 1., 1.3],
                                            'solver': ['liblinear'],
                                            'max_iter': [50, 200, 700],
                                              },

                  scoring='accuracy', cv=skf)
gs.fit(X_train_poly, y_train);
gs.best_params_

{'C': 0.2, 'max_iter': 50, 'solver': 'liblinear', 'tol': 0.001}

In [114]:
# params = {'C': 1.,
#  'class_weight': None,
#  'max_iter': 50,
#  'penalty': 'l2',
#  'solver': 'liblinear',
#  'tol': 0.001
#          } 
params = {'C': 0.7,
 'class_weight': None,
 'max_iter': 1000,
 'solver': 'sag',
 'tol': 0.001
         }
logit = LogisticRegression(**params, random_state=RANDOM_STATE)
logit.fit(X_train_poly, y_train)
y_preds = logit.predict(X_valid_poly)
print('ROC AUC:', roc_auc_score(y_preds, y_valid))

ROC AUC: 0.8406865472348949


### Submission #004

In [115]:
poly = PolynomialFeatures(2)
X_poly = poly.fit_transform(X)
X_test_poly = poly.transform(X_test)

In [117]:
params = {'C': 0.7,
 'class_weight': None,
 'max_iter': 1000,
 'solver': 'sag',
 'tol': 0.001
         }
logit = LogisticRegression(**params, random_state=RANDOM_STATE)
logit.fit(X_poly, y)
y_preds = logit.predict(X_test_poly)


In [118]:
y_fin = pd.DataFrame(y_preds, columns=['Survived'], index=X_test.index)
y_fin.Survived.value_counts()

0    263
1    155
Name: Survived, dtype: int64

In [119]:
y_fin.to_csv('submissions/logit_poly_001.csv', header=True, index_label='PassengerId') #

In [121]:
poly = PolynomialFeatures(2)
X_train_poly = poly.fit_transform(X_train)
X_valid_poly = poly.fit_transform(X_valid)

In [109]:
logit = LogisticRegression(penalty='l2', random_state=RANDOM_STATE)
gs = GridSearchCV(estimator=logit, param_grid={'tol': [1e-3, 1e-4], 
                                              'C': [0.2, 0.3, 0.4, 0.5, 0.7, 0.9, 1.1, 1., 1.3],
                                            'solver': ['liblinear'],
                                            'max_iter': [50, 200, 700],
                                              },

                  scoring='accuracy', cv=skf)
gs.fit(X_train_poly, y_train);
gs.best_params_

{'C': 0.2, 'max_iter': 50, 'solver': 'liblinear', 'tol': 0.001}

In [124]:
params = {'C': 0.2,
 'class_weight': None,
 'max_iter': 50,
 'penalty': 'l2',
 'solver': 'liblinear',
 'tol': 0.001
         } 
logit = LogisticRegression(**params, random_state=RANDOM_STATE)
logit.fit(X_train_poly, y_train)
y_preds = logit.predict(X_valid_poly)
print('ROC AUC:', roc_auc_score(y_preds, y_valid))

ROC AUC: 0.8406865472348949


### Submission #005

In [125]:
poly = PolynomialFeatures(2)
X_poly = poly.fit_transform(X)
X_test_poly = poly.fit_transform(X_test)

In [126]:
logit = LogisticRegression(penalty='l2', random_state=RANDOM_STATE)
gs = GridSearchCV(estimator=logit, param_grid={'tol': [1e-3, 1e-4], 
                                              'C': [0.2, 0.3, 0.4, 0.5, 0.7, 0.9, 1.1, 1., 1.3],
                                            'solver': ['liblinear'],
                                            'max_iter': [50, 200, 200, 700],
                                              },

                  scoring='accuracy', cv=skf)
gs.fit(X_poly, y);
gs.best_params_

{'C': 0.3, 'max_iter': 50, 'solver': 'liblinear', 'tol': 0.001}

In [127]:
params = {'C': 0.3,
 'class_weight': None,
 'max_iter': 50,
 'solver': 'liblinear',
 'tol': 0.001
         }
logit = LogisticRegression(**params, random_state=RANDOM_STATE)
logit.fit(X_poly, y)
y_preds = logit.predict(X_test_poly)


In [128]:
y_fin = pd.DataFrame(y_preds, columns=['Survived'], index=X_test.index)
y_fin.Survived.value_counts()

0    269
1    149
Name: Survived, dtype: int64

In [129]:
y_fin.to_csv('submissions/logit_poly_002.csv', header=True, index_label='PassengerId') #0.78468

### Submission #006

In [141]:
logit = LogisticRegression(penalty='l2', random_state=RANDOM_STATE)
gs = GridSearchCV(estimator=logit, param_grid={'tol': [1e-3], 
                                              'C': [0.07, 0.08, 0.09, 0.1, 0.2, 0.4],
                                            'solver': ['sag'],
                                            'max_iter': [700],
                                              },

                  scoring='accuracy', cv=skf)
gs.fit(X_poly, y);
gs.best_params_

{'C': 0.09, 'max_iter': 700, 'solver': 'sag', 'tol': 0.001}

In [142]:
params = {'C': 0.09, 'max_iter': 700, 'solver': 'sag', 'tol': 0.001}
logit = LogisticRegression(**params, random_state=RANDOM_STATE)
logit.fit(X_poly, y)
y_preds = logit.predict(X_test_poly)


In [143]:
y_fin = pd.DataFrame(y_preds, columns=['Survived'], index=X_test.index)
y_fin.Survived.value_counts()

0    263
1    155
Name: Survived, dtype: int64

In [144]:
y_fin.to_csv('submissions/logit_poly_003.csv', header=True, index_label='PassengerId') #0.78947

### Submission #007

In [146]:
logit = LogisticRegression(penalty='l2', random_state=RANDOM_STATE)
gs = GridSearchCV(estimator=logit, param_grid={'tol': [1e-3], 
                                              'C': [0.07, 0.08, 0.09, 0.1, 0.2, 0.4],
                                            'solver': ['lbfgs'],
                                            'max_iter': [700],
                                              },

                  scoring='accuracy', cv=skf)
gs.fit(X_poly, y);
gs.best_params_

{'C': 0.2, 'max_iter': 700, 'solver': 'lbfgs', 'tol': 0.001}

In [147]:
params = {'C': 0.2, 'max_iter': 700, 'solver': 'lbfgs', 'tol': 0.001}
logit = LogisticRegression(**params, random_state=RANDOM_STATE)
logit.fit(X_poly, y)
y_preds = logit.predict(X_test_poly)


In [148]:
y_fin = pd.DataFrame(y_preds, columns=['Survived'], index=X_test.index)
y_fin.Survived.value_counts()

0    267
1    151
Name: Survived, dtype: int64

In [149]:
y_fin.to_csv('submissions/logit_poly_004.csv', header=True, index_label='PassengerId') #.7799