In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import OneHotEncoder 

from statistics import mean
from hyperopt import Trials, hp, fmin, tpe, STATUS_OK, space_eval #for hyperparameter tuning and minimizing objective function 


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere beneath /kaggle/input.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Load the training CSV, keep the 'Survived' column as the target, and remove
# it together with the Name, Cabin and Ticket columns from the feature frame.
dfTrainX = pd.read_csv('Titanic.csv')
dfTrainY = dfTrainX['Survived']
dfTrainX = dfTrainX.drop(columns=['Survived', 'Name', 'Cabin', 'Ticket'])
dfTrainX.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,3,male,22.0,1,0,7.25,S
1,2,1,female,38.0,1,0,71.2833,C
2,3,3,female,26.0,0,0,7.925,S
3,4,1,female,35.0,1,0,53.1,S
4,5,3,male,35.0,0,0,8.05,S


In [3]:
# One-hot encode the two categorical columns: build both indicator frames
# first, then swap the raw columns for them in a single chained expression.
# Joining Sex before Embarked keeps the resulting column order unchanged.
sexEncoder = pd.get_dummies(dfTrainX['Sex'])
embarkEncoder = pd.get_dummies(dfTrainX['Embarked'])
dfTrainX = (
    dfTrainX
    .drop(columns=['Sex', 'Embarked'])
    .join(sexEncoder)
    .join(embarkEncoder)
)

In [4]:
# Fill every remaining NaN with its column mean ('median' and 'most_frequent'
# are the alternative strategies), then rebuild the frame with its column names.
imp = SimpleImputer(strategy='mean', missing_values=np.nan)
dfTrainX = pd.DataFrame(data=imp.fit_transform(dfTrainX), columns=dfTrainX.columns)
dfTrainX.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    float64
 1   Pclass       891 non-null    float64
 2   Age          891 non-null    float64
 3   SibSp        891 non-null    float64
 4   Parch        891 non-null    float64
 5   Fare         891 non-null    float64
 6   female       891 non-null    float64
 7   male         891 non-null    float64
 8   C            891 non-null    float64
 9   Q            891 non-null    float64
 10  S            891 non-null    float64
dtypes: float64(11)
memory usage: 76.7 KB


In [5]:
# Hyperparameter search space for the SVC: each entry is a categorical choice
# whose hyperopt label equals the SVC keyword it will be passed as.
space = {
    label: hp.choice(label, options)
    for label, options in (
        ('C', [0.1, 1, 10]),
        ('gamma', [0.1, 1, 10, 'scale', 'auto']),
        ('kernel', ['rbf', 'poly']),
    )
}
# Shuffled, seeded stratified 3-fold splitter shared by every trial.
kfold = StratifiedKFold(random_state=0, shuffle=True, n_splits=3)

def objective(params):
    """Score one hyperparameter combination for hyperopt.

    Builds an SVC from ``params`` and cross-validates it on the module-level
    ``dfTrainX`` / ``dfTrainY`` using the module-level ``kfold`` splitter,
    scoring by accuracy in a single process (``n_jobs=1``).

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded unchanged to ``SVC``.

    Returns
    -------
    dict
        ``'loss'`` is the negated mean fold accuracy (hyperopt minimizes, so
        higher accuracy means lower loss), ``'params'`` echoes the input and
        ``'status'`` is ``STATUS_OK``.
    """
    fold_accuracies = cross_val_score(
        SVC(**params),
        dfTrainX,
        np.ravel(dfTrainY, order='C'),
        cv=kfold,
        scoring='accuracy',
        n_jobs=1,
    )
    return {'loss': -mean(fold_accuracies), 'params': params, 'status': STATUS_OK}
    
# `num_trials` is the Trials store that records each evaluation; the search
# runs 30 TPE-suggested evaluations of `objective` over `space`.
num_trials = Trials()
best = fmin(
    objective,
    space,
    algo=tpe.suggest,
    max_evals=30,
    trials=num_trials,
)

  3%|▎         | 1/30 [00:00<00:04,  6.90trial/s, best loss: -0.6161616161616161]

In [None]:
# Fit the final classifier on the full training set with the best
# hyperparameters from the search. `best` holds hp.choice indices rather than
# values, so space_eval maps it back to the concrete C / gamma / kernel settings.
# The submission cell below calls `svc.predict`, so this cell must actually run.
svc = SVC(**space_eval(space, best))
svc.fit(dfTrainX, dfTrainY)

In [None]:
# Build the Kaggle submission: apply the same preprocessing as the training
# data to the test set, predict with the fitted `svc` (which must already exist
# in the kernel namespace), and write submission.csv.
titanic_in = pd.read_csv('/kaggle/input/titanic/test.csv')
titanic_Passengers = titanic_in['PassengerId']  # kept aside for the submission file
titanic_in = titanic_in.drop(['Name', 'Cabin', 'Ticket'], axis = 1)
sexEncoder = pd.get_dummies(titanic_in['Sex'])
titanic_in = titanic_in.drop('Sex', axis = 1)
titanic_in = titanic_in.join(sexEncoder)
embarkEncoder = pd.get_dummies(titanic_in['Embarked'])
titanic_in = titanic_in.drop('Embarked', axis = 1)
titanic_in = titanic_in.join(embarkEncoder)
# Force the exact training column set and order; a dummy column that is absent
# from the test data (category never observed) is created and filled with 0.
titanic_in = titanic_in.reindex(columns = dfTrainX.columns, fill_value = 0)
# Reuse the imputer fitted on the training data (transform, not fit_transform):
# re-fitting here would fill test NaNs with test-set means and overwrite the
# statistics learned from the training set.
titanic_in = pd.DataFrame(imp.transform(titanic_in), columns = titanic_in.columns)
titanic_out = svc.predict(titanic_in)
sub = pd.DataFrame({'PassengerId': titanic_Passengers, 'Survived': titanic_out})
sub.to_csv("submission.csv", index=False)
sub.head()
