In [None]:
import pandas as pd

In [None]:
# Load the training and test sets from CSV files in the working directory.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv")
)

In [None]:
# Peek at the first few rows to see which columns are available.
train_data.head()

In [None]:
# Column dtypes and non-null counts, to spot columns with missing values.
train_data.info()

In [None]:
# Summary statistics (count, mean, quartiles, ...) for the numeric columns.
train_data.describe()

In [None]:
# Row count per 'Survived' value; this column is used as the label later on.
train_data['Survived'].value_counts()

In [None]:
# Row count per 'Pclass' value.
train_data['Pclass'].value_counts()

In [None]:
# Row count per 'Embarked' value (value_counts leaves out NaN by default).
train_data['Embarked'].value_counts()

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Transformer that picks a fixed set of columns out of its input.

    Parameters
    ----------
    attribute_names
        Column label(s) used to index ``X`` in :meth:`transform`.
    """

    def __init__(self, attribute_names):
        self.attribute_names = attribute_names

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the configured attribute names."""
        selected = X[self.attribute_names]
        return selected

In [None]:
from sklearn.pipeline import Pipeline    #for numerical attributes
from sklearn.impute import SimpleImputer

# Numeric branch: keep the numeric columns, then replace missing values with
# each column's median.
num_pipeline = Pipeline(
    steps=[
        ('select_numeric', DataFrameSelector(['Age', 'SibSp', 'Parch', 'Fare'])),
        ('imputer', SimpleImputer(strategy='median')),
    ]
)

In [None]:
# Sanity check: fit the numeric pipeline on the training set and display the result.
num_pipeline.fit_transform(train_data)

In [None]:
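# We also need an imputer for the categorical columns: the median strategy used in the numeric pipeline cannot be applied to string data, so we fill missing values with each column's most frequent value instead.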

In [None]:
class MostFrequentImputer(BaseEstimator, TransformerMixin):
    """Fill missing values in each column with that column's most frequent value."""

    def fit(self, X, y=None):
        """Record, per column of ``X``, the first entry of ``value_counts()``.

        The result is stored in ``most_frequent_`` as a Series indexed by the
        column labels of ``X``.
        """
        top_values = []
        for column in X:
            counts = X[column].value_counts()
            # value_counts() sorts by descending count, so position 0 is the mode.
            top_values.append(counts.index[0])
        self.most_frequent_ = pd.Series(top_values, index=X.columns)
        return self

    def transform(self, X, y=None):
        """Return ``X`` with missing entries filled from ``most_frequent_``."""
        filled = X.fillna(self.most_frequent_)
        return filled

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
# pipeline for categorical attributes

In [None]:
# Categorical branch: select the categorical columns, fill missing values with
# each column's most frequent value, then one-hot encode into a dense array.
# NOTE: OneHotEncoder's `sparse` keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and the old name was removed in 1.4, so this cell requires
# scikit-learn >= 1.2.
cat_pipeline = Pipeline([
    ('select_cat', DataFrameSelector(['Pclass', 'Sex', 'Embarked'])),
    ('imputer', MostFrequentImputer()),
    ('cat_encoder', OneHotEncoder(sparse_output=False)),
])

In [None]:
# Sanity check: fit the categorical pipeline on the training set and display the result.
cat_pipeline.fit_transform(train_data)

In [None]:
#joining the numerical and categorical pipelines

In [None]:
from sklearn.pipeline import FeatureUnion
# Run both branches on the same input and concatenate their outputs
# column-wise (numeric columns first, then the categorical ones).
preprocess_pipeline = FeatureUnion(
    [
        ("num_pipeline", num_pipeline),
        ("cat_pipeline", cat_pipeline),
    ]
)

In [None]:
# Fit the full preprocessing on the training set and keep the transformed features.
X_train = preprocess_pipeline.fit_transform(train_data)
X_train

In [None]:
# Labels: the 'Survived' column of the training set.
y_train = train_data['Survived']

In [None]:
from sklearn.svm import SVC

# Train a support vector classifier on the preprocessed training data.
svm_clf = SVC(gamma='auto')
svm_clf.fit(X_train, y_train)

In [None]:
# Predict on the test set: reuse the preprocessing fitted on the training data
# (transform only, no refit), then apply the trained SVM.
X_test = preprocess_pipeline.transform(test_data)
y_pred = svm_clf.predict(X_test)

In [None]:
from sklearn.model_selection import cross_val_score

# Estimate the SVM's performance with 10-fold cross-validation on the
# training set and display the mean score across folds.
svm_scores = cross_val_score(svm_clf, X_train, y_train, cv=10)
svm_scores.mean()

### Trying a RandomForestClassifier for better accuracy

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Same 10-fold cross-validation for a 100-tree random forest; the fixed
# random_state keeps the forest's randomness reproducible between runs.
forest_clf = RandomForestClassifier(n_estimators=100, random_state=42)
forest_scores = cross_val_score(forest_clf, X_train, y_train, cv=10)
forest_scores.mean()