In [None]:
import pandas as pd

# Let pandas display up to 50 columns so wide frames are not elided in cell output.
pd.set_option('display.max_columns', 50)

In [None]:
# Load the labelled training set and preview its first three rows.
train = pd.read_csv('./data/train.csv')
train.head(3)

In [None]:
# Load the test set (the rows to predict) and preview its first three rows.
test = pd.read_csv('./data/test.csv')
test.head(3)

In [None]:
# Load the sample submission file to see the expected output layout.
submission = pd.read_csv('./data/gender_submission.csv')
submission.head()

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Pin the seed so the fitted tree (and therefore the submission file) is the same
# on every Restart & Run All; without it, equally good splits are picked at random.
model = DecisionTreeClassifier(random_state=42)

# Target: the label column of the training frame.
y = train['Survived']
# First, naive feature matrix: every listed column exactly as it comes from the CSV.
X = train[['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']]

In [None]:
# First fit attempt on the untouched feature matrix.
# NOTE(review): X still includes columns such as 'Name', 'Sex' and 'Ticket', which are
# presumably text; if so this call is expected to raise — looks like a deliberate demo.
model.fit(X, y)

In [None]:
# Narrow the features to this subset; `columns` is reused below to select the same
# fields from the test set.
columns = ['PassengerId', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
X = train[columns]

In [None]:
# Second fit attempt, now on the reduced column subset.
# NOTE(review): 'Age' is only imputed a few cells further down, so it presumably still
# has missing values here; depending on the scikit-learn version this fit may raise.
model.fit(X, y)

In [None]:
# Print per-column dtypes and non-null counts to spot columns with missing values.
X.info()

In [None]:
# Fill missing ages with the column mean. The result is assigned back to a new frame
# rather than calling fillna(inplace=True) on X['Age']: X is a selection from `train`,
# so the in-place call on the column triggers pandas' chained-assignment warnings and
# leaves X unchanged when Copy-on-Write is enabled. This form is also safe to re-run.
X = X.assign(Age=X['Age'].fillna(X['Age'].mean()))
X.info()

In [None]:
# Fit again now that the missing 'Age' values in X have been filled.
model.fit(X, y)

In [None]:
# Try predicting on the same feature columns of the test set.
# NOTE(review): test['Age'] is only imputed in the next cell, so this call is presumably
# expected to fail on missing values — looks like a deliberate demo.
model.predict(test[columns])

In [None]:
# Fill missing test ages with the test-set mean. Assign the result back instead of
# calling fillna(inplace=True) on the selected column: the chained in-place form is
# deprecated in pandas and does not modify `test` when Copy-on-Write is enabled.
test['Age'] = test['Age'].fillna(test['Age'].mean())

In [None]:
# Retry the prediction after imputing 'Age' in the test set.
# NOTE(review): test['Fare'] is only imputed two cells below, so this call may still
# fail on a missing value — presumably intentional.
model.predict(test[columns])

In [None]:
# Inspect non-null counts of the selected test columns to find remaining gaps.
test[columns].info()

In [None]:
# Fill missing test fares with the test-set mean. Assign the result back instead of
# calling fillna(inplace=True) on the selected column: the chained in-place form is
# deprecated in pandas and does not modify `test` when Copy-on-Write is enabled.
test['Fare'] = test['Fare'].fillna(test['Fare'].mean())

In [None]:
# Predict once more now that both 'Age' and 'Fare' have been imputed in the test set.
model.predict(test[columns])

In [None]:
# Predict on the selected test columns, then attach each prediction to its passenger id.
y_predicted = model.predict(test[columns])

# Start from the id column of `test` and add the predictions as 'Survived'.
output = test[['PassengerId']].assign(Survived=y_predicted)
output

In [None]:
# Write the baseline submission without the index column.
# NOTE(review): assumes the ./submissions directory already exists — confirm.
output.to_csv('./submissions/0_naive.csv', index=False)

# Enter Automated Machine Learning

In [None]:
import azureml.core

# Record which Azure ML SDK version this notebook is running against.
print("Azure ML SDK Version: ", azureml.core.VERSION)

In [None]:
# Fresh, unmodified copy of the training data, used as input for the AutoML runs below.
df = pd.read_csv('./data/train.csv')

In [None]:
# Show dtypes and non-null counts of the raw training data.
df.info()

In [None]:
# Authenticate if we haven't already.
# The resulting credentials object is passed to the workspace lookup below.

from azureml.core.authentication import InteractiveLoginAuthentication
credentials = InteractiveLoginAuthentication()

In [None]:
# Load the workspace info from the local Azure ML configuration, authenticating with
# the credentials obtained above.

from azureml.core import Workspace
ws = Workspace.from_config(auth=credentials)

In [None]:
# Create a new experiment if we haven't already.
# Every AutoML run in this notebook is submitted under this experiment name.

from azureml.core import Experiment
exp = Experiment(workspace=ws, name='Clujaggle')

In [None]:
from azureml.train.automl import AutoMLConfig

In [None]:
# Speaker note kept from the original: launch the experiment right after defining this.
# First AutoML configuration: a deliberately short classification run on the raw frame.
config = AutoMLConfig(
    # What to learn, and from which data
    task='classification',
    training_data=df,
    label_column_name='Survived',
    # How candidate models are scored
    primary_metric='AUC_weighted',
    n_cross_validations=5,
    # Search budget and optional extras
    iterations=3,
    preprocess=True,
    model_explainability=False,
)

In [None]:
# Submit the configuration to the experiment and stream progress into the cell output.
local_run = exp.submit(config, show_output=True)

In [None]:
# Retrieve the best child run and its fitted model from the AutoML run, then print both.
best_run, fitted_model = local_run.get_output()
print('Best run:', best_run)
print('-----------')
print('Best model:', fitted_model)

In [None]:
def generate_submission_file(model, file_name, data_path='./data/test.csv'):
    """Predict on the test data with `model` and write a submission CSV.

    The test data is read fresh from `data_path` on every call, and the passenger ids
    are taken from that same frame, so the function does not depend on any notebook
    global (the previous version read ids from a global `test` variable).

    Args:
        model: Any object exposing `predict(dataframe)` that returns one prediction
            per row of the frame it is given.
        file_name: Path of the CSV file to write.
        data_path: Path of the CSV holding the rows to predict; it must contain a
            'PassengerId' column. Defaults to the notebook's test set.
    """
    data = pd.read_csv(data_path)
    output = pd.DataFrame({
        # Ids come from the frame that is actually being predicted on.
        'PassengerId': data['PassengerId'],
        'Survived': model.predict(data)
    })
    output.to_csv(file_name, index=False)

In [None]:
# Write a submission using the best model from the first AutoML run.
generate_submission_file(fitted_model, './submissions/1_auto.csv')

## Analyze the trained model

In [None]:
# Take the estimator object (second item of each step entry) from the first three
# pipeline steps in one unpacking assignment.
# NOTE(review): the names assume the order featurizer -> scaler -> classifier; confirm
# against the fitted pipeline.
transformer, scaler, classifier = (fitted_model.steps[position][1] for position in range(3))

In [None]:
# Show the concrete class of the first pipeline step.
type(transformer)

In [None]:
# Show the object held in the second step's `model` attribute.
scaler.model

In [None]:
# Show the third pipeline step (the estimator that makes the predictions below).
classifier

In [None]:
# List the names of the features produced by the first pipeline step.
transformer.get_engineered_feature_names()

In [None]:
# Count how many engineered features the first pipeline step produces.
len(transformer.get_engineered_feature_names())

In [None]:
# Show the featurization summary reported by the first pipeline step.
transformer.get_featurization_summary()

In [None]:
# Reload the raw test set, replacing the `test` frame that was mean-imputed earlier,
# so the pipeline steps below are applied to untouched data.
test = pd.read_csv('./data/test.csv')
test.info()

In [None]:
# Run only the first pipeline step on the raw test data and show the result.
test_preprocessed = transformer.transform(test)
test_preprocessed

In [None]:
# View the transformed data as a labelled frame; the .toarray() call suggests the
# transformer output is a sparse matrix that has to be densified first.
pd.DataFrame(test_preprocessed.toarray(), columns=transformer.get_engineered_feature_names())

In [None]:
# Show the last five raw test rows for comparison with the engineered features.
test.tail(5)

In [None]:
# Apply the second pipeline step to the output of the first one and show the result.
test_scaled = scaler.transform(test_preprocessed)
test_scaled

In [None]:
# View the scaled data as a labelled frame, reusing the engineered feature names.
pd.DataFrame(test_scaled.toarray(), columns=transformer.get_engineered_feature_names())

In [None]:
# Run the final pipeline step by hand on the transformed and scaled test data.
results = classifier.predict(test_scaled)
results

## One more try, with more iterations

In [None]:
# Second AutoML configuration: same setup as before, with the budget raised to 15 iterations.
config = AutoMLConfig(
    # What to learn, and from which data
    task='classification',
    training_data=df,
    label_column_name='Survived',
    # How candidate models are scored
    primary_metric='AUC_weighted',
    n_cross_validations=5,
    # Search budget and optional extras
    iterations=15,
    preprocess=True,
    model_explainability=False,
)

# Submit right away and stream progress into the cell output.
local_run = exp.submit(config, show_output=True)

In [None]:
# Retrieve the best child run and fitted model from the latest AutoML run and show the model.
best_run, fitted_model = local_run.get_output()
fitted_model

In [None]:
# Look up the pipeline step registered under 'prefittedsoftvotingclassifier'.
# NOTE(review): this raises KeyError if the best model is not such an ensemble — the
# step name depends on what the AutoML run selected.
classifier = fitted_model.named_steps['prefittedsoftvotingclassifier']
classifier

In [None]:
# Show the estimators held by the voting classifier.
classifier.estimators

In [None]:
# Write a submission using the best model from the 15-iteration run.
generate_submission_file(fitted_model, './submissions/2_auto_ensemble.csv')

## Explainability

In [None]:
from sklearn.model_selection import train_test_split

# Hold out part of the hand-prepared features. The seed is fixed so the split is the
# same on every run; without it the rows are shuffled differently each time.
# NOTE(review): none of the four results are referenced by the cells below — confirm
# whether this split is still needed.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# Third AutoML configuration: 10 iterations with model explainability switched on.
# NOTE(review): unlike the other configurations in this notebook, no
# n_cross_validations is passed here — confirm that is intended.
config = AutoMLConfig(
    # What to learn, and from which data
    task='classification',
    training_data=df,
    label_column_name='Survived',
    # How candidate models are scored
    primary_metric='AUC_weighted',
    # Search budget and optional extras
    iterations=10,
    preprocess=True,
    model_explainability=True,
)


# Submit right away and stream progress into the cell output.
local_run = exp.submit(config, show_output=True)

In [None]:
from azureml.contrib.interpret.explanation.explanation_client import ExplanationClient

# Best child run and fitted model of the explainability-enabled AutoML run.
best_run, fitted_model = local_run.get_output()

# Download the explanation stored with the best run; raw=False requests the
# explanation in terms of engineered (rather than raw) features, per the variable name.
client = ExplanationClient.from_run(best_run)
engineered_explanations = client.download_model_explanation(raw=False)

In [None]:
# Show the feature importances from the downloaded explanation as a dictionary.
engineered_explanations.get_feature_importance_dict()

In [None]:
# Pull the title out of each name: the run of letters that follows a space and is
# terminated by a literal dot. The pattern is a raw string so that `\.` reaches the
# regex engine unchanged instead of being an invalid Python string escape.
df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=True)

# Cross-tabulate the extracted titles against sex.
pd.crosstab(df['Sex'], df['Title'])

In [None]:
# Fold variant titles into the common ones and lump the remaining listed titles
# together as 'Rare'. None of the replacement values is itself a key, so applying the
# mapping in one pass gives the same result as replacing one title at a time.
title_aliases = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
rare_titles = ['Lady', 'Countess', 'Capt', 'Col',
               'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
df['Title'] = df['Title'].replace(title_aliases).replace(rare_titles, 'Rare')

# A cell only renders its last expression, so both tables are displayed explicitly;
# previously the Sex/Title table was computed and silently discarded.
display(pd.crosstab(df['Sex'], df['Title']))
display(pd.crosstab(df['Survived'], df['Title']))

In [None]:
# Spot-check ten randomly chosen titles (unseeded, so the rows differ on every run).
df['Title'].sample(10)

In [None]:
# Training frame for the next run: same data minus the 'PassengerId' and 'Name' columns
# (the 'Title' column derived from 'Name' above is kept).
better_df = df.drop(columns=['PassengerId', 'Name'])

In [None]:
# Fourth AutoML configuration: 15 iterations on the simplified frame with the 'Title' feature.
config = AutoMLConfig(
    # What to learn, and from which data
    task='classification',
    training_data=better_df,
    label_column_name='Survived',
    # How candidate models are scored
    primary_metric='AUC_weighted',
    n_cross_validations=5,
    # Search budget and optional extras
    iterations=15,
    preprocess=True,
    model_explainability=False,
)

# Submit right away and stream progress into the cell output.
local_run = exp.submit(config, show_output=True)

In [None]:
# Fetch the best model of the run submitted just above. Without this, `fitted_model`
# would still be the model from the earlier explainability run, and this file would
# not reflect the simplified feature set at all.
best_run, fitted_model = local_run.get_output()

# NOTE(review): this model was trained on `better_df` (no 'PassengerId'/'Name', plus the
# engineered 'Title'), while generate_submission_file feeds it the raw test CSV —
# confirm the fitted pipeline accepts that schema, or add 'Title' to the test data first.
generate_submission_file(fitted_model, './submissions/3_auto_ensemble_simplified.csv')

# The End

Get the code from [github.com/vladiliescu/talks](https://github.com/vladiliescu/talks)