# Create your Zetabase account

In [None]:
from zbpy import client 

In [None]:
# IPython line magic that starts the Zetabase account-creation flow.
# NOTE(review): presumably registered by the `from zbpy import client` import in the previous cell — confirm.
%createaccount

# Create a pandas DataFrame 

In [None]:
import pandas as pd 
from sklearn import datasets
import numpy as np

# Read the training CSV into the DataFrame that the later cells upload to Zetabase.
train_csv_path = 'datasets/train.csv'
data = pd.read_csv(train_csv_path)

---
# Set up your Zetabase client with either ECDSA or JWT security:

### ECDSA

In [None]:
# Key-pair authentication: load both key files, then build the client,
# connect it, and register the keys on it.
pub_key_path = './zb/zetabase.1591621160.pub'
priv_key_path = './zb/zetabase.1591621160.priv'
pub_key = client.import_key(pub_key_path, public=True)
priv_key = client.import_key(priv_key_path, public=False)

# Identifier handed to ZetabaseClient (presumably the account's user id — confirm).
zetabase_id = '18259baf-b9e7-4cbd-9027-ca6a4dae1af1'
zb = client.ZetabaseClient(zetabase_id)
zb.connect()
zb.set_id_key(priv_key, pub_key)

### JWT

In [None]:
# Username/password authentication followed by a JWT login.
# Credentials are taken from the environment instead of being written into the
# notebook, so real secrets are never saved or committed with it:
#   ZETABASE_USERNAME - account handle (falls back to the tutorial's 'test_user')
#   ZETABASE_PASSWORD - account password (prompted for interactively when unset)
import os
from getpass import getpass

zb = client.ZetabaseClient('18259baf-b9e7-4cbd-9027-ca6a4dae1af1')
zb.connect()

zb_username = os.environ.get('ZETABASE_USERNAME', 'test_user')
# `or` (rather than a default argument) so an empty variable also triggers the prompt.
zb_password = os.environ.get('ZETABASE_PASSWORD') or getpass('Zetabase password: ')

zb.set_id_password(zb_username, zb_password)
zb.auth_login_jwt()

---
# Insert your DataFrame into an existing Zetabase table, or create a new table with indexed fields based on the DataFrame's columns

### Inserting into existing table

In [None]:
# Upload `data` into the already-existing 'titanic' table.
# The third argument appears to act as a key prefix (a later cell filters keys
# on 'feat/%' after uploading with 'feat') — confirm against the zbpy docs.
existing_table = 'titanic'
zb.put_dataframe(existing_table, data, 'raw')

### Inserting into new table

In [None]:
# Create the 'titanic_a14' table from `data` and upload the rows in one call.
new_table = 'titanic_a14'
zb.put_dataframe_new_table(new_table, data, 'raw', allow_jwt=True)

---
# List keys from your table, retrieve your data, and convert it back to a pandas DataFrame

In [None]:
# Enumerate every key in the table, fetch the matching records, and rebuild a DataFrame.
source_table = 'titanic_a14'

list_keys = zb.list_keys(source_table)
keys = list(list_keys)  # materialise the key iterable before passing it to get()

output_data = zb.get(source_table, keys)
df = output_data.to_dataframe()

---
# Engineer your features

In [None]:
# Feature engineering on `df`: derive FamSize, encode Sex/Cabin/FamSize through
# lookup tables, bucket Fare and Age into ordinal codes, fill gaps, and drop the
# columns that are not used as model inputs.
# NOTE: this cell consumes columns it later drops (SibSp, Parch, ...), so it must
# be run exactly once on a freshly retrieved `df`.

# Family size is the sum of the SibSp and Parch columns.
df['FamSize'] = df['SibSp'] + df['Parch']

# Lookup tables; values missing from a table become NaN after .map().
mapping_fam_size = {0: 0, 1: .25, 2: .5, 3: .75, 4: 1, 5: 1.25, 6: 1.5, 7: 1.75, 8: 2, 9: 2.25, 10: 2.5, 11: 2.75, 12: 3}
mapping_sex = {'male': 0, 'female': 1}
mapping_cabin = {'A': 0, 'B': .5, 'C': 1, 'D': 1.5, 'E': 2, 'F': 2.5, 'G': 3, 'T': 3.5}

df['Sex'] = df['Sex'].map(mapping_sex)

# Only the first character of the cabin string is kept before encoding.
df['Cabin'] = df['Cabin'].str[:1]
df['Cabin'] = df['Cabin'].map(mapping_cabin)

df['FamSize'] = df['FamSize'].map(mapping_fam_size)

# Bucket Fare into codes 0-3. Plain scalars are assigned (no trailing comma, which
# would turn the value into a one-element tuple) so pandas broadcasts the code to
# every selected row. Ranges are applied in ascending order and every code written
# is <= 20, so an already-coded row can never match a later range.
df.loc[df['Fare'] <= 20, 'Fare'] = 0
df.loc[(df['Fare'] > 20) & (df['Fare'] <= 40), 'Fare'] = 1
df.loc[(df['Fare'] > 40) & (df['Fare'] <= 100), 'Fare'] = 2
df.loc[df['Fare'] > 100, 'Fare'] = 3

# Bucket Age into codes 0-5 using the same ascending-order scheme (all codes <= 17).
df.loc[df['Age'] <= 17, 'Age'] = 0
df.loc[(df['Age'] > 17) & (df['Age'] <= 30), 'Age'] = 1
df.loc[(df['Age'] > 30) & (df['Age'] <= 40), 'Age'] = 2
df.loc[(df['Age'] > 40) & (df['Age'] <= 50), 'Age'] = 3
df.loc[(df['Age'] > 50) & (df['Age'] <= 60), 'Age'] = 4
df.loc[df['Age'] > 60, 'Age'] = 5

# Fill missing Cabin codes with the median code of the row's Pclass group.
# Assigning the result back (instead of calling fillna(inplace=True) on the
# df['Cabin'] selection) guarantees the frame itself is updated; the chained
# in-place form warns in recent pandas and is a no-op under Copy-on-Write.
cabin_median_by_class = df.groupby('Pclass')['Cabin'].transform('median')
df['Cabin'] = df['Cabin'].fillna(cabin_median_by_class)

# Any value still missing anywhere in the frame is replaced by 2.
df = df.fillna(2)

# Keep the passenger ids for the results table before the column is dropped.
pass_ids = df['PassengerId']
features_to_drop = ['Ticket', 'SibSp', 'Parch', 'Name', 'Embarked', 'PassengerId']
df = df.drop(columns=features_to_drop)

--- 
# Save your featurized data back into Zetabase 

In [None]:
# Store the engineered frame in the same table as the raw rows, passing 'feat'
# as the third argument so the later cell can select these rows with 'feat/%'.
features_table = 'titanic_a14'
zb.put_dataframe(features_table, df, 'feat')

# Retrieve only the featurized data and split it into training and testing groups

In [None]:
from sklearn.model_selection import train_test_split

# Fetch only the keys matching 'feat/%' (the featurized rows), rebuild a DataFrame,
# and split it into training and testing sets.
list_keys = zb.list_keys_with_pattern('titanic_a14', 'feat/%')
keys = list(list_keys)

data_feat = zb.get('titanic_a14', keys)
df_new = data_feat.to_dataframe()

# Model inputs, in the column order the classifier is trained on.
feature_columns = ['Pclass', 'Sex', 'Age', 'Fare', 'Cabin', 'FamSize']
x = df_new[feature_columns]
y = df_new['Survived']

# A fixed seed makes the 75/25 split identical on every Restart & Run All,
# so downstream results are reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=42)

---
# Create and fit your model to the training data 

In [None]:
from sklearn import ensemble 

# Gradient-boosted classifier with default hyperparameters. The seed is pinned
# so that repeated runs on the same training data produce the same model.
gb_clf = ensemble.GradientBoostingClassifier(random_state=42)

# Left as the last expression so the notebook displays the fitted estimator.
gb_clf.fit(x_train, y_train)

---
# Save your model to Zetabase  

In [None]:
import _pickle as cPickle 

# Serialise the fitted classifier to bytes and store it in the 'mlModels' table
# under a fixed key, replacing any previous entry (overwrite=True).
model_table = 'mlModels'
model_key = 'titanic_gb_clf'

model_to_save = cPickle.dumps(gb_clf)
zb.put_data(model_table, model_key, model_to_save, overwrite=True)

---

# Reload your pre-trained model 

In [None]:
# Fetch the stored model bytes by key and deserialise them back into a classifier.
key = 'titanic_gb_clf'
get_model = zb.get('mlModels', [key])
pickled_model = get_model.data()  # indexed by key below
model_bytes = pickled_model[key]

# NOTE(review): unpickling runs arbitrary code embedded in the payload — only
# load entries that you wrote to this table yourself.
titanic_clf = cPickle.loads(model_bytes)

# Make predictions with your model

In [None]:
# Predict survival for every featurized row and pair each prediction with its passenger id.

# errors='ignore' keeps this cell re-runnable: on a second run 'Survived' has
# already been removed and a plain drop would raise KeyError.
df_new = df_new.drop(columns=['Survived'], errors='ignore')

# Select exactly the columns of the training frame `x`, in the same order, so the
# model input does not depend on the column order returned by the Zetabase round trip.
prediction = titanic_clf.predict(df_new[x.columns])

# NOTE(review): `pass_ids` comes from the frame retrieved before featurization,
# while the predictions follow the row order of `df_new`; this assumes both
# retrievals return rows in the same order — confirm.
results = pd.DataFrame({
    'PassengerId': pass_ids,
    'Survived': prediction
})

print(results)

# Save your predictions to a new table 

In [None]:
# Create the 'titanic_preds1' table from `results` and upload the rows, passing
# 'preds' as the third argument.
# NOTE(review): the earlier new-table call also passed allow_jwt=True; confirm
# whether it is needed here when authenticating with JWT.
predictions_table = 'titanic_preds1'
zb.put_dataframe_new_table(predictions_table, results, 'preds')