In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
from speedml import Speedml

sml = Speedml('data/train.csv', 
              'data/test.csv', 
              target = 'Survived',
              uid = 'PassengerId')
sml.shape()



'train (891, 11) | test (418, 10)'

In [3]:
sml.eda()

Unnamed: 0,Results,Observations
Speedml Release,v0.9.3,Visit https://speedml.com for release notes.
Nulls,1014,Use feature.impute.
Outliers Upper,"[SibSp, Fare]",Positive skew (> 3). Use feature.outliers(upper).
Shape,"train (891, 11) | test (418, 10)",
Numerical Ratio,54%,Aim for 100% numerical.
Numerical High-cardinality,"[Age, Fare]",(>10) categories. Use feature.density
Numerical Categorical,"[Survived, Pclass, SibSp, Parch]",Use plot.ordinal.
Text High-cardinality,"[Ticket, Cabin]",(>10) categories. Use feature.labels.
Text Categorical,"[Sex, Embarked]",Use feature.labels or feature.mapping.
Text Unique,[Name],~80% unique. Use feature.extract or feature.drop.


In [4]:
sml.feature.outliers('Fare', upper=98)
sml.feature.outliers('SibSp', upper=99)

'Fixed 17 or 1.91% upper outliers. '

'Fixed 7 or 0.79% upper outliers. '

In [5]:
sml.feature.density('Age')
sml.feature.density('Ticket')
sml.train[['Ticket', 'Ticket_density']].head()

Unnamed: 0,Ticket,Ticket_density
0,A/5 21171,1
1,PC 17599,1
2,STON/O2. 3101282,1
3,113803,2
4,373450,1


In [6]:
sml.feature.drop(['Ticket'])
sml.feature.fillna(a='Cabin', new='Z')
sml.feature.extract(new='Deck', a='Cabin', regex='([A-Z]){1}')
sml.feature.drop(['Cabin'])
sml.feature.mapping('Sex', {'male': 0, 'female': 1})
sml.feature.sum(new='FamilySize', a='Parch', b='SibSp')
sml.feature.add('FamilySize', 1)

'Dropped 1 features with 12 features available.'

'Filled 1014 null values across test and train datasets.'

'Dropped 1 features with 12 features available.'

In [7]:
sml.feature.drop(['Parch', 'SibSp'])
sml.feature.impute()
sml.feature.extract(new='Title', a='Name', regex=' ([A-Za-z]+)\.')

'Dropped 2 features with 11 features available.'

'Imputed 179 empty values to 0.'

In [8]:
sml.feature.replace(a='Title', match=['Lady', 'Countess','Capt', 'Col',\
'Don', 'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'], new='Rare')
sml.feature.replace('Title', 'Mlle', 'Miss')
sml.feature.replace('Title', 'Ms', 'Miss')
sml.feature.replace('Title', 'Mme', 'Mrs')

sml.feature.drop(['Name'])
sml.feature.labels(['Title', 'Embarked', 'Deck'])

'Replaced matching list of strings across train and test datasets.'

'Replaced 2 matching values across train and test datasets.'

'Replaced 2 matching values across train and test datasets.'

'Replaced 1 matching values across train and test datasets.'

'Dropped 1 features with 11 features available.'

In [9]:
sml.model.data()

'train_X: (891, 10) train_y: (891,) test_X: (418, 10)'

In [10]:
result = sml.eda()
result

Unnamed: 0,Results,Observations
Speedml Release,v0.9.3,Visit https://speedml.com for release notes.
Shape,"train (891, 11) | test (418, 10)",
Numerical High-cardinality,"[Age, Age_density, Fare]",(>10) categories. Use feature.density
Numerical Categorical,"[Deck, Embarked, FamilySize, Pclass, Sex, Surv...",Use plot.ordinal.
Target Analysis (Survived),Model ready.,Use classification models.


In [11]:
cat_cols = result['Results'][3]
cat_cols

['Deck',
 'Embarked',
 'FamilySize',
 'Pclass',
 'Sex',
 'Survived',
 'Ticket_density',
 'Title']

In [12]:
cat_cols.remove('Survived')

In [13]:
train = sml.train
test = sml.test
train_length = train.shape[0]

In [14]:
all_df = pd.concat([train, test], axis=0)
print(all_df.shape)

(1309, 11)


In [15]:
all_df = pd.get_dummies(all_df, columns=cat_cols)
all_df.head(3)

Unnamed: 0,Age,Age_density,Fare,Survived,Deck_0,Deck_1,Deck_2,Deck_3,Deck_4,Deck_5,...,Ticket_density_3,Ticket_density_4,Ticket_density_5,Ticket_density_6,Ticket_density_7,Title_0,Title_1,Title_2,Title_3,Title_4
0,22.0,27,7.25,0.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,38.0,11,71.2833,1.0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2,26.0,18,7.925,1.0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [19]:
train = all_df.iloc[:train_length]
test = all_df.iloc[train_length:]
test.drop('Survived', inplace=True, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [20]:
train.to_csv('data\preprocessed_train.csv', float_format="%.6f", encoding='utf-8', index=False)
test.to_csv('data\preprocessed_test.csv', float_format="%.6f", encoding='utf-8', index=False)

In [21]:
print(train.shape)
print(test.shape)

(891, 42)
(418, 41)
