In [1]:
import pandas as pd

  return f(*args, **kwds)
  return f(*args, **kwds)


In [3]:
# Toy frame with a nominal, a yes/no, an ordinal and a numeric column;
# three of the four columns contain exactly one missing value.
X = pd.DataFrame(dict(
    city=['tokyo', None, 'london', 'seattle', 'san francisco', 'tokyo'],
    boolean=['yes', 'no', None, 'no', 'no', 'yes'],
    ordinal_column=['somewhat like', 'like', 'somewhat like', 'like', 'somewhat like', 'dislike'],
    quantitative_column=[1, 11, -0.5, 10, None, 20],
))
X

Unnamed: 0,boolean,city,ordinal_column,quantitative_column
0,yes,tokyo,somewhat like,1.0
1,no,,like,11.0
2,,london,somewhat like,-0.5
3,no,seattle,like,10.0
4,no,san francisco,somewhat like,
5,yes,tokyo,dislike,20.0


In [4]:
X.isnull().sum()

boolean                1
city                   1
ordinal_column         0
quantitative_column    1
dtype: int64

In [5]:
# Let's impute some values. scikit-learn's imputer has a most_frequent option, but the
# legacy Imputer only works on numeric data (categories would have to be integers).
#
# Imputer was deprecated in scikit-learn 0.20 and removed in 0.22 in favour of
# sklearn.impute.SimpleImputer, which takes the same `strategy` argument. Import it
# under the old name so the cells below run on both old and new releases.
try:
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    from sklearn.preprocessing import Imputer
from sklearn.pipeline import Pipeline

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


In [6]:
X['city'].value_counts().index[0]  
# Most common category

'tokyo'

In [7]:
X['city'].fillna(X['city'].value_counts().index[0])  
# Fill empty slots with most common category

0            tokyo
1            tokyo
2           london
3          seattle
4    san francisco
5            tokyo
Name: city, dtype: object

## Custom category imputer

1. Pipelines allow us to sequentially apply a list of transforms and a final estimator
2. Intermediate steps of the pipeline must be transforms, meaning they must implement <b>fit</b> and <b>transform</b> methods
3. The final estimator only needs to implement <b>fit</b>

In [21]:
from sklearn.base import TransformerMixin

In [10]:
class CustomCategoryImputer(TransformerMixin):
    """Fill missing values in categorical columns with the most frequent value.

    Parameters
    ----------
    cols : list of str, optional
        Names of the columns to impute. ``None`` (the default) imputes nothing.

    Notes
    -----
    ``fit`` records the most frequent value of every selected column, and
    ``transform`` fills missing entries with those recorded values, so new
    data (e.g. a test split) is imputed with what was learned from the fitted
    data. If ``fit`` was called without data, ``transform`` falls back to the
    most frequent value of the frame it is given. A column with no non-null
    values is left untouched.
    """

    def __init__(self, cols=None):
        self.cols = cols

    def _selected_cols(self):
        """Return the configured column names, treating ``None`` as 'no columns'."""
        return self.cols if self.cols is not None else []

    @staticmethod
    def _most_frequent(series):
        """Return the most frequent non-null value of ``series``, or None if there is none."""
        counts = series.value_counts()
        return counts.index[0] if len(counts) else None

    def fit(self, df=None, *_):
        """Learn the fill value of each selected column from ``df``.

        Extra positional arguments (such as the ``y`` a Pipeline passes) are
        ignored. Returns ``self``.
        """
        self.fill_values_ = {}
        if df is not None:
            for col in self._selected_cols():
                self.fill_values_[col] = self._most_frequent(df[col])
        return self

    def transform(self, df):
        """Return a copy of ``df`` with the selected columns imputed."""
        X = df.copy()
        learned = getattr(self, 'fill_values_', {})
        for col in self._selected_cols():
            fill_value = learned.get(col)
            if fill_value is None:
                # Not fitted on data (or nothing learnable): use this frame's own mode.
                fill_value = self._most_frequent(X[col])
            if fill_value is not None:
                # Assign the result back instead of fillna(inplace=True) on a column
                # selection, which may operate on a temporary object.
                X[col] = X[col].fillna(fill_value)
        return X

In [11]:
cci = CustomCategoryImputer(cols=['city', 'boolean'])

In [12]:
cci.fit_transform(X)

Unnamed: 0,boolean,city,ordinal_column,quantitative_column
0,yes,tokyo,somewhat like,1.0
1,no,tokyo,like,11.0
2,no,london,somewhat like,-0.5
3,no,seattle,like,10.0
4,no,san francisco,somewhat like,
5,yes,tokyo,dislike,20.0


## Custom quantitative imputer

In [13]:
# Let's make an imputer that can apply a strategy to select columns by name
class CustomQuantitativeImputer(TransformerMixin):
    """Impute missing values in selected numeric columns with an ``Imputer``.

    Parameters
    ----------
    cols : list of str, optional
        Names of the columns to impute. ``None`` (the default) imputes nothing.
    strategy : str, default 'mean'
        Passed through unchanged as the ``strategy`` argument of ``Imputer``.

    Notes
    -----
    One imputer per column is fitted in ``fit`` and reused in ``transform``,
    so new data (e.g. a test split) is filled with the statistic learned from
    the fitted data. If ``fit`` was called without data, ``transform`` falls
    back to fitting on the frame it is given.
    """

    def __init__(self, cols=None, strategy='mean'):
        self.cols = cols
        self.strategy = strategy

    def _selected_cols(self):
        """Return the configured column names, treating ``None`` as 'no columns'."""
        return self.cols if self.cols is not None else []

    def fit(self, df=None, *_):
        """Fit one imputer per selected column on ``df``.

        Extra positional arguments (such as the ``y`` a Pipeline passes) are
        ignored. Returns ``self``.
        """
        self.imputers_ = {}
        if df is not None:
            for col in self._selected_cols():
                # Double brackets keep the column two-dimensional, as the imputer expects.
                self.imputers_[col] = Imputer(strategy=self.strategy).fit(df[[col]])
        return self

    def transform(self, df):
        """Return a copy of ``df`` with the selected columns imputed."""
        X = df.copy()
        fitted = getattr(self, 'imputers_', {})
        for col in self._selected_cols():
            col_imputer = fitted.get(col)
            if col_imputer is None:
                # Not fitted on data for this column: learn the statistic from this frame.
                col_imputer = Imputer(strategy=self.strategy).fit(X[[col]])
            X[col] = col_imputer.transform(X[[col]])
        return X

In [14]:
cqi = CustomQuantitativeImputer(cols=['quantitative_column'], strategy='mean')

cqi.fit_transform(X)

Unnamed: 0,boolean,city,ordinal_column,quantitative_column
0,yes,tokyo,somewhat like,1.0
1,no,,like,11.0
2,,london,somewhat like,-0.5
3,no,seattle,like,10.0
4,no,san francisco,somewhat like,8.3
5,yes,tokyo,dislike,20.0


In [None]:
# Setup Pipeline() so that we can transform our dataset in one go 

In [15]:
imputer = Pipeline([('quant', cqi), ('category', cci)])

imputer.fit_transform(X)  # ready for action

Unnamed: 0,boolean,city,ordinal_column,quantitative_column
0,yes,tokyo,somewhat like,1.0
1,no,tokyo,like,11.0
2,no,london,somewhat like,-0.5
3,no,seattle,like,10.0
4,no,san francisco,somewhat like,8.3
5,yes,tokyo,dislike,20.0


## Custom dummifier for nominal level

To transform our categorical data into dummy variables:
1. Utilize pandas to automatically find the categorical variables and dummy code them
2. Create our own custom transformer using dummy variables to work in a pipeline

<b>Dummy variables</b> take the value zero or one to indicate the absence or presence of a category. They are proxy variables, or numerical stand-ins, for qualitative data. It's important to be aware of and avoid the <b>dummy variable trap</b>. The dummy variable trap is when you have independent variables that are multicollinear, or highly correlated. Simply put, these variables can be predicted from each other. For example, if every level of a category gets its own dummy column, any one of those columns can be computed from the others, which is why one level is often dropped.

In [18]:
# With no `columns` argument, get_dummies finds the categorical variables automatically
pd.get_dummies(X, prefix_sep='__')
# prefix_sep is the separator between the prefix (column name) and the cell value

Unnamed: 0,quantitative_column,boolean__no,boolean__yes,city__london,city__san francisco,city__seattle,city__tokyo,ordinal_column__dislike,ordinal_column__like,ordinal_column__somewhat like
0,1.0,0,1,0,0,0,1,0,0,1
1,11.0,1,0,0,0,0,0,0,1,0
2,-0.5,0,0,1,0,0,0,0,0,1
3,10.0,1,0,0,0,1,0,0,1,0
4,,1,0,0,1,0,0,0,0,1
5,20.0,0,1,0,0,0,1,1,0,0


In [20]:
# For ordinal columns, we don't want to dummify
pd.get_dummies(X,
               # Which columns to dummify
               columns = ['city', 'boolean'],
               # The separator between the prefix (column name) and cell value
               prefix_sep='__')

Unnamed: 0,ordinal_column,quantitative_column,city__london,city__san francisco,city__seattle,city__tokyo,boolean__no,boolean__yes
0,somewhat like,1.0,0,0,0,1,0,1
1,like,11.0,0,0,0,0,1,0
2,somewhat like,-0.5,1,0,0,0,0,0
3,like,10.0,0,0,1,0,1,0
4,somewhat like,,0,1,0,0,1,0
5,dislike,20.0,0,0,0,1,0,1


In [23]:
class CustomDummifier(TransformerMixin):
    """Pipeline-compatible wrapper around ``pd.get_dummies``.

    Parameters
    ----------
    cols : list of str, optional
        Forwarded unchanged as the ``columns`` argument of ``pd.get_dummies``.

    Notes
    -----
    Nothing is learned in ``fit``; the dummy columns are computed from
    whatever frame is handed to ``transform``.
    """

    def __init__(self, cols=None):
        self.cols = cols

    def fit(self, *_):
        """Do nothing and return ``self``; any arguments are ignored."""
        return self

    def transform(self, X):
        """Return ``X`` with the configured columns dummy-coded."""
        dummified = pd.get_dummies(X, columns=self.cols)
        return dummified

In [24]:
cd = CustomDummifier(cols=['boolean', 'city'])

cd.fit_transform(X)

Unnamed: 0,ordinal_column,quantitative_column,boolean_no,boolean_yes,city_london,city_san francisco,city_seattle,city_tokyo
0,somewhat like,1.0,0,1,0,0,0,1
1,like,11.0,1,0,0,0,0,0
2,somewhat like,-0.5,0,0,1,0,0,0
3,like,10.0,1,0,0,0,1,0
4,somewhat like,,1,0,0,1,0,0
5,dislike,20.0,0,1,0,0,0,1


## Custom encoder for ordinal level

At the ordinal level, the order of the values carries meaning, so it does not make sense to use dummy variables. To preserve that order, we instead use a <b>label encoder</b>, which associates each label in our ordinal data with a numerical value.

In [26]:
# The ordinal_column holds strings, but we still want to use it as a feature.
# Its position in `ordering` becomes its code:
# 0 for dislike, 1 for somewhat like, and 2 for like
ordering = ['dislike', 'somewhat like', 'like']

print(X['ordinal_column'])
# list.index is already a one-argument callable, so it can be handed to map directly
print(X['ordinal_column'].map(ordering.index))

0    somewhat like
1             like
2    somewhat like
3             like
4    somewhat like
5          dislike
Name: ordinal_column, dtype: object
0    1
1    2
2    1
3    2
4    1
5    0
Name: ordinal_column, dtype: int64


In [27]:
class CustomEncoder(TransformerMixin):
    """Replace the labels of an ordinal column with their rank in ``ordering``.

    Parameters
    ----------
    col : str
        Name of the column to encode.
    ordering : list, optional
        Labels listed from lowest to highest; a label is encoded as its
        position in this list (the first occurrence wins if it is repeated).

    Notes
    -----
    Missing values are passed through as missing instead of raising.

    Raises
    ------
    ValueError
        From ``transform`` when the column contains a non-null label that is
        not present in ``ordering``.
    """

    def __init__(self, col, ordering=None):
        self.ordering = ordering
        self.col = col

    def transform(self, df):
        """Return a copy of ``df`` with ``col`` encoded as ordinal ranks."""
        X = df.copy()
        labels = self.ordering if self.ordering is not None else []
        # Build the label -> rank lookup once instead of searching the list per row;
        # setdefault keeps the first position of a repeated label, like list.index.
        ranks = {}
        for position, label in enumerate(labels):
            ranks.setdefault(label, position)

        values = X[self.col]
        unseen = values[values.notnull() & ~values.isin(list(ranks))]
        if len(unseen):
            raise ValueError(
                "column %r contains labels that are not in ordering: %r"
                % (self.col, unseen.unique().tolist())
            )
        # Mapping through a dict leaves missing entries as NaN.
        X[self.col] = values.map(ranks)
        return X

    def fit(self, *_):
        """Do nothing and return ``self``; any arguments are ignored."""
        return self

In [28]:
ce = CustomEncoder(col='ordinal_column', ordering = ['dislike', 'somewhat like', 'like'])

ce.fit_transform(X)

Unnamed: 0,boolean,city,ordinal_column,quantitative_column
0,yes,tokyo,1,1.0
1,no,,2,11.0
2,,london,1,-0.5
3,no,seattle,2,10.0
4,no,san francisco,1,
5,yes,tokyo,0,20.0


## Custom cutter for bucketing continuous features into categories

In [29]:
# Name of category is the bin by default
pd.cut(X['quantitative_column'], bins=3)

0     (-0.52, 6.333]
1    (6.333, 13.167]
2     (-0.52, 6.333]
3    (6.333, 13.167]
4                NaN
5     (13.167, 20.0]
Name: quantitative_column, dtype: category
Categories (3, interval[float64]): [(-0.52, 6.333] < (6.333, 13.167] < (13.167, 20.0]]

In [30]:
# Use no labels
pd.cut(X['quantitative_column'], bins=3, labels=False)

0    0.0
1    1.0
2    0.0
3    1.0
4    NaN
5    2.0
Name: quantitative_column, dtype: float64

In [31]:
# Use pre-made labels
group_names = ['Low', 'Okay', 'Good']
pd.cut(X['quantitative_column'], bins=3, labels=group_names)

0     Low
1    Okay
2     Low
3    Okay
4     NaN
5    Good
Name: quantitative_column, dtype: category
Categories (3, object): [Low < Okay < Good]

In [32]:
class CustomCutter(TransformerMixin):
    """Pipeline-compatible wrapper that bins one column with ``pd.cut``.

    Parameters
    ----------
    col : str
        Name of the column to bin.
    bins
        Forwarded unchanged as the ``bins`` argument of ``pd.cut``.
    labels : default False
        Forwarded unchanged as the ``labels`` argument of ``pd.cut``.

    Notes
    -----
    Nothing is learned in ``fit``; the binning is computed on whatever frame
    is handed to ``transform``.
    """

    def __init__(self, col, bins, labels=False):
        self.col = col
        self.bins = bins
        self.labels = labels

    def fit(self, *_):
        """Do nothing and return ``self``; any arguments are ignored."""
        return self

    def transform(self, df):
        """Return a copy of ``df`` in which ``col`` is replaced by its bins."""
        X = df.copy()
        binned = pd.cut(X[self.col], bins=self.bins, labels=self.labels)
        X[self.col] = binned
        return X

In [33]:
cc = CustomCutter(col='quantitative_column', bins=3)

cc.fit_transform(X)
# Note that the output of this is an ordinal column, meaning there is no need to dummify them

Unnamed: 0,boolean,city,ordinal_column,quantitative_column
0,yes,tokyo,somewhat like,0.0
1,no,,like,1.0
2,,london,somewhat like,0.0
3,no,seattle,like,1.0
4,no,san francisco,somewhat like,
5,yes,tokyo,dislike,2.0


## Pipeline

1. <b>Imputer</b>: fill in missing values for both categorical and numerical data using fillna() or Imputer()
2. <b>Dummifier</b>: nominal data
3. <b>Encoder</b>: Ordinal data
4. <b>Cutter</b>: bucket the quantitative data

In [None]:
# boolean, city: dummy encoding
# ordinal_column: label encoding
# quantitative_column: ordinal level data

In [34]:
pipe = Pipeline([("imputer", imputer), ('dummify', cd), ('encode', ce), ('cut', cc)])

In [35]:
# Take a look at the original data
X

Unnamed: 0,boolean,city,ordinal_column,quantitative_column
0,yes,tokyo,somewhat like,1.0
1,no,,like,11.0
2,,london,somewhat like,-0.5
3,no,seattle,like,10.0
4,no,san francisco,somewhat like,
5,yes,tokyo,dislike,20.0


In [36]:
pipe.fit(X)

Pipeline(memory=None,
     steps=[('imputer', Pipeline(memory=None,
     steps=[('quant', <__main__.CustomQuantitativeImputer object at 0x7ff991980748>), ('category', <__main__.CustomCategoryImputer object at 0x7ff9919802e8>)])), ('dummify', <__main__.CustomDummifier object at 0x7ff99199b400>), ('encode', <__main__.CustomEncoder object at 0x7ff99199bcf8>), ('cut', <__main__.CustomCutter object at 0x7ff9919abcc0>)])

In [37]:
pipe.transform(X)

Unnamed: 0,ordinal_column,quantitative_column,boolean_no,boolean_yes,city_london,city_san francisco,city_seattle,city_tokyo
0,1,0,0,1,0,0,0,1
1,2,1,1,0,0,0,0,1
2,1,0,1,0,1,0,0,0
3,2,1,1,0,0,0,1,0
4,1,1,1,0,0,1,0,0
5,0,2,0,1,0,0,0,1


# Activity Recognition from Single Chest-Mounted Accelerometer

In [39]:
import glob

pandas concatenation:
1. <b>pd.concat()</b> function: the most multi-purpose and can be used to combine multiple DataFrames along either axis.
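2. <b>DataFrame.append()</b> method: a quick way to add rows to your DataFrame, but not applicable for adding columns. Note that it was deprecated in pandas 1.4 and removed in pandas 2.0; use <b>pd.concat()</b> instead on recent versions.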
3. <b>pd.merge()</b> function: great for joining two DataFrames together when we have one column (key) containing common values.
4. <b>DataFrame.join()</b> method: a quicker way to join two DataFrames, but works only off index labels rather than columns.

In [None]:
# Use append()
# path = r'/home/yungshun/workspace/py3/feature-construction/datasets'
# filenames = glob.glob(path + "/*.csv")
# df = pd.DataFrame(columns=['index', 'x', 'y', 'z', 'activity']) 
# for filename in filenames:
#     df = df.append(pd.read_csv(filename, names=['index', 'x', 'y', 'z', 'activity']), ignore_index=True)

In [111]:
# Load the first recording. The file has no header row, so the column names
# are supplied directly to read_csv.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# own copy of the dataset before running.
df = pd.read_csv('/home/yungshun/workspace/py3/feature-construction/datasets/1.csv',
                 header=None,
                 names=['index', 'x', 'y', 'z', 'activity'])
# Preview the first rows only instead of rendering the whole frame
df.head(10)

Unnamed: 0,index,x,y,z,activity
0,0.0,1502,2215,2153,1
1,1.0,1667,2072,2047,1
2,2.0,1611,1957,1906,1
3,3.0,1601,1939,1831,1
4,4.0,1643,1965,1879,1
5,5.0,1604,1959,1921,1
6,6.0,1640,1829,1940,1
7,7.0,1607,1910,1910,1
8,8.0,1546,2045,1910,1
9,9.0,1529,2049,1972,1


In [112]:
# null accuracy (to beat) is 0.515369
df['activity'].value_counts(normalize=True)

7    0.515369
1    0.207242
4    0.165291
3    0.068793
5    0.019637
6    0.017951
2    0.005711
0    0.000006
Name: activity, dtype: float64

In [None]:
# Machine learning

In [85]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

  return f(*args, **kwds)


In [113]:
# Create our feature matrix from the three accelerometer axes, leaving out the response variable.
# Note that this rebinds `X`, which earlier cells used for the small demo frame.
X = df[['x', 'y', 'z']]
# The response variable: the activity label
y = df['activity']

# our grid search variables and instances
# KNN parameters to try
knn_params = {'n_neighbors':[3, 4, 5, 6]}

knn = KNeighborsClassifier()
# NOTE(review): no `cv` or `scoring` is passed, so the number of folds and the metric
# are the library defaults, which can differ between scikit-learn versions.
grid = GridSearchCV(knn, knn_params)
grid.fit(X, y)

# Best cross-validated score and the parameter setting that achieved it
print(grid.best_score_, grid.best_params_)



0.720752487676999 {'n_neighbors': 5}
