In [7]:
import numpy as np
import pandas as pd

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import IPython.display as ipd
import librosa.display

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import pandas as pd


In [3]:
data = pd.read_csv("./diabetic_data.csv")
data.replace('?',np.nan,inplace=True)

In [63]:
from math import log10
class ExploratoryDataAnalysis:
    def __init__(self,data):
        self.data = data
    
    def get_feature_class_count(self,col):
        count_map = {}
        for x in col:
            if(x in count_map):
                count_map[x] = count_map[x] + 1
            else:
                count_map[x] = 1

        return log10(len(count_map))
    
    def get_missing_counts(self,col):
        return len(col) - col.count()
    
    def plot_class_counts(self,plot_missing_feature_count = False):
        class_counts = [self.get_feature_class_count(self.data[column]) for column in self.data]
        data = [go.Histogram(x=self.data.columns, y=class_counts)]
        trace = go.Bar(
            x=self.data.columns,
            y=class_counts
        )
        layout = go.Layout(
            title='Feature counts | Total number of rows=' + str(len(self.data)),
            xaxis = dict(title='Features'),
            yaxis = dict(title='Log(Number of unique values)')
        )
        py.iplot(go.Figure(data=[trace], layout=layout))
    
    def plot_class_count(self):
        count = 0
        for x in self.data["readmitted"]:
            if(x == "NO"):
                count = count + 1

        trace = go.Bar(
            x=["YES","NO"],
            y=[(len(self.data) - count),count]
        )
        layout = go.Layout(
            title='Class counts | Total number of rows=' + str(len(self.data)),
            xaxis = dict(title='Features'),
            yaxis = dict(title='Log(Number of unique values)')
        )
        py.iplot(go.Figure(data=[trace], layout=layout))
        
    def plot_missing_count(self):
        class_counts = [len(self.data[column]) for column in self.data]
        missing_counts = [self.get_missing_counts(self.data[column]) for column in self.data]
        data = [go.Histogram(x=self.data.columns, y=class_counts)]
        trace = go.Bar(
            x=self.data.columns,
            y=class_counts
        )
        trace_missing = go.Bar(
            x=self.data.columns,
            y=missing_counts
        )
        layout = go.Layout(
            title='Feature counts | Total number of rows=' + str(len(self.data)),
            xaxis = dict(title='Features'),
            yaxis = dict(title='Log(Number of unique values)')
        )
        py.iplot(go.Figure(data=[trace,trace_missing], layout=layout))
        

    

In [64]:
EDA = ExploratoryDataAnalysis(data)
EDA.plot_class_counts()
EDA.plot_class_count()
EDA.plot_missing_count()

In [105]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from scipy.sparse import hstack,vstack
class CategoricalEncoder:
    def __init__(self):
        self.label_encoder = LabelEncoder()
        self.one_hot_encoder = OneHotEncoder()
       
    def fit(self,data):
        labels = self.label_encoder.fit(data)
        return self.one_hot_encoder.fit([labels])
   
    def fit_transform(self,data):
        labels = self.label_encoder.fit_transform(data)
        labels = labels.reshape(-1,1)
        encoded_data = self.one_hot_encoder.fit_transform(labels)
        return encoded_data
   
    def transform(self,data):
        labels = self.label_encoder.transform(data)
        return self.one_hot_encoder.transform([labels])
    
class FeatureEngineering:
       
    def get_encoding(self,column):
        categorical_encoder = CategoricalEncoder()
        return categorical_encoder.fit_transform(column), categorical_encoder
    
    def fit(self,data):
        return data
    
    def fit_transform(self,data):
        return data
    
    def transform(self,data):
        return data
    

In [102]:
feature_engineering = FeatureEngineering(data)
# encoded_data,encoder = feature_engineering.get_encoding(data["race"].astype('str'))
# print(encoded_data)
num_encoded_data,encoder = feature_engineering.get_encoding(data["admission_source_id"].astype('str'))
print(num_encoded_data)

  (0, 0)	1.0
  (1, 14)	1.0
  (2, 14)	1.0
  (3, 14)	1.0
  (4, 14)	1.0
  (5, 6)	1.0
  (6, 6)	1.0
  (7, 14)	1.0
  (8, 11)	1.0
  (9, 11)	1.0
  (10, 14)	1.0
  (11, 11)	1.0
  (12, 14)	1.0
  (13, 14)	1.0
  (14, 6)	1.0
  (15, 14)	1.0
  (16, 14)	1.0
  (17, 14)	1.0
  (18, 14)	1.0
  (19, 6)	1.0
  (20, 11)	1.0
  (21, 11)	1.0
  (22, 11)	1.0
  (23, 0)	1.0
  (24, 6)	1.0
  :	:
  (101741, 14)	1.0
  (101742, 6)	1.0
  (101743, 14)	1.0
  (101744, 14)	1.0
  (101745, 0)	1.0
  (101746, 14)	1.0
  (101747, 14)	1.0
  (101748, 14)	1.0
  (101749, 0)	1.0
  (101750, 0)	1.0
  (101751, 0)	1.0
  (101752, 0)	1.0
  (101753, 14)	1.0
  (101754, 14)	1.0
  (101755, 14)	1.0
  (101756, 14)	1.0
  (101757, 14)	1.0
  (101758, 14)	1.0
  (101759, 14)	1.0
  (101760, 14)	1.0
  (101761, 14)	1.0
  (101762, 12)	1.0
  (101763, 14)	1.0
  (101764, 14)	1.0
  (101765, 14)	1.0



The handling of integer data will change in version 0.22. Currently, the categories are determined based on the range [0, max(values)], while in the future they will be determined based on the unique values.
In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.



In [65]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
pipeline = Pipeline([FeatureEngineering(),LogisticRegression()])
pipeline.fit(data,y)
pipeline.score(data,y)
    
    

NameError: name 'FeatureEngineering' is not defined