In [11]:
import pandas as pd # to read the csv
import random # generate random numbers for train test split
import math # to calculate log2 probabilities
df = pd.read_csv("winequality-red.csv", sep=';')
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [12]:
# train test split function
def split_train_test(df, train=0.60):
    train_size = round(len(df) * train)
    train_indices = random.sample(population=df.index.tolist(), k=train_size)
    train_df = df.loc[train_indices]
    test_df = df.loc[set(df.index) - set(train_df.index)] #get rest of index
    return train_df, test_df
    
    

In [13]:
# example
train_df, test_df = split_train_test(df=df, train=0.5)
print(train_df.size)
print(test_df.size)
print(df.size == test_df.size + train_df.size)

9600
9588
True


In [14]:
#for continuous target
# Split criteria based on reduction of variance (so maximising SSR)
###attention group 1 is <= split !!!
def variance_SSR(df, feature, split, target):
    mean_target_group1 = df[df[feature]<=split][target].mean()
    len_group1 = sum(df[feature]<=split)
    mean_target_group2 = df[df[feature]>split][target].mean()
    len_group2 = len(df.index) - len_group1
    mean = df[target].mean()
    variance_SSR = len_group1 * (mean_target_group1 - mean)**2 + len_group2 * (mean_target_group2 - mean)**2
    return variance_SSR



In [15]:
# example
variance_SSR(df, 'fixed acidity', 5.5, 'quality')


10.823548170824402

In [16]:
# Alternate function, find max of SSR for all possible splits for a specific variable

def variance_SSR_max(df, feature, target):
    splits = df[feature].unique() # all possible splits are all unique values, except the first value
    max_SSR = 0
    mean = df[target].mean()
    for split in splits[1:]: # We have to exclude the first value as split is <=
        mean_target_group1 = df[df[feature]<=split][target].mean()
        mean_target_group2 = df[df[feature]>split][target].mean()
        len_group1 = sum(df[feature]<=split)
        len_group2 = len(df.index) - len_group1
        variance_SSR = len_group1 * (mean_target_group1 - mean)**2 + len_group2 * (mean_target_group2 - mean)**2
        if variance_SSR > max_SSR:
            best_split = split
            max_SSR=variance_SSR
    return best_split



In [17]:
variance_SSR_max(df,'fixed acidity', 'quality')

9.9000000000000004

In [18]:
# Alternate function to find the best split out of all possible splits (so for all variables):

def variance_SSR_max(df, target):
    max_SSR = 0
    for column in df: 
        if column =='quality': # We can't splt on the target variable
            continue
        splits = df[column].unique() # Possible splits are all the unique values, except the last value (because of <=)
        mean = df[column].mean()
        for split in splits[:-1]: # We have to exclude the last value as split
            mean_target_group1 = df[df[column]<=split][target].mean()
            mean_target_group2 = df[df[column]>split][target].mean()
            len_group1 = sum(df[column]<=split)
            len_group2 = len(df.index) - len_group1
            variance_SSR = len_group1 * (mean_target_group1 - mean)**2 + len_group2 * (mean_target_group2 - mean)**2
            if variance_SSR > max_SSR:
                best_split = split
                max_SSR = variance_SSR
                best_column = column
    return best_split,best_column
    
    

In [19]:
variance_SSR_max(df, 'quality')

(59.0, 'total sulfur dioxide')

# What about if the target is not continuous?
# We will use another data set, and calculate Gini or entropy instead. 


In [21]:
iris = pd.read_csv('iris_data.csv', sep=',')
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [22]:
# First step is to find the pk
def pk(df, target):
    categories = df[target].unique()
    pk = []
    i = 0
    for category in categories:
        pk.append(sum(df[target]==category) / len(df))
        i = i +1
    return categories, pk



In [23]:
pk(iris, 'species')

(array(['setosa', 'versicolor', 'virginica'], dtype=object),
 [0.33333333333333331, 0.33333333333333331, 0.33333333333333331])

In [24]:
# Gini function
def gini(df, target):
    categories = df[target].unique()
    pk = []
    i = 0
    for category in categories:
        pk.append(sum(df[target]==category) / len(df)) # Find proportion in each class
        i = i +1
    return 1-sum([p ** 2 for p in pk]) # Return gini



In [25]:
# Entropy function
def entropy(df, target):
    categories = df[target].unique()
    pk = []
    i = 0
    for category in categories:
        pk.append(sum(df[target]==category) / len(df)) # Find proportion in each class
        i = i +1
    return -1 * sum([p * math.log2(p) for p in pk]) # Return entropy



In [26]:
entropy(iris, 'species')

1.5849625007211561

In [27]:
gini(iris, 'species')

0.66666666666666674

In [28]:
# Next we need to find all possible binary splits
#### does not work with return, only works with yield, why?
# Bacically the idea here is to generate the subsets (possible splits) in a smart way.
# This is a recursive algo. Basically, you start with the first category. Call this left. 
# Next you take the second category and set it aside. Call this right. 
#The way to create a subset of size n when you have all the subsets of size (n-1) is to add the
# new element to the left, then to the right, and then create a new subset with it on the left. 
#ex for n=3:
# 1
# 1 2
# 13  2
# 1   23
# 3   12
#for n=4:
# same as n=3 plus:
# 134  2
# 13   24
# 14   23
# 1    234
# 34   12
# 3    124
# 4    123
def cat_split(categories):
    if len(categories)==1:
        yield [categories]
    else:
        first = categories[0]
        for next_one in cat_split(categories[1:]): # need to exclude first category, as stored in 'first'
            for i, subset in enumerate(next_one):
                yield next_one[:i] + [[ first ] + subset] + next_one[i+1:]
            yield [[first]] + next_one


In [29]:
#something = iris['species'][1:4].unique().tolist() #test if only one category
something = iris['species'].unique().tolist()
for n, p in enumerate(cat_split(something), 1):
    if len(p)==2: # binary splits only
        print(n, sorted(p))

2 [['setosa'], ['versicolor', 'virginica']]
3 [['setosa', 'versicolor'], ['virginica']]
4 [['setosa', 'virginica'], ['versicolor']]
