In [3]:
##### Task 1: Data Pre-processing

### 1. Import the dataset irismissing.csv into a data frame and find the row number of each instance that has missing values.

import pandas as pd
import numpy as np

# Load the iris dataset (the copy with missing entries) from irismissing.csv into a DataFrame.
irismissing = pd.read_csv('irismissing.csv')

# isnull() flags every missing cell as True; any(axis=1) reduces that to one flag per row,
# so selecting with the mask keeps only the rows holding at least one missing value.
null_data = irismissing.loc[irismissing.isnull().any(axis=1)]

# Show the index labels (row numbers) of the incomplete rows.
print(null_data.index.values)

[  3  10  19  27  31  32  46  57  62  82  90  96 101 118 128 131 137 144]


In [4]:
### 2. Write a program to drop missing values, and describe other two strategies (median, mean) for handling missing values and write a function to implement these strategies.

def fill_missing(frame, strategy):
    """Return a new DataFrame in which missing numeric values are imputed.

    Each numeric column's missing entries are replaced by that column's own
    median or mean. Non-numeric columns are excluded from the statistic
    (``numeric_only=True``) and are therefore left unchanged; without that
    flag recent pandas versions raise a TypeError on text columns.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data that may contain missing values. It is not modified.
    strategy : str
        'median' or 'mean'.

    Returns
    -------
    pandas.DataFrame
        A filled copy of ``frame``.

    Raises
    ------
    ValueError
        If ``strategy`` is neither 'median' nor 'mean'.
    """
    if strategy == 'median':
        fill_values = frame.median(numeric_only=True)
    elif strategy == 'mean':
        fill_values = frame.mean(numeric_only=True)
    else:
        raise ValueError("strategy must be 'median' or 'mean', got %r" % (strategy,))
    # fillna with a Series fills column by column, matching on column name.
    return frame.fillna(fill_values)

# Drop rows of missing values
strategy_dropNA = irismissing.dropna()
print(strategy_dropNA)

# Median: A new dataframe with median filling in missing values
strategy_median = fill_missing(irismissing, 'median')
print(strategy_median)
print(strategy_median.isnull().sum()) # Count the missing values left per column (numeric columns should show 0)

# Mean: A new dataframe with mean filling in missing values
strategy_mean = fill_missing(irismissing, 'mean')
print(strategy_mean)
print(strategy_mean.isnull().sum()) # Count the missing values left per column (numeric columns should show 0)

      Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm  \
0      1            5.1           3.5            1.4           0.2   
1      2            4.9           3.0            1.4           0.2   
2      3            4.7           3.2            1.3           0.2   
4      5            5.0           3.6            1.4           0.2   
5      6            5.4           3.9            1.7           0.4   
6      7            4.6           3.4            1.4           0.3   
7      8            5.0           3.4            1.5           0.2   
8      9            4.4           2.9            1.4           0.2   
9     10            4.9           3.1            1.5           0.1   
11    12            4.8           3.4            1.6           0.2   
12    13            4.8           3.0            1.4           0.1   
13    14            4.3           3.0            1.1           0.1   
14    15            5.8           4.0            1.2           0.2   
15    16            

In [5]:
### 3. Compare the results of applying each missing value strategy using some visualization method.

import matplotlib.pyplot as plt

# One SepalLengthCm series per strategy, in the order: drop rows, median fill, mean fill.
# Swap the column name ('SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm') to compare another feature;
# calling .describe() on each frame gives a numeric comparison instead.
boxdata1 = [
    frame['SepalLengthCm']
    for frame in (strategy_dropNA, strategy_median, strategy_mean)
]

# patch_artist=True is required so the boxes can be given a face colour below.
graph1 = plt.boxplot(
    boxdata1,
    patch_artist=True,
    labels=['Strategy 1','Strategy 2','Strategy 3'],
)
plt.title("Sepal length after applying three missing values strategies")
plt.ylabel("Sepal Length (in cm)")

# Give each strategy's box its own fill colour, in the same order as the data.
colors = ['pink', 'lightblue', 'lightgreen']
for box, fill in zip(graph1['boxes'], colors):
    box.set_facecolor(fill)

In [10]:
##### Task 2: Decision Trees

### 1. Manually generate the decision tree for the passenger survival dataset below. Use information gain as the split measure.

# 1. Class: Whole Entropy

import pandas as pd
import numpy as np
import math

# Load the passenger survival spreadsheet ('Passanger Survival.xlsx') into a pandas DataFrame.
# The helper functions below read its 'Freq' and 'Survived' columns.
dataPassangerSurvival = pd.read_excel(io='Passanger Survival.xlsx')

### ----------------------------------------------------------------------------------------------

## Functions
# Function to calculate the total frequency of a certain attribute
def calcFreq(attributeList):
    """Return the sum of all frequency values in ``attributeList``.

    Parameters
    ----------
    attributeList : iterable of numbers
        For example the 'Freq' column of a dataset.

    Returns
    -------
    number
        The total of the values; 0 for an empty iterable.
    """
    return sum(attributeList)

# Function to calculate probability of each type of attributes
def calcProb(dataset,A): # A is attribute (in string) from the dataset
    """Return the frequency-weighted probability of each distinct value of column ``A``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain column ``A`` and a numeric 'Freq' column holding the
        number of observations each row stands for.
    A : str
        Name of the attribute column.

    Returns
    -------
    list
        One probability per distinct value of ``A``, in the order produced by
        ``np.unique(dataset[A])``. Each entry is the 'Freq' total of the rows
        with that value divided by the 'Freq' total of the whole dataset.
        A total frequency of zero is not special-cased.
    """
    # The denominator is the same for every value, so compute it once.
    totalFreq = dataset['Freq'].sum()
    listAttrProb = []
    for x in np.unique(dataset[A]):
        # 'Freq' total of the rows where the attribute equals this value
        subFreq = dataset.loc[dataset[A] == x, 'Freq'].sum()
        listAttrProb.append(subFreq / totalFreq)
    return listAttrProb

# Function to calculate entropy, given a probability list from the output of the function calcProb(dataset)
def calcEnt(dataset,A):
    """Return the entropy (in bits) of column ``A`` of ``dataset``.

    The probabilities come from ``calcProb(dataset, A)`` and the result is
    ``-sum(p * log2(p))`` over them.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Data passed straight through to ``calcProb``.
    A : str
        Name of the column whose entropy is wanted.

    Returns
    -------
    float
        The entropy; 0.0 when no probability is positive.
    """
    ent = 0.0
    for prob in calcProb(dataset,A):
        # A value with zero total frequency has probability 0 and math.log(0, 2)
        # raises ValueError. By the usual convention 0 * log(0) counts as 0, so
        # such terms are skipped. The comparison is also False for NaN, so NaN
        # probabilities are skipped as well.
        if prob > 0:
            ent = ent - prob * math.log(prob, 2) # log base 2
    return ent

# Function to calculate information gain
def IG(dataset,A):
    """Return the information gain on 'Survived' obtained by splitting ``dataset`` on ``A``.

    Computed as E(S) - sum_i P_i * E(S_i), where S_i is the subset of rows in
    which attribute ``A`` takes its i-th distinct value, P_i is that value's
    probability from ``calcProb`` and E is the entropy of the 'Survived' column
    from ``calcEnt``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain column ``A`` and a 'Survived' column, plus whatever
        ``calcProb`` needs.
    A : str
        Name of the attribute to split on.

    Returns
    -------
    float
        The information gain of the split.
    """
    listAttrType = np.unique(dataset[A]) # distinct values of the attribute
    listProb = calcProb(dataset,A) # P1, P2, ..., Pn in the same order as listAttrType

    # Weighted entropy of the branches. Values and probabilities are paired by
    # position with zip; looking positions up with list.index would return the
    # first match and mis-pair branches that share an entropy or a probability.
    subEnt = 0.0
    for x, p in zip(listAttrType, listProb):
        subdataset = dataset[dataset[A]==x] # rows where "Attribute == value"
        subEnt = subEnt + p * calcEnt(subdataset, 'Survived')

    return calcEnt(dataset,'Survived') - subEnt # main entropy - sub entropy

### ----------------------------------------------------------------------------------------------

## Results of information gain for each attribute
# Printed in the order Class, Sex, Age. 'Sex' has the highest IG, so it becomes the root node.
for attribute in ('Class', 'Sex', 'Age'):
    print(IG(dataPassangerSurvival, attribute))

0.08122494499417288
0.19038696504629216
0.005225772887007096


In [17]:
# Second level of the tree: split the data on the 'Sex' attribute ...
dataMale = dataPassangerSurvival[dataPassangerSurvival['Sex']=='Male']
dataFemale = dataPassangerSurvival[dataPassangerSurvival['Sex']=='Female']

# ... then print the information gain of the remaining attributes inside each branch
# (male branch first, then female; 'Class' before 'Age' in both).
for branch in (dataMale, dataFemale):
    for attribute in ('Class', 'Age'):
        print(IG(branch, attribute))

0.02136387471797685
0.02395741115102923
0.2242493003235302
0.017751701417448862


In [None]:
##### Task 2.2: Tennis data

### 1. Write a function that computes the entropy of a set S with 𝑁𝑝𝑜𝑠 positive observations and 𝑁𝑛𝑒𝑔 negative observations.
