# Assignment 1
### Yu-Wen Huang 

### Declaration:

It is declared that it has been understood that no other library/package than the Python 3 standard library, NumPy and pandas may be used in the solution for this assignment.


## Load NumPy and pandas

In [1]:
import numpy as np
import pandas as pd

## 1a. Create and apply normalization

In [15]:
# Insert the functions create_normalization and apply_normalization below (after the comments)
#
# Input to create_normalization:
# df: a dataframe (where the column names "CLASS" and "ID" have special meaning)
# normalizationtype: "minmax" (default) or "zscore"
#
# Output from create_normalization:
# df: a new dataframe, where each numeric value in a column has been replaced by a normalized value
# normalization: a mapping (dictionary) from each column name to a triple, consisting of
#                ("minmax",min_value,max_value) or ("zscore",mean,std)
#
# Hint 1: First copy the input dataframe and modify the copy (the input dataframe should be kept unchanged)
# Hint 2: Consider columns of type "float" or "int" only (and which are not labeled "CLASS" or "ID"),
#         the other columns should remain unchanged
# Hint 3: Take a close look at the lecture slides on data preparation
def create_normalization(df, normalizationtype="minmax"):
    """
    Make normalization vectors based on training dataset
    """
    #Hint 1: Basic
    #copy the DataFrame first
    df_copy = df.copy()
    columns = df_copy.columns
    normalization = {}
    
    for i in columns:
        dtype = df_copy[i].dtype

        #Hint 2: Constratints handling
        #do not care about ID or CLASS
        if i == "ID" or i == "CLASS":
            continue
            
        #only care about int and float
        if not np.issubdtype(dtype, np.integer) and not np.issubdtype(dtype, np.floating):
            continue

        if normalizationtype == "minmax":
            min_value = np.min(df_copy[i])
            max_value = np.max(df_copy[i])
            df_copy[i] = df_copy[i].apply(lambda x: (x-min_value)/(max_value-min_value))
            normalization[i] = (normalizationtype, min_value, max_value)
            
        elif normalizationtype == "zscore":
            mean = df_copy[i].mean()
            std = df_copy[i].std()
            df_copy[i] = df_copy[i].apply(lambda x: (x-mean/std))
            normalization[i] = (normalizationtype, mean, std)
    
    return df_copy, normalization
    

# Input to apply_normalization:
# df: a dataframe
# normalization: a mapping (dictionary) from column names to triples (see above)
#
# Output from apply_normalization:
# df: a new dataframe, where each numerical value has been normalized according to the mapping
#
# Hint 1: First copy the input dataframe and modify the copy (the input dataframe should be kept unchanged)
# Hint 2: For minmax-normalization, you may consider to limit the output range to [0,1]
def apply_normalization(df, normalization):
    """
    Apply training information onto test dataset
    """
    #Hint 1: Basic
    #copy the DataFrame first
    df_copy = df.copy()
    
    #key: column name
    #val: ('method', number1, number2)
    for key, val in normalization.items():
        normalizationtype = val[0]
        
        #Hint 2: Constratints handling
        #do not care about ID or CLASS
        dtype = df_copy[key].dtype
        if key == "ID" or key == "CLASS":
            continue
            
        #only care about int and float
        if not np.issubdtype(dtype, np.integer) and not np.issubdtype(dtype, np.floating):
            continue
            
        if normalizationtype == "minmax":
            min_value = val[1]
            max_value = val[2]
            df_copy[key] = df_copy[key].apply(lambda x: (x-min_value)/(max_value-min_value))
            
            #Hint 2: apply strong constraint limit [0,1]
            #there are both way and both are working fine
            """
            df_copy.loc[df_copy[key] < 0, key] = 0 
            df_copy.loc[df_copy[key] > 1, key] = 1
            """
            df_copy[key] = df_copy[key].clip(0,1)
            
        elif normalizationtype == "zscore":
            mean = val[1]
            std = val[2]
            df_copy[key] = df_copy[key].apply(lambda x: (x-mean/std))
            
    return df_copy

In [16]:
# Test your code (leave this part unchanged)

glass_train_df = pd.read_csv("glass_train.txt")

glass_test_df = pd.read_csv("glass_test.txt")

glass_train_norm, normalization = create_normalization(glass_train_df,normalizationtype="minmax")
print("normalization:\n")
for f in normalization:
    print("{}:{}".format(f,normalization[f]))

glass_test_norm = apply_normalization(glass_test_df,normalization)
print("\nglass_test_norm:\n")
glass_test_norm

normalization:

RI:('minmax', 1.51131, 1.53125)
Na:('minmax', 10.73, 15.79)
Mg:('minmax', 0.0, 4.49)
Al:('minmax', 0.29, 3.04)
Si:('minmax', 69.81, 75.18)
K:('minmax', 0.0, 6.21)
Ca:('minmax', 5.43, 14.68)
Ba:('minmax', 0.0, 3.15)
Fe:('minmax', 0.0, 0.37)

glass_test_norm:



Unnamed: 0,ID,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,CLASS
0,101,0.262788,0.399209,0.634744,0.418182,0.644320,0.091787,0.363243,0.034921,0.594595,2
1,104,0.799398,0.606719,0.701559,0.134545,0.141527,0.012882,0.671351,0.000000,0.000000,2
2,44,0.541123,0.592885,0.855234,0.156364,0.363128,0.027375,0.465946,0.000000,0.000000,1
3,17,0.327482,0.385375,0.817372,0.316364,0.614525,0.098229,0.353514,0.000000,0.000000,1
4,81,0.231194,0.420949,0.783964,0.665455,0.530726,0.111111,0.274595,0.000000,0.000000,2
5,142,0.361083,0.488142,0.808463,0.283636,0.562384,0.091787,0.322162,0.028571,0.459459,2
6,120,0.261284,0.559289,0.795100,0.429091,0.491620,0.103060,0.273514,0.000000,0.000000,2
7,123,0.278837,0.494071,0.788419,0.432727,0.564246,0.090177,0.288649,0.000000,0.000000,2
8,133,0.342026,0.533597,0.886414,0.323636,0.499069,0.093398,0.294054,0.000000,0.000000,2
9,185,0.000000,1.000000,0.000000,0.018182,1.000000,0.000000,0.131892,0.000000,0.000000,6


### Comment on assumptions, things that do not work properly, etc.


Everything works fine (Answer is correct and apply all constraints).

- constraint 1: float and int<br>
  -> because there are many type of int and float (int16~64, float32~128), we tried to apply np.issubdtype to check int, and np.dtype to check float (to apply various way). It is a trick to check all float dtype in numpy <br>
  ```python
  np.issubdtype(dtype, np.integer)
  np.issubdtype(dtype, np.floating)
  ```
- constraint 2: strict range [0,1] on the test set<br>
  -> we tried two way: apply manually and use np.clip and both worked fine

## 1b. Create and apply imputation

In [17]:
# Insert the functions create_imputation and apply_imputation below (after the comments)
#
# Input to create_imputation:
# df: a dataframe (where the column names "CLASS" and "ID" have special meaning)
#
# Output from create_imputation:
# df: a new dataframe, where each missing numeric value in a column has been replaced by the mean of that column 
#     and each missing categoric value in a column has been replaced by the mode of that column
# imputation: a mapping (dictionary) from column name to value that has replaced missing values
#
# Hint 1: First copy the input dataframe and modify the copy (the input dataframe should be kept unchanged)
# Hint 2: Handle columns of type "float" or "int" only (and which are not labeled "CLASS" or "ID") in one way
#         and columns of type "object" and "category" in other ways
# Hint 3: Consider using the pandas functions mean() and mode() respectively, as well as fillna
# Hint 4: In the rare case of all values in a column being missing, replace numeric values with 0,
#         object values with "" and category values with the first category (cat.categories[0])  
#

def create_imputation(df):
    #Hint 1: Basic
    #copy the DataFrame first
    df_copy = df.copy()
    columns = df_copy.columns
    imputation = {}
    
    for i in columns:
        dtype = df_copy[i].dtype

        #Hint 2: Constratints handling
        #do not care about ID or CLASS
        if i == "ID" or i == "CLASS":
            continue
        #Case 1: continuous -> use mean
        if np.issubdtype(dtype, np.integer) or np.issubdtype(dtype, np.floating):
            #Special case: all values are missing
            if np.all(df_copy[i].isnull()):
                criteria = 0
            #regular case
            else:
                criteria = df_copy[i].mean()
        #case 2: categorical -> use mode
        elif hasattr(df_copy[i], 'cat'):
            #Special case: all values are missing
            if np.all(df_copy[i].isnull()):
                criteria = df_copy[i].cat.categories[0]
            #regular case
            else:
                print(df_copy[i][df_copy[i].notnull()])
                criteria = df_copy[i].mode()[0] #always return series
        #case 3: object case -> cannot apply .cat -> use "" when all are missing
        #not sure about dtype == "object" or else
        elif dtype == "object":
            #Special case: all values are missing
            if np.all(df_copy[i].isnull()):
                criteria = ""
            #regular case
            else:
                criteria = df_copy[i].mode()[0] #always return series
        #except object, categorical, numerical -> but there is no case when we load a file
        else:
            print(dtype)

        #apply criteria (use fillna)
        df_copy[i] = df_copy[i].fillna(criteria)
        #add value into imputation dictionary
        imputation[i] = criteria
    
    return df_copy, imputation
        
        
# Input to apply_imputation:
# df: a dataframe
# imputation: a mapping (dictionary) from column name to value that should replace missing values
#
# Output from apply_imputation:
# df: a new dataframe, where each missing value has been replaced according to the mapping
#
# Hint 1: First copy the input dataframe and modify the copy (the input dataframe should be kept unchanged)
# Hint 2: Consider using fillna
def apply_imputation(df, imputation):
    #Hint 1: Basic
    #copy the DataFrame first
    df_copy = df.copy()
    
    #key: column name
    #val: imputation value
    for key, val in imputation.items():
        
        #Hint 2: Constratints handling
        #do not care about ID or CLASS (safe check when applying!)
        if key == "ID" or key == "CLASS": 
            continue
            
        criteria = val
        df_copy[key] = df_copy[key].fillna(criteria)
            
    return df_copy


In [18]:
# Test your code (leave this part unchanged)

anneal_train_df = pd.read_csv("anneal_train.txt")
anneal_test_df = pd.read_csv("anneal_test.txt")

anneal_train_imp, imputation = create_imputation(anneal_train_df)
anneal_test_imp = apply_imputation(anneal_test_df,imputation)

print("Imputation:\n")
for f in imputation:
    print("{}:{}".format(f,imputation[f]))

print("\nNo. of replaced missing values in training data:\n{}".format(anneal_train_imp.count()-anneal_train_df.count()))
print("\nNo. of replaced missing values in test data:\n{}".format(anneal_test_imp.count()-anneal_test_df.count()))



Imputation:

family:TN
product-type:C
steel:A
carbon:3.859688195991091
hardness:13.084632516703786
temper_rolling:T
condition:S
formability:2.2517482517482517
strength:26.302895322939868
non-ageing:N
surface-finish:P
surface-quality:E
enamelability:1.7142857142857142
bc:Y
bf:Y
bt:Y
bw/me:B
bl:Y
m:0
chrom:C
phos:P
cbond:Y
marvi:0
exptl:0
ferro:Y
corr:0
blue-bright-varn-clean:B
lustre:Y
jurofm:0
s:0
p:0
shape:SHEET
thick:1.1911937639198227
width:769.4917594654789
len:1229.293986636971
oil:Y
bore:35.18930957683742
packing:3.0

No. of replaced missing values in training data:
family                    382
product-type                0
steel                      43
carbon                      0
hardness                    0
temper_rolling            374
condition                 160
formability               163
strength                    0
non-ageing                391
surface-finish            444
surface-quality           128
enamelability             442
bc                        448
b

### Comment on assumptions, things that do not work properly, etc.

Everything works fine (Answer is correct and apply all constraints).

- constraint 1: handle separately regarding its dtype (cat, obj <-> int, float)
 - for empty and nan values: use mean (numerical) and mode (obj), first category (cat)
 - when we load a data, actually there is no possibility to be category when we do not set it manually using astype (all the string values are regarded object), but we made a condition on category following description because it will be used in further assignments
- constraint 2: checking NaN or Null value -> use isnull / apply value -> fillna
 - We used np.fillna to fill empty value and used np.isnull to check nan/null value because np.isnan is only applicable onto float column

## 1c. Create and apply discretization

In [19]:
# Insert the functions create_bins and apply_bins below
#
# Input to create_bins:
# df: a dataframe
# nobins: no. of bins (default = 10)
# bintype: either "equal-width" (default) or "equal-size" 
#
# Output from create_bins:
# df: a new dataframe, where each numeric feature value has been replaced by a categoric (corresponding to some bin)
# binning: a mapping (dictionary) from column name to bins (threshold values for the bin)
#
# Hint 1: First copy the input dataframe and modify the copy (the input dataframe should be kept unchanged)
# Hint 2: Discretize columns of type "float" or "int" only (and which are not labeled "CLASS" or "ID")
# Hint 3: Consider using pd.cut and pd.qcut respectively, with labels=False, retbins=True and duplicates="drop"
#         (the last option will avoid errors when not enough bins can be created)
# Hint 4: Set all columns in the new dataframe to be of type "category"
# Hint 5: Set the categories of the discretized features to be [0,...,nobins-1]
# Hint 6: Change the first and the last element of each binning to -np.inf and np.inf respectively 
def create_bins(df, nobins=10, bintype="equal-width"):
    #Hint 1: Basic
    #copy the DataFrame first
    df_copy = df.copy()
    columns = df_copy.columns
    binning = {}
    
    for i in columns:
        dtype = df_copy[i].dtype

        #Hint 2: Constratints handling
        #do not care about ID or CLASS
        if i == "ID" or i == "CLASS":
            continue
        #only care about int and float
        if not np.issubdtype(dtype, np.integer) and not np.issubdtype(dtype, np.floating):
            continue
        
        #Hint 3 - Case 1: equal width -> cut
        if bintype == "equal-width":
            res, bins = pd.cut(df_copy[i], bins=nobins, labels=False, retbins=True, duplicates="drop")
        #Hint 3 - Case 2: equal size -> qcut
        elif bintype == "equal-size":
            res, bins = pd.qcut(df_copy[i], q=nobins, labels=False, retbins=True, duplicates="drop")
            
        #apply res
        df_copy[i] = res
        
        #Hint 4 - Set column to be of type "category"
        df_copy[i] = df_copy[i].astype("category")
        
        #Hint 5 - set the categories as a number of bins
        df_copy[i] = df_copy[i].cat.set_categories(list(range(len(bins))))
        
        #Hint 6 - set first and last value
        bins[0] = -np.inf
        bins[-1] = np.inf
        
        #set bins on the output
        binning[i] = bins
    
    return df_copy, binning

# Input to apply_bins:
# df: a dataframe
# binning: a mapping (dictionary) from column name to bins (threshold values for the bin)
#
# Output from apply_bins:
# df: a new dataframe, where each numeric feature value has been replaced by a categoric (corresponding to some bin)
#
# Hint 1: First copy the input dataframe and modify the copy (the input dataframe should be kept unchanged)
# Hint 2: Consider using pd.cut 
# Hint 3: Set all columns in the new dataframe to be of type "category"
# Hint 4: Set the categories of the discretized features to be [0,...,nobins-1]
#
def apply_bins(df, binning):
    #Hint 1: Basic
    #copy the DataFrame first
    df_copy = df.copy()
    
    for key, val in binning.items():
        dtype = df_copy[key].dtype
        #Hint 2: Constratints handling
        
        #do not care about ID or CLASS (safe check when applying!)
        if key == "ID" or key == "CLASS":
            continue
        #only care about int and float
        if not np.issubdtype(dtype, np.integer) and not np.issubdtype(dtype, np.floating):
            continue
        
        #Hint 2
        res = pd.cut(df_copy[key], bins=val, labels=False, duplicates="drop")
        df_copy[key] = res
        
        #Hint 3 - Set column to be of type "category"
        df_copy[key] = df_copy[key].astype("category")

        #Hint 4 - set the categories as a number of nobins
        df_copy[key] = df_copy[key].cat.set_categories(list(range(len(val))))
        
    return df_copy


In [20]:
# Test your code  (leave this part unchanged)

glass_train_df = pd.read_csv("glass_train.txt")

glass_test_df = pd.read_csv("glass_test.txt")

glass_train_disc, binning = create_bins(glass_train_df,nobins=10,bintype="equal-size")
print("binning:\n")
for f in binning:
    print("{}:{}".format(f,binning[f]))

glass_test_disc = apply_bins(glass_test_df,binning)
print("\nglass_test_disc:\n")
glass_test_disc


binning:

RI:[    -inf 1.515896 1.51618  1.516516 1.516866 1.51753  1.517902 1.518618
 1.520114 1.521846      inf]
Na:[  -inf 12.73  12.872 13.    13.222 13.38  13.492 13.794 14.198 14.82
    inf]
Mg:[ -inf 1.82  3.188 3.41  3.476 3.55  3.61  3.728   inf]
Al:[ -inf 0.906 1.172 1.23  1.348 1.48  1.54  1.622 1.808 2.094   inf]
Si:[  -inf 71.756 72.196 72.388 72.72  72.79  72.966 73.06  73.208 73.372
    inf]
K:[ -inf 0.006 0.148 0.39  0.54  0.576 0.6   0.636 0.67    inf]
Ca:[  -inf  7.978  8.112  8.338  8.554  8.67   8.81   9.032  9.674 10.924
    inf]
Ba:[-inf 0.78  inf]
Fe:[ -inf 0.062 0.118 0.24    inf]

glass_test_disc:



Unnamed: 0,ID,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,CLASS
0,101,3,1,1,4,8,4,5,0,2,2
1,104,9,7,1,0,0,1,9,0,0,2
2,44,9,6,7,0,1,2,8,0,0,1
3,17,5,0,6,1,7,6,5,0,0,1
4,81,1,1,4,9,3,8,0,0,0,2
5,142,6,3,6,1,5,4,3,0,2,2
6,120,3,6,5,4,3,7,0,0,0,2
7,123,4,4,4,4,5,4,1,0,0,2
8,133,6,5,7,2,3,5,2,0,0,2
9,185,0,9,0,0,9,0,0,0,0,6


### Comment on assumptions, things that do not work properly, etc.
Everything works fine (Answer is correct and apply all constraints).

- constraint 1: apply categorical value for each columns
 - labels=False will give integer indicator range.
 - we use additionally cat.set_categories to set categories with [0,...,nobins-1] to make it sure
- constraint 2: use qcut and cut differently
 - we set default value as "equal-width" and when equal-width, we use cut, otherwise, we use qcut

## 1d. Divide a dataset into a training and a test set

In [21]:
# Insert the function split below
#
# Input to split:
# df: a dataframe
# testfraction: a float in the range (0,1) (default = 0.5)
# Output from split:
# trainingdf: a dataframe consisting of a random sample of (1-testfraction) of the rows in df
# testdf: a dataframe consisting of the rows in df that are not included in trainingdf
#
# Hint: You may use np.random.permutation(df.index) to get a permuted list of indexes where a 
#       prefix corresponds to the test instances, and the suffix to the training instances 
def split(df, testfraction = 0.5):
    perm = np.random.permutation(df.index)
    
    #get range for spliting based on (1-testfraction)
    ran = np.int(np.round(len(perm)*(1-testfraction)))
    
    #use same ran in the end of training and start of test so that they cannot be duplicated
    trainingID = perm[:ran]
    testID = perm[ran:]
    
    trainingdf = df.iloc[trainingID]
    testdf = df.iloc[testID]

    return trainingdf, testdf

#### Our answer (to compare, it applied permutation)

In [22]:
# Test your code  (leave this part unchanged)

glass_df = pd.read_csv("glass.txt")

glass_train, glass_test = split(glass_df,testfraction=0.25)

print("Training IDs:\n{}".format(glass_train["ID"].values))

print("\nTest IDs:\n{}".format(glass_test["ID"].values))

print("\nOverlap: {}".format(set(glass_train["ID"]).intersection(set(glass_test["ID"]))))


Training IDs:
[198 102  42 210 117 183  70 157  84 151  52   2  55  38 202  57  37  69
  86 177  96 189 209 208 142 128  43   1  11  66  82 146 124 119  60  12
  92  22 211  54 171  44  64  99 176 110 160  81 132 185 212 159  76 125
  16 186 135  53  63 139 137 145 116 200  87 133 178 106  35 112   4 190
 140 162 180 184 100 199  75 144  41  10  28 150  19 114 196 191 123 168
  98  47  85   8  39  20 201 192 172 130  71 193  27 134  21 113 206  65
 105 161  90 115   6  59   3  33 166 111 155 207  68 164  88  97 179 195
  95  77  89 153  67  34  80 131 147 197 175  74  49 205  31 108  79  62
 169 214  46 182  32 138 174 165  51 187  26 118 121 154  36 170]

Test IDs:
[ 14 149 167  40  61 103  30  58 122 126  23 136   5  50 204 203  56  91
   7   9  45 213 101  93  48 109 129  72  29 156 104  17 163 107 173  25
 127 141 194  78  15  24 120 188  83 158  94 143 148 181  73 152  18  13]

Overlap: set()


#### Henrik's answer (to compare, it did not apply permutation)

In [23]:
# Test your code  (leave this part unchanged)

glass_df = pd.read_csv("glass.txt")

glass_train, glass_test = split(glass_df,testfraction=0.25)

print("Training IDs:\n{}".format(glass_train["ID"].values))

print("\nTest IDs:\n{}".format(glass_test["ID"].values))

print("\nOverlap: {}".format(set(glass_train["ID"]).intersection(set(glass_test["ID"]))))


Training IDs:
[184 193 145 199 108  21 196 155  71 100 200 179  73  82 175 204  14  39
  70 132  92 141 106  79 191 189  50  36  28  32   2 151 156  33  13  72
 142 118  43   5   4  96 195  99 198  23 122  75 140  53  30 120  31 159
  86  67 202  18  56  66 203 167 148  63 173   7  68 176 121 194  64  54
  10  24 187 183  97 185 149 114  91 133 162   8  52 128  19 171  77 152
  42 161 134  57  83 150 110  80  11 205 107 144 201  29  74  44 136 186
 211  35   6 164  65  26  46  25   9 143  12 177 158  41 135  89 190 116
  85 115  81 146  27 170 213 181  78 153  16 111  15  51 165  84 109  88
 188  40 139  20 126 102 119 125 214 104  38 154 127   3  94  90]

Test IDs:
[209 160 137 105 168  59 206 212 131 113 103  49 157 130  62  61 124 178
  95 123  76 210 117  47  93 138  45  37 182  17  48  69   1 197 174  55
 129  22  34 163 166  87 192 147 208 169 172 112 180  58  98  60 101 207]

Overlap: set()


### Comment on assumptions, things that do not work properly, etc.
Everything works fine (Answer is correct and apply all constraints).
- Henrik's answer does not apply permutation so these are not ramdonly distributed
- we use permutation function before spliting so the numbers are randomly distributed
 * We put both answers here to compare
- but the number of training and test rows are same (54 in test and others in training)

## 1e. Calculate accuracy of a set of predictions

In [24]:
# Insert the function accuracy below
#
# Input to accuracy:
# df: a dataframe with class labels as column names and each row corresponding to
#     a prediction with estimated probabilities for each class
# correctlabels: an array (or list) of the correct class label for each prediction
#                (the number of correct labels must equal the number of rows in df)
#
# Output from accuracy:
# accuracy: the fraction of cases for which the predicted class label coincides with the correct label
#
# Hint: In case the label receiving the highest probability is not unique, you may
#       resolve that by picking the first (as ordered by the column names) or 
#       by randomly selecting one of the labels with highest probaility.

def accuracy(df, correctlabels):
    df_copy = df.copy()

    pred = np.empty(len(correctlabels), dtype = str)
    df_max = df_copy.max(axis=1) #find the highest value in each row to compare later

    for i in range(len(df)):
        df_tmp = df_copy.iloc[i:i+1]
        for col in df_tmp.columns:
            if(df_tmp[col] >= df_max[i:i+1]).bool(): 
                pred[i] = col
                """
                if break enabled, will pick the first option, 
                else, will leave the last option that equals the highest value, 
                can be randomized with an if and random function
                """
                #1. random mode
                if np.random.choice([True, False]): break 
                
                #2. picking first one mode
                #break
                
    numbercorrect = np.sum(np.array(correctlabels) == pred)
    
    return numbercorrect/len(correctlabels)

In [35]:
# Test your code  (leave this part unchanged)

predictions = pd.DataFrame({"A":[0.5,0.5,0.5,0.25,0.25],"B":[0.5,0.25,0.25,0.5,0.25],"C":[0.0,0.25,0.25,0.25,0.5]})
predictions


Unnamed: 0,A,B,C
0,0.5,0.5,0.0
1,0.5,0.25,0.25
2,0.5,0.25,0.25
3,0.25,0.5,0.25
4,0.25,0.25,0.5


In [36]:
correctlabels = ["B","A","B","B","C"]

accuracy(predictions,correctlabels) # Note that depending on how ties are resolved the accuracy may be 0.6 or 0.8

0.6

## 2a. Create and apply one-hot encoding

In [37]:
# Insert the functions create_one_hot and apply_one_hot below
#
# Input to create_one_hot:
# df: a dataframe
#
# Output from create_one_hot:
# df: a new dataframe, where each categoric feature has been replaced by a set of binary features 
#    (as many new features as there are possible values)
# one_hot: a mapping (dictionary) from column name to a set of categories (possible values for the feature)
#
# Hint 1: First copy the input dataframe and modify the copy (the input dataframe should be kept unchanged)
# Hint 2: Consider columns of type "object" or "category" only (and which are not labeled "CLASS" or "ID")
# Hint 3: Consider creating new column names by merging the original column name and the categorical value
# Hint 4: Set all new columns to be of type "float"
# Hint 5: Do not forget to remove the original categoric feature

def create_one_hot(df):
    #Hint 1: Basic
    #copy the DataFrame first
    df_copy = df.copy()
    columns = df_copy.columns
    output = {}
    
    for i in columns:
        dtype = df_copy[i].dtype

        #Hint 2: Constratints handling
        #do not care about ID or CLASS
        if i == "ID" or i == "CLASS":
            continue
            
        #we only need to care about object and categorical values
        #but practically when we load a file, there is no categorical value
        #Hint 2 - Case 1: object
        if dtype == "object":
            #change the type into category to make one hot easier
            df_copy[i] = df_copy[i].astype("category")
        #Hint 2 - Case 2: category
        if hasattr(df_copy[i], 'cat'):
            cats = df_copy[i].cat.categories
            for cat in cats:
                #make new column and make type as float
                df_copy[i+'-'+cat] = (df_copy[i] == cat).astype("float")
            #delete original column
            df_copy.drop(i, axis=1, inplace=True)
            output[i] = cats
    
    return df_copy, output

# Input to apply_one_hot:
# df: a dataframe
# one_hot: a mapping (dictionary) from column name to categories
#
# Output from apply_one_hot:
# df: a new dataframe, where each categoric feature has been replaced by a set of binary features
#
# Hint: See the above Hints

def apply_one_hot(df, one_hot):
    #Hint 1: Basic
    #copy the DataFrame first
    df_copy = df.copy()
    columns = df_copy.columns
    
    for i, cats in one_hot.items():
        dtype = df_copy[i].dtype

        #Hint 2: Constratints handling
        #do not care about ID or CLASS
        if i == "ID" or i == "CLASS":
            continue
            
        #Hint 2 - Case 1: category
        if dtype == "object":
            #change the type into category to make one hot easier
            df_copy[i] = df_copy[i].astype("category")
        if hasattr(df_copy[i], 'cat'):
            for cat in cats:
                #make new column and make type as float
                df_copy[i+'-'+cat] = (df_copy[i] == cat).astype("float")
            #delete original column
            df_copy.drop(i, axis=1, inplace=True)
    
    return df_copy

#### Our answer (to compare, there is top-left-square-b)

In [38]:
# Test your code  (leave this part unchanged)

tictactoe = pd.read_csv("tic-tac-toe.txt")

train_df, test_df = split(tictactoe) # Using your above function

new_train, one_hot = create_one_hot(train_df)

new_test = apply_one_hot(test_df,one_hot)
new_test

Unnamed: 0,CLASS,top-left-square-b,top-left-square-o,top-left-square-x,top-middle-square-b,top-middle-square-o,top-middle-square-x,top-right-square-b,top-right-square-o,top-right-square-x,...,middle-right-square-x,bottom-left-square-b,bottom-left-square-o,bottom-left-square-x,bottom-middle-square-b,bottom-middle-square-o,bottom-middle-square-x,bottom-right-square-b,bottom-right-square-o,bottom-right-square-x
296,positive,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
664,negative,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
796,negative,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
775,negative,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
380,positive,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
499,positive,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
432,positive,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
766,negative,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
7,positive,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
882,negative,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


#### Henrik's answer (to compare, there is no top-left-square-b)

In [39]:
# Test your code  (leave this part unchanged)

tictactoe = pd.read_csv("tic-tac-toe.txt")

train_df, test_df = split(tictactoe) # Using your above function

new_train, one_hot = create_one_hot(train_df)

new_test = apply_one_hot(test_df,one_hot)
new_test

Unnamed: 0,CLASS,top-left-square-b,top-left-square-o,top-left-square-x,top-middle-square-b,top-middle-square-o,top-middle-square-x,top-right-square-b,top-right-square-o,top-right-square-x,...,middle-right-square-x,bottom-left-square-b,bottom-left-square-o,bottom-left-square-x,bottom-middle-square-b,bottom-middle-square-o,bottom-middle-square-x,bottom-right-square-b,bottom-right-square-o,bottom-right-square-x
407,positive,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
186,positive,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
11,positive,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
862,negative,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
726,negative,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
357,positive,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
312,positive,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
679,negative,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
566,positive,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
285,positive,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0


### Comment on assumptions, things that do not work properly, etc.
Everything works fine (Answer is correct and apply all constraints).

- In the answer sheet, there is no column called top-left-square-b. It is missing on professor's test set. 
- But in our answer we have that column (we tested seperately using 3 different code for each student but there is always top-left-square-b). 
- So our answer has one more column than Henrik's one, however it does not mean it is incorrect.


## 2b. Divide a dataset into a number of folds

In [40]:
# Insert the function folds below
#
# Input to folds:
# df: a dataframe
# nofolds: an integer greater than 1 (default = 10)
#
# Output from folds:
# folds: a list (of length = nofolds) dataframes consisting of random non-overlapping, 
#        approximately equal-sized subsets of the rows in df
#
# Hint: You may use np.random.permutation(df.index) to get a permuted list of indexes from which a 
#       prefix corresponds to the test instances, and the suffix to the training instances 

def folds(df, nofolds=10):
    idex = np.random.permutation(df.index)
    fract = np.int(np.round(len(idex)/nofolds))
    afterfolds = [df[df.index.isin(idex[fract*i:fract*(i+1)])] for i in range(nofolds)]
    return afterfolds

In [41]:
# Test your code  (leave this part unchanged)

glass_df = pd.read_csv("glass.txt")

glass_folds = folds(glass_df,nofolds=5)

fold_sizes = [len(f) for f in glass_folds]

print("Fold sizes:{}\nTotal no. instances: {}".format(fold_sizes,sum(fold_sizes)))

Fold sizes:[43, 43, 43, 43, 42]
Total no. instances: 214


### Comment on assumptions, things that do not work properly, etc.
Everything works fine (Answer is correct and apply all constraints).
- there was no any criteria applying ceil or round, we tested both but the result was same
- now in our code, we use np.round() and it worked fine


## 2c. Calculate Brier score of a set of predictions

In [42]:
# Insert the function brier_score below
#
# Input to brier_score:
# df: a dataframe with class labels as column names and each row corresponding to
#     a prediction with estimated probabilities for each class
# correctlabels: an array (or list) of the correct class label for each prediction
#                (the number of correct labels must equal the number of rows in df)
#
# Output from brier_score:
# brier_score: the average square error of the predicted probabilties 
#
# Hint: Compare each predicted vector to a vector for each correct label, which is all zeros except 
#       for at the index of the correct class. The index can be found using np.where(df.columns==l)[0] 
#       where l is the correct label.

def brier_score(df, correctlabels):
    #setting dictionary
    correctdict = {}
    brier_score = 0
    
    for i in correctlabels:
        if i not in correctdict.keys():
            correctdict[i] = [0]
    
    correctdict = pd.DataFrame(correctdict)
    print(correctdict)

    #loop number of samples
    for cnt, val in enumerate(correctlabels):
        print(cnt,val)
        #initialize 0 again
        correctdict.iloc[0] = 0
        #only correct one goes to 1
        correctdict[val] = 1
        #get single row in a prediction
        row = df.iloc[cnt]
        #calculate score
        score = np.sum(np.square(row-correctdict), axis=1)
        brier_score += score
    
    brier_score /= len(correctlabels)
    return brier_score[0]

In [43]:
# Test your code  (leave this part unchanged)

predictions = pd.DataFrame({"A":[0.5,0.5,0.5,0.25,0.25],"B":[0.5,0.25,0.25,0.5,0.25],"C":[0.0,0.25,0.25,0.25,0.5]})

correctlabels = ["B","A","B","B","C"]

brier_score(predictions,correctlabels)

   B  A  C
0  0  0  0
0 B
1 A
2 B
3 B
4 C


0.5

### Comment on assumptions, things that do not work properly, etc.

Everything works fine (Answer is correct and apply all constraints).


## 2d. Calculate AUC of a set of predictions

In [44]:
# Insert the function auc below
#
# Input to auc:
# df: a dataframe with class labels as column names and each row corresponding to
#     a prediction with estimated probabilities for each class
# correctlabels: an array (or list) of the correct class label for each prediction
#                (the number of correct labels must equal the number of rows in df)
#
# Output from auc:
# auc: the weighted area under ROC curve
#
# Hint 1: Calculate the binary AUC first for each class label c, i.e., treating the
#         predicted probability of this class for each instance as a score; the true positives
#         are the ones belonging to class c and the false positives the rest
# Hint 2: When calculating the binary AUC, first find the scores of the true positives and then
#         the scores of the true negatives
# Hint 3: You may use a dictionary with a mapping from each score to an array of two numbers; 
#         the number of true positives with this score and the number of true negatives with this score
# Hint 4: Created a (reversely) sorted (on the scores) list of pairs from the dictionary and
#         iterate over this to additively calculate the AUC
# Hint 5: For each pair in the above list, there are three cases to consider; the no. of false positives
#         is zero, the no. of true positives is zero, and both are non-zero
# Hint 6: Calculate the weighted AUC by summing the individual AUCs weighted by the relative
#         frequency of each class (as estimated from the correct labels)

import collections

def auc(df, correctlabels):
    #make labels
    labels = set(correctlabels)
    print(labels)
    counts = [] #for weighted sum (count of class in true population)
    aucs = [] #auc score for each count
    
    #calculate TP/FP for each label
    for label in labels:
        #make scores 
        tot_tp = 0
        tot_fp = 0
        scores = {}
        
        for idx, val in enumerate(correctlabels):
            if df.iloc[idx][label] not in scores.keys():
                scores[df.iloc[idx][label]] = [0, 0]
            if val == label:
                scores[df.iloc[idx][label]][0] += 1
                tot_tp += 1
            else:
                scores[df.iloc[idx][label]][1] += 1
                tot_fp += 1
         
        #Descending sort by its score
        print(scores)
        
        scores = collections.OrderedDict(sorted(scores.items(), reverse=True))
        
        print(scores)
        #GET AUC score
        auc_sub = 0
        cov_tp = 0
        
        
        for key, val in scores.items():
            tp_rate = val[0]
            fp_rate = val[1]
            
            print(tp_rate)
            print(fp_rate)

            if fp_rate == 0:
                cov_tp += tp_rate
            elif tp_rate == 0:
                auc_sub += (cov_tp/tot_tp) * (fp_rate/tot_fp)
            else:
                auc_sub += (cov_tp/tot_tp)*(fp_rate/tot_fp) + ((tp_rate/tot_tp)*(fp_rate/tot_fp))/2
                cov_tp += tp_rate
        
        #apply proportion
        counts.append(correctlabels.count(label))
        aucs.append(auc_sub)
        
    auc = np.array(aucs).dot(np.array(counts))/len(correctlabels)
    
    print(len(correctlabels))

    return auc

In [45]:
# Test your code  (leave this part unchanged)

predictions = pd.DataFrame({"A":[0.9,0.9,0.6,0.55],"B":[0.1,0.1,0.4,0.45]})
correctlabels = ["A","B","B","A"]

auc(predictions,correctlabels)

{'A', 'B'}
{0.9: [1, 1], 0.6: [0, 1], 0.55: [1, 0]}
OrderedDict([(0.9, [1, 1]), (0.6, [0, 1]), (0.55, [1, 0])])
1
1
0
1
1
0
{0.1: [1, 1], 0.4: [1, 0], 0.45: [0, 1]}
OrderedDict([(0.45, [0, 1]), (0.4, [1, 0]), (0.1, [1, 1])])
0
1
1
0
1
1
4


0.375

In [46]:
predictions = pd.DataFrame({"A":[0.5,0.5,0.5,0.25,0.25],"B":[0.5,0.25,0.25,0.5,0.25],"C":[0.0,0.25,0.25,0.25,0.5]})

correctlabels = ["B","A","B","B","C"]

auc(predictions,correctlabels)

{'A', 'B', 'C'}
{0.5: [1, 2], 0.25: [0, 2]}
OrderedDict([(0.5, [1, 2]), (0.25, [0, 2])])
1
2
0
2
{0.5: [2, 0], 0.25: [1, 2]}
OrderedDict([(0.5, [2, 0]), (0.25, [1, 2])])
2
0
1
2
{0.0: [0, 1], 0.25: [0, 3], 0.5: [1, 0]}
OrderedDict([(0.5, [1, 0]), (0.25, [0, 3]), (0.0, [0, 1])])
1
0
0
3
0
1
5


0.85

In [47]:
import matplotlib.pyplot as plt
pos = [1,1,1,1,0,1,0,0]
neg = [0,0,1,0,1,0,2,1]
tpr = [cs/sum(pos) for cs in np.cumsum(pos)]
fpr = [cs/sum(neg) for cs in np.cumsum(neg)]
plt.plot([0.0]+fpr+[1.0],[0.0]+tpr+[1.0],"-",label="1")
plt.plot([0.0,1.0],[0.0,1.0],"--",label="Baseline")
plt.xlabel("fpr")
plt.ylabel("tpr")
plt.legend()
plt.show()

<Figure size 640x480 with 1 Axes>