In [103]:
# 1a
# Input to create_column_filter:
# df - a dataframe (where the column names "CLASS" and "ID" have special meaning)
#
# Output from create_filter:
# df            - a new dataframe, where columns, except "CLASS" and "ID", containing only missing values 
#                 or only one unique value (apart from the missing values) have been dropped
# column_filter - a list of the names of the remaining columns, including "CLASS" and "ID"

In [104]:
import pandas as pd
import numpy as np

In [105]:
df = pd.DataFrame({"CLASS":[1,0,1,0,1],"A":[1,2,np.nan,4,5],"B":[1,1,1,1,np.nan],"C":["h","h",np.nan,"i","h"],"D":[np.nan,np.nan,np.nan,np.nan,np.nan]})
df

Unnamed: 0,CLASS,A,B,C,D
0,1,1.0,1.0,h,
1,0,2.0,1.0,h,
2,1,,1.0,,
3,0,4.0,1.0,i,
4,1,5.0,,h,


In [106]:
test = df.loc[:,'A']
test

0    1.0
1    2.0
2    NaN
3    4.0
4    5.0
Name: A, dtype: float64

In [107]:
for item in df.columns:
    print(item)

CLASS
A
B
C
D


In [108]:
df = pd.DataFrame({"CLASS":[1,0,1,0,1],"A":[1,2,np.nan,4,5],"B":[1,1,1,1,np.nan],"C":["h","h",np.nan,"i","h"],"D":[np.nan,np.nan,np.nan,np.nan,np.nan],"E":[1,np.nan,np.nan,np.nan,np.nan]})
df

Unnamed: 0,CLASS,A,B,C,D,E
0,1,1.0,1.0,h,,1.0
1,0,2.0,1.0,h,,
2,1,,1.0,,,
3,0,4.0,1.0,i,,
4,1,5.0,,h,,


In [109]:
df.columns

Index(['CLASS', 'A', 'B', 'C', 'D', 'E'], dtype='object')

In [110]:
def create_column_filter(df):
    new_df = df.copy()
    all_columns = new_df.columns
    saved_columns = []
    
    for column_name in new_df.columns:
        if column_name == 'CLASS':
            continue
        column_value = new_df.loc[:, column_name]
        values = []
        for item in column_value:
            if pd.isna(item):
                continue
            else:
                values.append(item)
        if len(set(values)) > 1 or len(values) == 1:
            saved_columns.append(column_name)
            
    deleted_colums = list(set(all_columns) ^ set(saved_columns))
    deleted_colums.remove('CLASS')
    
    for item in deleted_colums:
        new_df.pop(item)
    return new_df, saved_columns

In [111]:
new_df, column_filter = create_column_filter(df)
print(new_df)
print(column_filter)

   CLASS    A    C    E
0      1  1.0    h  1.0
1      0  2.0    h  NaN
2      1  NaN  NaN  NaN
3      0  4.0    i  NaN
4      1  5.0    h  NaN
['A', 'C', 'E']


In [112]:
def apply_column_filter(df, column_filter):
    new_df = df.copy()
    for item in column_filter:
        new_df.pop(item)
    return new_df

In [113]:
new_df = pd.DataFrame({"CLASS":[1,0,0],"A":[4,5,6],"B":[1,2,1],"C":[np.nan,np.nan,np.nan],"D":[np.nan,4,5], "E":[1,np.nan,np.nan]})
print(new_df)
filtered_new_df = apply_column_filter(new_df,column_filter)
print(filtered_new_df)

   CLASS  A  B   C    D    E
0      1  4  1 NaN  NaN  1.0
1      0  5  2 NaN  4.0  NaN
2      0  6  1 NaN  5.0  NaN
   CLASS  B    D
0      1  1  NaN
1      0  2  4.0
2      0  1  5.0


In [114]:
# 1b
# Insert the functions create_normalization and apply_normalization below (after the comments)
#
# Input to create_normalization:
# df: a dataframe (where the column names "CLASS" and "ID" have special meaning)
# normalizationtype: "minmax" (default) or "zscore"
#
# Output from create_normalization:
# df            - a new dataframe, where each numeric value in a column has been replaced by a normalized value
# normalization - a mapping (dictionary) from each column name to a triple, consisting of
#                ("minmax",min_value,max_value) or ("zscore",mean,std)
#
# Hint 1: First copy the input dataframe and modify the copy (the input dataframe should be kept unchanged)
#
# Hint 2: Consider columns of type "float" or "int" only (and which are not labeled "CLASS" or "ID"),
#         the other columns should remain unchanged
#
# Hint 3: Take a close look at the lecture slides on data preparation
#
# Input to apply_normalization:
# df            - a dataframe
# normalization - a mapping (dictionary) from column names to triples (see above)
#
# Output from apply_normalization:
# df - a new dataframe, where each numerical value has been normalized according to the mapping
#
# Hint 1: First copy the input dataframe and modify the copy (the input dataframe should be kept unchanged)
#
# Hint 2: For minmax-normalization, you may consider to limit the output range to [0,1]

In [115]:
glass_train_df = pd.read_csv("/Users/zhangziheng/OneDrive/KTH/ID2214 HT21 Programming for Data Science/Assignment1/glass_train.csv")
glass_test_df = pd.read_csv("/Users/zhangziheng/OneDrive/KTH/ID2214 HT21 Programming for Data Science/Assignment1/glass_test.csv")
glass_test_df.head()

Unnamed: 0,ID,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,CLASS
0,101,1.51655,12.75,2.85,1.44,73.27,0.57,8.79,0.11,0.22,2
1,104,1.52725,13.8,3.15,0.66,70.57,0.08,11.64,0.0,0.0,2
2,44,1.5221,13.73,3.84,0.72,71.76,0.17,9.74,0.0,0.0,1
3,17,1.51784,12.68,3.67,1.16,73.11,0.61,8.7,0.0,0.0,1
4,81,1.51592,12.86,3.52,2.12,72.66,0.69,7.97,0.0,0.0,2


In [116]:
def create_normalization(df,normalizationtype="minmax"):
    normalized_df = df
    normalization = {}
#     display(normalized_df)
    if normalizationtype == "minmax":
        print("minmax mode")
        normalized_df = (df-df.min())/(df.max()-df.min())
        display(df) 
        for index, row in df.iteritems():
#             print(index,row)
            if index in ["ID","CLASS"]:
                continue
            normalization[index]=("minmax",row.mincreate_normalization(),row.max())
    if normalizationtype == "zscore":
        print("zscore mode")
        normalized_df = (df-df.mean())/df.std()
        for index, row in df.iteritems():
#             display(index,row)
            if index in ["ID","CLASS"]:
                continue
            normalization[index]=("minmax",row.mean(),row.std())
    normalized_df['ID'] = df['ID']
    normalized_df['CLASS'] = df['CLASS']
#     display(normalized_df)
    return normalized_df, normalization

def apply_normalization(df,normalization):
    df2 = df
    for index, row in df.iteritems():
        if index in ["ID","CLASS"]:
            continue
        n = normalization[index]
        if n[0] == "minmax":
            df2[index] = (row-n[1])/(n[2]-n[1])
        if n[0] == "zscore":
            df2[index] = (row-n[1])/n[2]
    return df2

In [117]:
glass_train_norm, normalization = create_normalization(glass_train_df,normalizationtype="minmax")
print("normalization:\n")
for f in normalization:
    print("{}:{}".format(f,normalization[f]))

minmax mode


Unnamed: 0,ID,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,CLASS
0,202,1.51653,11.95,0.00,1.19,75.18,2.70,8.93,0.00,0.00,7
1,124,1.51707,13.48,3.48,1.71,72.52,0.62,7.99,0.00,0.00,2
2,152,1.52127,14.32,3.90,0.83,71.50,0.00,9.49,0.00,0.00,3
3,197,1.51556,13.87,0.00,2.54,73.23,0.14,9.41,0.81,0.01,7
4,144,1.51709,13.00,3.47,1.79,72.72,0.66,8.18,0.00,0.00,2
...,...,...,...,...,...,...,...,...,...,...,...
102,178,1.51937,13.79,2.41,1.19,72.76,0.00,9.77,0.00,0.00,6
103,160,1.51796,13.50,3.36,1.63,71.94,0.57,8.81,0.00,0.09,3
104,88,1.51645,13.40,3.49,1.52,72.65,0.67,8.08,0.00,0.10,2
105,98,1.51743,12.20,3.25,1.16,73.55,0.62,8.90,0.00,0.24,2


normalization:

RI:('minmax', 1.51131, 1.53125)
Na:('minmax', 10.73, 15.79)
Mg:('minmax', 0.0, 4.49)
Al:('minmax', 0.29, 3.04)
Si:('minmax', 69.81, 75.18)
K:('minmax', 0.0, 6.21)
Ca:('minmax', 5.43, 14.68)
Ba:('minmax', 0.0, 3.15)
Fe:('minmax', 0.0, 0.37)


In [118]:
glass_test_norm = apply_normalization(glass_test_df,normalization)
display("glass_test_norm",glass_test_norm)

'glass_test_norm'

Unnamed: 0,ID,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,CLASS
0,101,0.262788,0.399209,0.634744,0.418182,0.644320,0.091787,0.363243,0.034921,0.594595,2
1,104,0.799398,0.606719,0.701559,0.134545,0.141527,0.012882,0.671351,0.000000,0.000000,2
2,44,0.541123,0.592885,0.855234,0.156364,0.363128,0.027375,0.465946,0.000000,0.000000,1
3,17,0.327482,0.385375,0.817372,0.316364,0.614525,0.098229,0.353514,0.000000,0.000000,1
4,81,0.231194,0.420949,0.783964,0.665455,0.530726,0.111111,0.274595,0.000000,0.000000,2
...,...,...,...,...,...,...,...,...,...,...,...
102,80,0.230191,0.413043,0.783964,0.585455,0.567970,0.111111,0.274595,0.000000,0.000000,2
103,141,0.280341,0.513834,0.788419,0.480000,0.508380,0.109501,0.289730,0.000000,0.000000,2
104,14,0.309428,0.420949,0.792873,0.356364,0.633147,0.086957,0.318919,0.000000,0.459459,1
105,132,0.743731,0.586957,0.000000,0.389091,0.266294,0.030596,0.865946,0.000000,0.270270,2


In [119]:
# 1c
# Insert the functions create_imputation and apply_imputation below (after the comments)
#
# Input to create_imputation:
# df: a dataframe (where the column names "CLASS" and "ID" have special meaning)
#
# Output from create_imputation:
# df         - a new dataframe, where each missing numeric value in a column has been replaced by the mean of that column 
#              and each missing categoric value in a column has been replaced by the mode of that column
# imputation - a mapping (dictionary) from column name to value that has replaced missing values
#
# Hint 1: First copy the input dataframe and modify the copy (the input dataframe should be kept unchanged)
#
# Hint 2: Handle columns of type "float" or "int" only (and which are not labeled "CLASS" or "ID") in one way
#         and columns of type "object" and "category" in other ways
#
# Hint 3: Consider using the pandas functions mean and mode respectively, as well as fillna
#
# Hint 4: In the rare case of all values in a column being missing*, replace numeric values with 0,
#         object values with "" and category values with the first category (cat.categories[0])  
#
#         *Note that this will not occur if the previous column filter function has been applied
#
# Input to apply_imputation:
# df         - a dataframe
# imputation - a mapping (dictionary) from column name to value that should replace missing values
#
# Output from apply_imputation:
# df - a new dataframe, where each missing value has been replaced according to the mapping
#
# Hint 1: First copy the input dataframe and modify the copy (the input dataframe should be kept unchanged)
#
# Hint 2: Consider using fillna

In [120]:
## 缺失的数字被这一列的平均数代替
## 缺失的分类被这一列的众数代替
def create_imputation(dataframe):
    my_df = dataframe.copy()
    imputation = {}
    
    for column in my_df.columns:
        ## the column names "CLASS" and "ID" have special meaning
        if column != "CLASS" and column != "ID":
            
            ## 如果是数字，用平均数代替
            if my_df[column].dtypes == "int" or my_df[column].dtypes == "float":
                my_df[column].fillna(my_df[column].mean(), inplace=True)
                imputation[column] = my_df[column].mean()
            ## 如果是类别，用众数代替
            else:
                my_df[column].fillna(my_df[column].mode()[0], inplace=True)
                imputation[column] = my_df[column].mode()[0]
    return my_df, imputation

In [121]:
anneal_train_df = pd.read_csv("/Users/zhangziheng/OneDrive/KTH/ID2214 HT21 Programming for Data Science/Assignment1/anneal_train.csv")
anneal_test_df = pd.read_csv("/Users/zhangziheng/OneDrive/KTH/ID2214 HT21 Programming for Data Science/Assignment1/anneal_test.csv")

anneal_train_imp, imputation = create_imputation(anneal_train_df)
display(anneal_train_imp)
display(imputation)

Unnamed: 0,family,product-type,steel,carbon,hardness,temper_rolling,condition,formability,strength,non-ageing,...,s,p,shape,thick,width,len,oil,bore,packing,CLASS
0,TN,C,A,0,0,T,S,2.000000,0,N,...,,,SHEET,0.999,1220.0,4880,Y,0,3.0,3
1,TN,C,A,0,0,T,A,2.000000,0,N,...,,,SHEET,0.700,1320.0,4880,Y,0,3.0,3
2,TN,C,A,0,0,T,S,3.000000,0,N,...,,,SHEET,0.500,1220.0,4880,Y,0,3.0,5
3,ZS,C,A,0,50,T,S,2.251748,0,N,...,,,SHEET,0.451,1250.0,762,Y,0,3.0,3
4,TN,C,A,0,85,T,S,2.251748,0,N,...,,,COIL,4.000,1000.0,0,Y,600,3.0,U
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
444,ZS,C,A,0,70,T,S,2.251748,0,N,...,,,COIL,0.600,610.0,0,Y,0,3.0,3
445,TN,C,A,0,0,T,S,1.000000,0,N,...,,,COIL,0.300,609.9,0,Y,0,3.0,3
446,TN,C,K,45,0,T,S,2.251748,0,N,...,,,COIL,2.300,900.0,0,Y,600,3.0,3
447,TN,C,A,0,0,T,S,3.000000,0,N,...,,,SHEET,1.000,1320.0,4880,Y,0,3.0,3


{'family': 'TN',
 'product-type': 'C',
 'steel': 'A',
 'carbon': 3.859688195991091,
 'hardness': 13.084632516703786,
 'temper_rolling': 'T',
 'condition': 'S',
 'formability': 2.2517482517482557,
 'strength': 26.302895322939868,
 'non-ageing': 'N',
 'surface-finish': 'P',
 'surface-quality': 'E',
 'enamelability': 1.7142857142857018,
 'bc': 'Y',
 'bf': 'Y',
 'bt': 'Y',
 'bw/me': 'B',
 'bl': 'Y',
 'm': nan,
 'chrom': 'C',
 'phos': 'P',
 'cbond': 'Y',
 'marvi': nan,
 'exptl': nan,
 'ferro': 'Y',
 'corr': nan,
 'blue-bright-varn-clean': 'B',
 'lustre': 'Y',
 'jurofm': nan,
 's': nan,
 'p': nan,
 'shape': 'SHEET',
 'thick': 1.1911937639198227,
 'width': 769.4917594654789,
 'len': 1229.293986636971,
 'oil': 'Y',
 'bore': 35.18930957683742,
 'packing': 3.0}

In [122]:
def apply_imputation(df, imputation):
    my_df = df.copy()
    for column in imputation:
        my_df[column].fillna(imputation[column], inplace=True)
    return my_df

In [123]:
# Test your code (leave this part unchanged)

anneal_train_df = pd.read_csv("/Users/zhangziheng/OneDrive/KTH/ID2214 HT21 Programming for Data Science/Assignment1/anneal_train.csv")
anneal_test_df = pd.read_csv("/Users/zhangziheng/OneDrive/KTH/ID2214 HT21 Programming for Data Science/Assignment1/anneal_test.csv")


anneal_train_imp, imputation = create_imputation(anneal_train_df)
anneal_test_imp = apply_imputation(anneal_test_df,imputation)

print("Imputation:\n")
for f in imputation:
    print("{}:{}".format(f,imputation[f]))

print("\nNo. of replaced missing values in training data:\n{}".format(anneal_train_imp.count()-anneal_train_df.count()))
print("\nNo. of replaced missing values in test data:\n{}".format(anneal_test_imp.count()-anneal_test_df.count()))

Imputation:

family:TN
product-type:C
steel:A
carbon:3.859688195991091
hardness:13.084632516703786
temper_rolling:T
condition:S
formability:2.2517482517482557
strength:26.302895322939868
non-ageing:N
surface-finish:P
surface-quality:E
enamelability:1.7142857142857018
bc:Y
bf:Y
bt:Y
bw/me:B
bl:Y
m:nan
chrom:C
phos:P
cbond:Y
marvi:nan
exptl:nan
ferro:Y
corr:nan
blue-bright-varn-clean:B
lustre:Y
jurofm:nan
s:nan
p:nan
shape:SHEET
thick:1.1911937639198227
width:769.4917594654789
len:1229.293986636971
oil:Y
bore:35.18930957683742
packing:3.0

No. of replaced missing values in training data:
family                    382
product-type                0
steel                      43
carbon                      0
hardness                    0
temper_rolling            374
condition                 160
formability               163
strength                    0
non-ageing                391
surface-finish            444
surface-quality           128
enamelability             442
bc               

In [124]:
# 1d
# Insert the functions create_bins and apply_bins below
#
# Input to create_bins:
# df      - a dataframe
# nobins  - no. of bins (default = 10)
# bintype - either "equal-width" (default) or "equal-size" 
#
# Output from create_bins:
# df      - a new dataframe, where each numeric feature value has been replaced by a categoric (corresponding to some bin)
# binning - a mapping (dictionary) from column name to bins (threshold values for the bin)
#
# Hint 1: First copy the input dataframe and modify the copy (the input dataframe should be kept unchanged)
#
# Hint 2: Discretize columns of type "float" or "int" only (and which are not labeled "CLASS" or "ID")
#
# Hint 3: Consider using pd.cut and pd.qcut respectively, with labels=False and retbins=True
#
# Hint 4: Set all columns in the new dataframe to be of type "category"
#
# Hint 5: Set the categories of the discretized features to be [0,...,nobins-1]
#
# Hint 6: Change the first and the last element of each binning to -np.inf and np.inf respectively 
#
# Input to apply_bins:
# df      - a dataframe
# binning - a mapping (dictionary) from column name to bins (threshold values for the bin)
#
# Output from apply_bins:
# df - a new dataframe, where each numeric feature value has been replaced by a categoric (corresponding to some bin)
#
# Hint 1: First copy the input dataframe and modify the copy (the input dataframe should be kept unchanged)
#
# Hint 2: Consider using pd.cut 
#
# Hint 3: Set all columns in the new dataframe to be of type "category"
#
# Hint 4: Set the categories of the discretized features to be [0,...,nobins-1]


In [125]:
# 1d
def create_bins(df, nobins=10, bintype="equal-width"):
    my_df = df.copy()
    binning = {}
    
    for column in my_df.columns:
        if column != "CLASS" and column != "ID" and (my_df[column].dtype in ["float64", "float32", "int64", "int32"]):
            if bintype == "equal-width":
                my_df[column], bins = pd.cut(my_df[column], nobins, retbins=True, duplicates="drop", labels=False)
                ## 记录分类的区间
                binning[column] = bins
            elif bintype == "equal-size":
                my_df[column], bins = pd.qcut(my_df[column], q=nobins, retbins=True, duplicates="drop", labels=False)
                binning[column] = bins
            ## Set all columns in the new dataframe to be of type "category"
            my_df[column] = my_df[column].astype("category")
            my_df[column] = my_df[column].cat.set_categories([str(i) for i in my_df[column].cat.categories], rename=True)
            binning[column][0] = -np.inf
            binning[column][-1] = np.inf
        else:
            my_df[column] = my_df[column].astype('category')
    return my_df, binning
    


In [126]:
glass_train_df = pd.read_csv("glass_train.csv")
display(glass_train_df)
glass_train_disc, binning = create_bins(glass_train_df,nobins=10,bintype="equal-size")
display(glass_train_disc)
display(binning)

Unnamed: 0,ID,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,CLASS
0,202,1.51653,11.95,0.00,1.19,75.18,2.70,8.93,0.00,0.00,7
1,124,1.51707,13.48,3.48,1.71,72.52,0.62,7.99,0.00,0.00,2
2,152,1.52127,14.32,3.90,0.83,71.50,0.00,9.49,0.00,0.00,3
3,197,1.51556,13.87,0.00,2.54,73.23,0.14,9.41,0.81,0.01,7
4,144,1.51709,13.00,3.47,1.79,72.72,0.66,8.18,0.00,0.00,2
...,...,...,...,...,...,...,...,...,...,...,...
102,178,1.51937,13.79,2.41,1.19,72.76,0.00,9.77,0.00,0.00,6
103,160,1.51796,13.50,3.36,1.63,71.94,0.57,8.81,0.00,0.09,3
104,88,1.51645,13.40,3.49,1.52,72.65,0.67,8.08,0.00,0.10,2
105,98,1.51743,12.20,3.25,1.16,73.55,0.62,8.90,0.00,0.24,2


Unnamed: 0,ID,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,CLASS
0,202,3,0,0,2,9,8,6,0,0,7
1,124,4,5,4,7,3,6,1,0,0,2
2,152,8,8,7,0,0,0,7,0,0,3
3,197,0,7,0,9,8,1,7,1,0,7
4,144,4,2,3,7,3,7,2,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...
102,178,7,6,1,2,4,0,8,0,0,6
103,160,6,6,2,7,1,4,5,0,1,3
104,88,2,5,4,5,3,7,1,0,1,2
105,98,4,0,2,1,9,6,6,0,2,2


{'RI': array([    -inf, 1.515896, 1.51618 , 1.516516, 1.516866, 1.51753 ,
        1.517902, 1.518618, 1.520114, 1.521846,      inf]),
 'Na': array([  -inf, 12.73 , 12.872, 13.   , 13.222, 13.38 , 13.492, 13.794,
        14.198, 14.82 ,    inf]),
 'Mg': array([ -inf, 1.82 , 3.188, 3.41 , 3.476, 3.55 , 3.61 , 3.728,   inf]),
 'Al': array([ -inf, 0.906, 1.172, 1.23 , 1.348, 1.48 , 1.54 , 1.622, 1.808,
        2.094,   inf]),
 'Si': array([  -inf, 71.756, 72.196, 72.388, 72.72 , 72.79 , 72.966, 73.06 ,
        73.208, 73.372,    inf]),
 'K': array([ -inf, 0.006, 0.148, 0.39 , 0.54 , 0.576, 0.6  , 0.636, 0.67 ,
          inf]),
 'Ca': array([  -inf,  7.978,  8.112,  8.338,  8.554,  8.67 ,  8.81 ,  9.032,
         9.674, 10.924,    inf]),
 'Ba': array([-inf, 0.78,  inf]),
 'Fe': array([ -inf, 0.062, 0.118, 0.24 ,   inf])}

In [127]:
def apply_bins(df, binning):
    my_df = df.copy()
    bins = {}
    for column in binning:
        my_df[column] = pd.cut(my_df[column], binning[column], labels=False)
        my_df[column] = my_df[column].astype("category")
        my_df[column] = my_df[column].cat.set_categories([str(i) for i in my_df[column].cat.categories], rename = True)        
    my_df = my_df.astype("category")
    return my_df

In [128]:
# Test your code  (leave this part unchanged)

glass_train_df = pd.read_csv("/Users/zhangziheng/OneDrive/KTH/ID2214 HT21 Programming for Data Science/Assignment1/glass_train.csv")

glass_test_df = pd.read_csv("/Users/zhangziheng/OneDrive/KTH/ID2214 HT21 Programming for Data Science/Assignment1/glass_test.csv")

glass_train_disc, binning = create_bins(glass_train_df,nobins=10,bintype="equal-size")
print("binning:\n")
for f in binning:
    print("{}:{}".format(f,binning[f]))

glass_test_disc = apply_bins(glass_test_df,binning)
print("\nglass_test_disc:\n")
glass_test_disc

binning:

RI:[    -inf 1.515896 1.51618  1.516516 1.516866 1.51753  1.517902 1.518618
 1.520114 1.521846      inf]
Na:[  -inf 12.73  12.872 13.    13.222 13.38  13.492 13.794 14.198 14.82
    inf]
Mg:[ -inf 1.82  3.188 3.41  3.476 3.55  3.61  3.728   inf]
Al:[ -inf 0.906 1.172 1.23  1.348 1.48  1.54  1.622 1.808 2.094   inf]
Si:[  -inf 71.756 72.196 72.388 72.72  72.79  72.966 73.06  73.208 73.372
    inf]
K:[ -inf 0.006 0.148 0.39  0.54  0.576 0.6   0.636 0.67    inf]
Ca:[  -inf  7.978  8.112  8.338  8.554  8.67   8.81   9.032  9.674 10.924
    inf]
Ba:[-inf 0.78  inf]
Fe:[ -inf 0.062 0.118 0.24    inf]

glass_test_disc:



Unnamed: 0,ID,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,CLASS
0,101,3,1,1,4,8,4,5,0,2,2
1,104,9,7,1,0,0,1,9,0,0,2
2,44,9,6,7,0,1,2,8,0,0,1
3,17,5,0,6,1,7,6,5,0,0,1
4,81,1,1,4,9,3,8,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...
102,80,1,1,4,8,5,8,0,0,0,2
103,141,4,4,4,6,3,8,1,0,0,2
104,14,4,1,5,3,8,3,3,0,2,1
105,132,9,6,0,4,0,2,9,0,1,2


In [129]:
# 1e
# Insert the functions create_one_hot and apply_one_hot below
#
# Input to create_one_hot:
# df: a dataframe
#
# Output from create_one_hot:
# df      - a new dataframe, where each categoric feature has been replaced by a set of binary features 
#           (as many new features as there are possible values)
# one_hot - a mapping (dictionary) from column name to a set of categories (possible values for the feature)
#
# Hint 1: First copy the input dataframe and modify the copy (the input dataframe should be kept unchanged)
#
# Hint 2: Consider columns of type "object" or "category" only (and which are not labeled "CLASS" or "ID")
#
# Hint 3: Consider creating new column names by merging the original column name and the categorical value
#
# Hint 4: Set all new columns to be of type "float"
#
# Hint 5: Do not forget to remove the original categoric feature
#
# Input to apply_one_hot:
# df      - a dataframe
# one_hot - a mapping (dictionary) from column name to categories
#
# Output from apply_one_hot:
# df - a new dataframe, where each categoric feature has been replaced by a set of binary features
#
# Hint: See the above Hints

In [138]:
def create_one_hot(df):
    my_df = df.copy()
    df_new = df.copy()
    one_hot = {}
    for col in df.columns:
        if col != "CLASS" and col != "ID":  
            if str(my_df.dtypes[col]) == "category" or str(my_df.dtypes[col]) == "object":
                my_df[col] = my_df[col].astype("category")
               # one_hot[col] = df[col].cat.categories
                one_hot[col] = list(my_df[col].cat.categories)
                # display(one_hot)
                for i in one_hot[col]:
                    name = col + "_" + str(i)
                    # print(name)
                    # print(my_df[col], i)
                    # print("===")
                    new_col = my_df[col] == i
                    new_col = new_col.astype("float")
                    df_new[name] = new_col 
                df_new = df_new.drop(columns = col, axis = 1) 

    return df_new, one_hot

def split(dataframe, testfraction=0.5):
    
    df = dataframe.copy()
    
    df_random = df.reindex(np.random.permutation(df.index))
    
    trainingdf = df_random[0: int((1-testfraction)*df.shape[0])]
    testdf = df_random[int((1-testfraction)*df.shape[0])+1 : df.shape[0]]
    
    return trainingdf, testdf

def apply_one_hot(dataframe, one_hot):
    df = dataframe.copy()
    df_new = df.copy()
    for col in df.columns:
        if col in one_hot.keys():
            for i in one_hot[col]:
                name = col + "_" + str(i)
                new_col = df[col] == i
                new_col = pd.Series(new_col.astype("float"))
                df_new[name] = new_col
            df_new = df_new.drop(columns = col, axis = 1)

    return df_new

In [139]:
tictactoe = pd.read_csv("tic-tac-toe.csv")
display(tictactoe)
train_df, test_df = split(tictactoe) # Using your above function
new_train, one_hot = create_one_hot(train_df)
display(new_train)
display(one_hot)

Unnamed: 0,top-left-square,top-middle-square,top-right-square,middle-left-square,middle-middle-square,middle-right-square,bottom-left-square,bottom-middle-square,bottom-right-square,CLASS
0,x,x,x,x,o,o,x,o,o,positive
1,x,x,x,x,o,o,o,x,o,positive
2,x,x,x,x,o,o,o,o,x,positive
3,x,x,x,x,o,o,o,b,b,positive
4,x,x,x,x,o,o,b,o,b,positive
...,...,...,...,...,...,...,...,...,...,...
953,o,x,x,x,o,o,o,x,x,negative
954,o,x,o,x,x,o,x,o,x,negative
955,o,x,o,x,o,x,x,o,x,negative
956,o,x,o,o,x,x,x,o,x,negative


435    o
723    x
848    o
141    x
546    b
      ..
847    o
410    o
324    o
665    x
5      x
Name: top-left-square, Length: 479, dtype: category
Categories (3, object): ['b', 'o', 'x'] b
===
435    o
723    x
848    o
141    x
546    b
      ..
847    o
410    o
324    o
665    x
5      x
Name: top-left-square, Length: 479, dtype: category
Categories (3, object): ['b', 'o', 'x'] o
===
435    o
723    x
848    o
141    x
546    b
      ..
847    o
410    o
324    o
665    x
5      x
Name: top-left-square, Length: 479, dtype: category
Categories (3, object): ['b', 'o', 'x'] x
===
435    b
723    b
848    o
141    o
546    o
      ..
847    o
410    o
324    x
665    x
5      x
Name: top-middle-square, Length: 479, dtype: category
Categories (3, object): ['b', 'o', 'x'] b
===
435    b
723    b
848    o
141    o
546    o
      ..
847    o
410    o
324    x
665    x
5      x
Name: top-middle-square, Length: 479, dtype: category
Categories (3, object): ['b', 'o', 'x'] o
===
435    b
72

Unnamed: 0,CLASS,top-left-square_b,top-left-square_o,top-left-square_x,top-middle-square_b,top-middle-square_o,top-middle-square_x,top-right-square_b,top-right-square_o,top-right-square_x,...,middle-right-square_x,bottom-left-square_b,bottom-left-square_o,bottom-left-square_x,bottom-middle-square_b,bottom-middle-square_o,bottom-middle-square_x,bottom-right-square_b,bottom-right-square_o,bottom-right-square_x
435,positive,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
723,negative,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
848,negative,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
141,positive,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
546,positive,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
847,negative,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
410,positive,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
324,positive,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
665,negative,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0


{'top-left-square': ['b', 'o', 'x'],
 'top-middle-square': ['b', 'o', 'x'],
 'top-right-square': ['b', 'o', 'x'],
 'middle-left-square': ['b', 'o', 'x'],
 'middle-middle-square': ['b', 'o', 'x'],
 'middle-right-square': ['b', 'o', 'x'],
 'bottom-left-square': ['b', 'o', 'x'],
 'bottom-middle-square': ['b', 'o', 'x'],
 'bottom-right-square': ['b', 'o', 'x']}

In [140]:
# 1f
# Insert the function split below
#
# Input to split:
# df           - a dataframe
# testfraction - a float in the range (0,1) (default = 0.5)
#
# Output from split:
# trainingdf - a dataframe consisting of a random sample of (1-testfraction) of the rows in df
# testdf     - a dataframe consisting of the rows in df that are not included in trainingdf
#
# Hint: You may use np.random.permutation(df.index) to get a permuted list of indexes where a 
#       prefix corresponds to the test instances, and the suffix to the training instances

In [145]:
def split(dataframe, testfraction=0.5):
    
    df = dataframe.copy()
    # display(df)
    
    df_random = df.reindex(np.random.permutation(df.index))
    # display(df_random)
    
    trainingdf = df_random[0: int((1-testfraction)*df.shape[0])]
    testdf = df_random[int((1-testfraction)*df.shape[0])+1 : df.shape[0]]
    
    return trainingdf, testdf

In [146]:
# Test your code  (leave this part unchanged)

glass_df = pd.read_csv("glass.csv")

glass_train, glass_test = split(glass_df,testfraction=0.25)

print("Training IDs:\n{}".format(glass_train["ID"].values))

print("\nTest IDs:\n{}".format(glass_test["ID"].values))

print("\nOverlap: {}".format(set(glass_train["ID"]).intersection(set(glass_test["ID"]))))

Unnamed: 0,ID,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,CLASS
0,1,1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.0,1
1,2,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.0,1
2,3,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.0,1
3,4,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.0,1
4,5,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...
209,210,1.51623,14.14,0.00,2.88,72.61,0.08,9.18,1.06,0.0,7
210,211,1.51685,14.92,0.00,1.99,73.06,0.00,8.40,1.59,0.0,7
211,212,1.52065,14.36,0.00,2.02,73.42,0.00,8.44,1.64,0.0,7
212,213,1.51651,14.38,0.00,1.94,73.61,0.00,8.48,1.57,0.0,7


Unnamed: 0,ID,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,CLASS
43,44,1.52210,13.73,3.84,0.72,71.76,0.17,9.74,0.0,0.00,1
132,133,1.51813,13.43,3.98,1.18,72.49,0.58,8.15,0.0,0.00,2
17,18,1.52196,14.36,3.85,0.89,71.36,0.15,9.15,0.0,0.00,1
28,29,1.51768,12.56,3.52,1.43,73.15,0.57,8.54,0.0,0.00,1
56,57,1.51215,12.99,3.47,1.12,72.98,0.62,8.35,0.0,0.31,1
...,...,...,...,...,...,...,...,...,...,...,...
115,116,1.51846,13.41,3.89,1.33,72.38,0.51,8.28,0.0,0.00,2
133,134,1.51800,13.71,3.93,1.54,71.81,0.54,8.21,0.0,0.15,2
79,80,1.51590,12.82,3.52,1.90,72.86,0.69,7.97,0.0,0.00,2
98,99,1.51689,12.67,2.88,1.71,73.21,0.73,8.54,0.0,0.00,2


Training IDs:
[ 44 133  18  29  57 210  16  43 100  28 191  86 164 176 214 199 141  97
  34 149 207  66 150 156  87 139 155  12 169 189 204  36   1  48 200  65
 168 119  95 112 101 162 178  55 209 173  25 197  33   5  37  67  88 118
  61 115 180  78 140  42 213  49 182 185 144 152 136 154  52  84 110  32
  73 131 167  58   2  64 146 123 201 121  63 195  14  31 196 205  39   3
 181 203 192   4  10  75 151 158  17 187 125 177 198 147  62   8 127   9
 117  90  56 165  41  53  47 170  60  96 106  23  91 130 138   6 183 124
 206 161  79 129 107 208 104 171 142  22 193   7  35 153  54  68  21  46
 113 114  45 166  72 202  15  76  24 132  71 145 103  59 102 212]

Test IDs:
[190 160  74 126 143 174 122 128 111 105  94 186  98  20  89  26  81  69
 108 137 188  51  27  70  40 175 184 109 172  38  13  92  83  30  19 159
 211  93 157  77 163  85  11 120 148 194  82 179 116 134  80  99  50]

Overlap: set()


In [147]:
# 1g
# Insert the function accuracy below
#
# Input to accuracy:
# df            - a dataframe with class labels as column names and each row corresponding to
#                 a prediction with estimated probabilities for each class
# correctlabels - an array (or list) of the correct class label for each prediction
#                 (the number of correct labels must equal the number of rows in df)
#
# Output from accuracy:
# accuracy - the fraction of cases for which the predicted class label coincides with the correct label
#
# Hint: In case the label receiving the highest probability is not unique, you may
#       resolve that by picking the first (as ordered by the column names) or 
#       by randomly selecting one of the labels with highest probaility.


In [148]:
def accuracy(dataframe, correctlabels):
    df = dataframe.copy()
    
    # 返回第一次出现的最大索引
    labels = df.idxmax(axis=1)
    display(labels)
    truelabels = (labels == correctlabels).sum(axis=0)
    accuracy = truelabels/len(df)
    return accuracy

In [149]:
predictions = pd.DataFrame({"A":[0.5,0.5,0.5,0.25,0.25],"B":[0.5,0.25,0.25,0.5,0.25],"C":[0.0,0.25,0.25,0.25,0.5]})

correctlabels = ["B","A","B","B","C"]

accuracy(predictions,correctlabels)

0    A
1    A
2    A
3    B
4    C
dtype: object

0.6