In [217]:
import pandas as pd 
import numpy as np
import matplotlib as plt
%matplotlib inline

In [79]:
df = pd.read_csv('adult.data', sep=",", header=None)

In [80]:
df.head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [81]:
df.columns = ['age' , 'workclass', 'fnlwgt', 'education', 'education_num', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours-per-week','native_country','salary']

In [100]:
cat_columns = [ 'workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex','native_country']

In [106]:
y = df['salary']
df = df.drop(columns=['salary'])


In [108]:
df = pd.get_dummies(df,columns=cat_columns)

In [109]:
df.head()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours-per-week,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Private,workclass_ Self-emp-inc,...,native_country_ Portugal,native_country_ Puerto-Rico,native_country_ Scotland,native_country_ South,native_country_ Taiwan,native_country_ Thailand,native_country_ Trinadad&Tobago,native_country_ United-States,native_country_ Vietnam,native_country_ Yugoslavia
0,39,77516,13,2174,0,40,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,50,83311,13,0,0,13,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,38,215646,9,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,53,234721,7,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,28,338409,13,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [83]:
df.shape

(32561, 15)

In [85]:
df = df.replace(' ?',np.nan) 
df = df.dropna()

In [186]:
class Node():
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, info_gain=None, leaf_class=None):
        ''' constructor ''' 
        
        # for decision node
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.info_gain = info_gain
        
        #class
        self.leaf_class = leaf_class


In [212]:
class DecisionTreeClassifier():
    def __init__(self, min_samples_split=2, max_depth=100):
        
        
        #root
        self.root = None
        
        # base condition
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
      
    
    #Build the decision tree.
    def build_tree(self, dataset, curr_depth=0):

    
        X, Y = dataset[:,:-1], dataset[:,-1]
        num_samples, num_features = np.shape(X)
        if num_samples>=self.min_samples_split and curr_depth<=self.max_depth:
            
            best_split = self.get_best_split(dataset, num_samples, num_features)

            if best_split["info_gain"]>0:
    
                left_subtree = self.build_tree(best_split["dataset_left"], curr_depth+1)
                
                right_subtree = self.build_tree(best_split["dataset_right"], curr_depth+1)
    
                return Node(best_split["feature_index"], best_split["threshold"], 
                            left_subtree, right_subtree, best_split["info_gain"])
        
        
        leaf_class = self.calculate_leaf_class(Y)
        
        return Node(leaf_class=leaf_class)
    
    def get_best_split(self, dataset, num_samples, num_features):
        
        
        # dictionary to store the best split
        best_split = {}
        max_info_gain = -np.inf
        
        # for every feature in feature set
        for feature_index in range(num_features):
            feature_values = dataset[:, feature_index]
            possible_thresholds = np.unique(feature_values)
            #if value is 0 and 1 we take one of them
            if len(possible_thresholds) == 2:
                possible_thresholds = [0]
            #if there are many unique values we take 20 intervals
            elif len(possible_thresholds) > 20:
                temp = np.linspace(min(possible_thresholds),max(possible_thresholds),20)
                possible_thresholds = temp
            # all thresholds for continuous feature
            for threshold in possible_thresholds:
                #binary tree <= and >
                dataset_left, dataset_right = self.split(dataset, feature_index, threshold)
        
                if len(dataset_left)>0 and len(dataset_right)>0:
                    y, left_y, right_y = dataset[:, -1], dataset_left[:, -1], dataset_right[:, -1]
                    # compute information gain
                    curr_info_gain = self.information_gain(y, left_y, right_y)
                    # check if new attribute has more info gain , make new current
                    if curr_info_gain>max_info_gain:
                        best_split["feature_index"] = feature_index
                        best_split["threshold"] = threshold
                        best_split["dataset_left"] = dataset_left
                        best_split["dataset_right"] = dataset_right
                        best_split["info_gain"] = curr_info_gain
                        max_info_gain = curr_info_gain
                        

        return best_split
    
    def split(self, dataset, feature_index, threshold):
        #split data into less than nad greater than
        
        dataset_left = np.array([row for row in dataset if row[feature_index]<=threshold])
        dataset_right = np.array([row for row in dataset if row[feature_index]>threshold])
        return dataset_left, dataset_right
    
    def information_gain(self, parent, l_child, r_child):
        #calculating info gain given parent left_child and right_child
        
        weight_l = len(l_child) / len(parent)
        weight_r = len(r_child) / len(parent)
        gain = self.entropy(parent) - (weight_l*self.entropy(l_child) + weight_r*self.entropy(r_child))
        return gain
    
    def entropy(self, y):
        #calculate entropy
        
        class_labels = np.unique(y)
        entropy = 0
        for cls in class_labels:
            p_cls = len(y[y == cls]) / len(y)
            entropy += -p_cls * np.log2(p_cls)
        return entropy
        
    def calculate_leaf_class(self, Y):
        #final level leaf class
        
        Y = list(Y)
        return max(Y, key=Y.count)
    

    def make_decisionTree(self, X, Y):
        #init build tree
        
        dataset = np.concatenate((X, Y), axis=1)
        self.root = self.build_tree(dataset)
    
    def predict(self, X):
        #predict multiple datapoints
        
        preditions = [self.one_predict(x, self.root) for x in X]
        return preditions
    
    def one_predict(self, x, tree):
        #predict single data point
        
        if tree.leaf_class!=None: return tree.leaf_class
        feature_val = x[tree.feature_index]
        if feature_val<=tree.threshold:
            return self.one_predict(x, tree.left)
        else:
            return self.one_predict(x, tree.right)

In [298]:
def tst_trn_split(X,Y , test_size  = 0.2):
    X['salary'] =  Y
    X = X.sample(frac=1).reset_index(drop=True)
    no_rows = round(0.8 * X.shape[0])
    X_train = X.iloc[0:no_rows,:-1]
    y_train = X.iloc[0:no_rows,-1].values.reshape(-1,1)
    X_test = X.iloc[no_rows: , :-1]
    y_test = X.iloc[no_rows: , -1].values.reshape(-1,1)
    return X_train, X_test, y_train, y_test
    
    
    

In [299]:
X = df.iloc[: , : -1]
Y = y.values.reshape(-1,1)

X_train, X_test, Y_train, Y_test = tst_trn_split(X, Y, test_size=.2)

In [255]:
classifier = DecisionTreeClassifier(min_samples_split=30, max_depth=10)
classifier.make_decisionTree(X_train,Y_train)


In [257]:
Y_pred = classifier.predict(np.array(X_test))


In [265]:
def cal_accuracy(Y_pred , Y_test):
    z = [1 if x==y else 0 for x,y in zip(Y_pred,Y_test)]
    print(sum(z)/len(z))
    return sum(z)/len(z)

In [259]:
cal_accuracy(Y_pred , Y_test)

0.8511519973479198


In [271]:
def bootstrap(X_train,Y_train):
    X_train['salary'] =  Y_train
    X_train = X_train.sample(frac=1).reset_index(drop=True)
    X_train = X_train.sample(frac=0.4,replace=True).reset_index(drop=True)
    y_train = X_train.iloc[:,-1].values.reshape(-1,1)
    X_train = X_train.iloc[: , :-1]
    return X_train, y_train

    
    
       

In [274]:
accuracy_list = []
Y_pred_list = []
for _ in range(10):
    X_train_b, y_train_b = bootstrap(X_train,Y_train)
    print(X_train_b.shape,y_train_b.shape)
    classifier_b = DecisionTreeClassifier(min_samples_split=30, max_depth=10)
    classifier_b.make_decisionTree(X_train_b,y_train_b)
    Y_pred_b = classifier_b.predict(np.array(X_test))
    Y_pred_list.append(Y_pred_b)
    accuracy_list.append(cal_accuracy(Y_pred_b , Y_test))
    
print(accuracy_list)
    

(9652, 104) (9652, 1)
0.8430300016575502
(9652, 104) (9652, 1)
0.841869716558926
(9652, 104) (9652, 1)
0.8388861263053208
(9652, 104) (9652, 1)
0.8329189457981104
(9652, 104) (9652, 1)
0.8355710260235372
(9652, 104) (9652, 1)
0.8359025360517156
(9652, 104) (9652, 1)
0.8395491463616774
(9652, 104) (9652, 1)
0.8373943311785181
(9652, 104) (9652, 1)
0.8422012265871043
(9652, 104) (9652, 1)
0.8448533068125311
[0.8430300016575502, 0.841869716558926, 0.8388861263053208, 0.8329189457981104, 0.8355710260235372, 0.8359025360517156, 0.8395491463616774, 0.8373943311785181, 0.8422012265871043, 0.8448533068125311]


In [285]:
Y_pred_list_col = (np.array(Y_pred_list).T)
for x in Y_pred_list_col:
    print(x)
    break
pred_final = [max(x.tolist(),key=x.tolist().count) for x in Y_pred_list_col]
cal_accuracy(pred_final,Y_test)
# def final_predictions(pred_list):
    

[' <=50K' ' <=50K' ' <=50K' ' >50K' ' <=50K' ' <=50K' ' <=50K' ' <=50K'
 ' >50K' ' <=50K']
0.8566219128128626


0.8566219128128626

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours-per-week,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Private,workclass_ Self-emp-inc,...,native_country_ Portugal,native_country_ Puerto-Rico,native_country_ Scotland,native_country_ South,native_country_ Taiwan,native_country_ Thailand,native_country_ Trinadad&Tobago,native_country_ United-States,native_country_ Vietnam,native_country_ Yugoslavia
0,19,28790,10,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
1,44,172479,9,15024,0,60,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,19,128363,10,0,0,30,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,34,82938,9,0,0,50,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,21,252253,10,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24125,64,254797,10,0,0,40,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
24126,53,137192,13,0,0,50,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
24127,35,140854,9,0,0,60,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0
24128,46,190729,10,0,0,40,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [301]:
X_pca =  pd.concat([X_train,X_test]).reset_index(drop=True)

In [305]:
X_pca = (X_pca - X_pca.mean(axis=0))/X_pca.std(axis=0) 
cov = X_pca.T @ X_pca

In [316]:
u , v ,vh = np.linalg.svd(cov ,full_matrices=True)

In [None]:
def PCA(X_train,X_test, n_components=20):
    X_pca =  pd.concat([X_train,X_test]).reset_index(drop=True)
    X_pca = (X_pca - X_pca.mean(axis=0))/X_pca.std(axis=0) 
    cov = X_pca.T @ X_pca
    u , v ,vh = np.linalg.svd(cov ,full_matrices=True)
    X_train = 
    for i in range n_components:
        