# In [0]:

import numpy as np
from sklearn import preprocessing
from sklearn.preprocessing import MaxAbsScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt


class BankDataset:
    """Bank-marketing dataset: loading, encoding and a descriptive report.

    Typical use is ``load`` followed by either ``report`` (which works on the
    raw string data) or ``preprocess`` (which replaces ``X`` and ``y`` by
    numeric arrays).
    """

    # Allowed values of every categorical feature, keyed by feature name.
    # Features that do not appear here are parsed as plain numbers.
    _CATEGORIES = {
        "job": ["admin.", "blue-collar", "entrepreneur", "housemaid", "management", "retired",
                "self-employed", "services", "student", "technician", "unemployed", "unknown"],
        "marital": ["divorced", "married", "single"],
        "education": ["primary", "secondary", "tertiary", "unknown"],
        "default": ["no", "yes"],
        "housing": ["no", "yes"],
        "loan": ["no", "yes"],
        "contact": ["cellular", "telephone", "unknown"],
        "month": ["jan", "feb", "mar", "apr", "may", "jun",
                  "jul", "aug", "sep", "oct", "nov", "dec"],
        "poutcome": ["failure", "other", "success", "unknown"],
    }

    def __init__(self, **kwargs):
        """Create an empty dataset; call ``load`` to fill ``X`` and ``y``."""
        super().__init__(**kwargs)

        # Feature matrix, set by ``load`` (strings) and rewritten by ``preprocess``.
        self.X = None

        # Class labels, set by ``load`` (strings) and rewritten by ``preprocess``.
        self.y = None

        # Column names, in file order.
        self.feature_names = ["age", "job", "marital", "education", "default", "balance", "housing",
                              "loan", "contact", "day", "month", "duration", "campaign", "pdays",
                              "previous", "poutcome"]
        self.target_names = ["no", "yes"]

        # "num" or "cat" for each entry of ``feature_names``.
        self.feature_types = ["num", "cat", "cat", "cat", "cat", "num", "cat",
                              "cat", "cat", "num", "cat", "num", "num", "num", "num", "cat"]

        # Filled in by ``preprocess``.
        self.feature_encoders = None
        self.class_encoder = None
        self.scaler = None

    def _build_feature_encoders(self, type_):
        """Return one encoder entry per feature for the requested encoding.

        Each entry is ``None`` (numeric feature), a fitted ``LabelEncoder``,
        or a ``(LabelEncoder, OneHotEncoder)`` pair. Pairs are only produced
        for ``type_ == "one-hot"`` and features with more than two categories;
        two-valued features stay a single 0/1 column.
        """
        if type_ not in ("numerical", "one-hot"):
            raise ValueError("Unable to load feature encoders for type {}".format(type_))

        encoders = []
        for name in self.feature_names:
            categories = self._CATEGORIES.get(name)
            if categories is None:
                encoders.append(None)
                continue

            label_encoder = preprocessing.LabelEncoder().fit(categories)
            if type_ == "one-hot" and len(categories) > 2:
                # ``categories=`` fixes the number of output columns no matter
                # which labels occur in the data; it replaces the ``n_values``
                # argument that recent scikit-learn releases no longer accept.
                one_hot = preprocessing.OneHotEncoder(categories=[np.arange(len(categories))])
                encoders.append((label_encoder, one_hot))
            else:
                encoders.append(label_encoder)
        return encoders

    def preprocess(self, type_, filter=(), apply_scaling=False):
        """Replace ``X`` and ``y`` by their numeric encodings.

        Args:
            type_: ``"numerical"`` to label-encode categorical features into a
                single integer column each, or ``"one-hot"`` to additionally
                expand features with more than two categories into one-hot
                columns.
            filter: column indices of features to leave out of the result.
            apply_scaling: when true, scale the encoded matrix with
                ``MaxAbsScaler`` and keep the fitted scaler in ``self.scaler``.

        Raises:
            ValueError: if the dataset has not been loaded, if ``type_`` is
                unknown, or if ``filter`` removes every feature.
        """
        if self.X is None or self.y is None:
            raise ValueError("Remember to call 'load' to load the dataset!")

        self.feature_encoders = self._build_feature_encoders(type_)
        self.class_encoder = preprocessing.LabelBinarizer().fit(["no", "yes"])

        num_features = self.X.shape[1]
        print("Number of features is {}".format(num_features))
        num_instances = self.X.shape[0]
        print("Number of instances is {}".format(num_instances))

        excluded = set(filter)
        columns = []
        for f_id in range(num_features):
            if f_id in excluded:
                continue

            encoder = self.feature_encoders[f_id]
            raw = self.X[:, f_id]
            if encoder is None:
                block = raw.astype(np.float32).reshape(-1, 1)
            elif isinstance(encoder, (list, tuple)):
                label_encoder, one_hot = encoder
                codes = label_encoder.transform(raw).reshape(-1, 1)
                block = one_hot.fit_transform(codes)
                # The encoder may hand back a sparse matrix; densify so the
                # blocks can be concatenated with the other columns.
                if hasattr(block, "toarray"):
                    block = block.toarray()
            else:
                block = encoder.transform(raw).reshape(-1, 1)
            columns.append(block)

        if not columns:
            raise ValueError("No features left to encode after applying the filter")

        # Every block is 2-D, so both encodings end up as one numeric matrix.
        self.X = np.concatenate(columns, -1)

        # apply max abs scaling (useful for 1-hot representations)
        if apply_scaling:
            self.scaler = MaxAbsScaler().fit(self.X)
            self.X = self.scaler.transform(self.X)

        # LabelBinarizer yields an (n, 1) column for two classes; flatten it.
        self.y = np.array(self.class_encoder.transform(self.y)).squeeze(-1)
        print("Dataset correctly preprocessed")

    def load(self, filename):
        """Read the comma-separated file ``filename`` into ``X`` and ``y``.

        Each kept line contributes all but its last field to ``X`` and its
        last field to ``y``. Lines whose first field is empty, or that contain
        a field equal to ``"?"``, are skipped.
        """
        print("Loading bank dataset from file {}".format(filename))
        features = []
        labels = []
        with open(filename) as in_file:
            for line in in_file:
                # e.g., 36,technician,married,tertiary,no,4596,yes,no,cellular,8,oct,234,2,175,2,success,yes
                values = line.strip().split(",")
                if not values[0] or "?" in values:
                    continue
                features.append(values[:-1])
                labels.append(values[-1])

        print("Dataset correctly loaded")
        self.X = np.array(features)
        self.y = np.array(labels)

    def report(self):
        """Print per-feature statistics and draw one histogram per feature.

        Numerical features get mean, standard deviation and variance;
        categorical features get their mode (the smallest label wins a tie).
        Columns are matched to ``feature_names`` by position, so this is meant
        to be called on the data as produced by ``load``.

        Raises:
            ValueError: if ``X`` or ``y`` has not been loaded.
        """
        if self.X is None or self.y is None:
            raise ValueError("Remember to call 'load' to load the dataset!")

        print("Num. examples: {}".format(self.X.shape[0]))
        print("Num. features: {}".format(self.X.shape[1]))

        described = list(zip(self.feature_names, self.feature_types))

        for f_id, (name, kind) in enumerate(described):
            column = self.X[:, f_id]
            print('')
            if kind == 'num':
                values = column.astype(np.float64)
                print("The mean of the column  ''" + str(name) + "''   is:")
                print(np.mean(values))
                print("The Standard Deviation is:")
                print(np.std(values))
                print("& The Variance is:")
                print(np.var(values))
                print('')
            else:
                # np.unique returns sorted labels and argmax the first maximum,
                # so ties resolve to the smallest label.
                labels, counts = np.unique(column, return_counts=True)
                print("The mode of the column  ''" + name + "''   is:")
                print(labels[np.argmax(counts)])

        for f_id, (name, kind) in enumerate(described):
            print('')
            plt.figure(figsize=(20, 8))
            plt.title('Histogram of ' + str(name))
            plt.xlabel('Categories')
            plt.ylabel('Frequency')
            column = self.X[:, f_id]
            # Numeric columns are still strings after ``load``; convert them
            # so they are binned by value instead of drawn as categories.
            plt.hist(column.astype(np.float64) if kind == 'num' else column)



def load_dataset(filename, preprocess_onehot=False, apply_scaling=False):
    """Build a ``BankDataset`` from ``filename``.

    The file is always loaded. The "one-hot" preprocessing is run only when
    ``preprocess_onehot`` is true, and ``apply_scaling`` is forwarded to it;
    without preprocessing ``apply_scaling`` has no effect.
    """
    bank = BankDataset()
    bank.load(filename)

    if not preprocess_onehot:
        return bank

    bank.preprocess("one-hot", apply_scaling=apply_scaling)
    return bank


# Simply calls report() on the corresponding dataset
def analyze(dataset, name=None):
    """Load the file ``dataset`` without preprocessing and print its report.

    ``name`` is accepted but not used.
    """
    load_dataset(filename=dataset).report()


def decision_trees(train_dataset, dev_dataset, test_dataset):
    """Fit depth-limited decision trees and report their accuracy.

    A tree is fitted on the training file for each ``max_depth`` in (1, 2)
    and its training accuracy is printed. When ``dev_dataset`` is given, each
    tree is also scored on it and the depth with the highest dev accuracy is
    kept (the smaller depth wins a tie). When ``test_dataset`` is given as
    well, the kept tree is scored on it.

    Args:
        train_dataset: path of the training file.
        dev_dataset: path of the development file, or a falsy value to skip
            the dev evaluation and the model selection.
        test_dataset: path of the test file, or a falsy value to skip the
            final test evaluation.
    """
    train = load_dataset(filename=train_dataset, preprocess_onehot=True)
    dev = load_dataset(filename=dev_dataset, preprocess_onehot=True) if dev_dataset else None

    best_model = None
    best_depth = None
    best_dev_accuracy = -1.0

    for depth in (1, 2):
        model = DecisionTreeClassifier(random_state=1, max_depth=depth)
        model.fit(train.X, train.y)
        print("The accuracy over train dataset is " + str(model.score(train.X, train.y) * 100) + " %")

        if dev is None:
            continue

        dev_accuracy = model.score(dev.X, dev.y)
        print("The accuracy over dev dataset with max_depth={} is {} %".format(depth, dev_accuracy * 100))
        if dev_accuracy > best_dev_accuracy:
            best_model, best_depth, best_dev_accuracy = model, depth, dev_accuracy

    # Without a dev set there is no basis for choosing a depth, so the test
    # set is left untouched in that case.
    if best_model is None or not test_dataset:
        return

    test = load_dataset(filename=test_dataset, preprocess_onehot=True)
    print("Best max_depth on dev dataset is {}".format(best_depth))
    print("The accuracy over test dataset is " + str(best_model.score(test.X, test.y) * 100) + " %")



def kNN(train_dataset, dev_dataset, test_dataset):
    """Fit a 1-nearest-neighbour classifier and report its accuracy.

    The classifier is fitted on the first file that is supplied, in the order
    train, dev, test, and then scored on every supplied file. Nothing happens
    when all three arguments are falsy.

    Args:
        train_dataset: path of the training file, or a falsy value.
        dev_dataset: path of the development file, or a falsy value.
        test_dataset: path of the test file, or a falsy value.
    """
    # TODO: tune k on the dev set; only k=1 is evaluated for now.
    splits = (
        ("train", train_dataset),
        ("dev", dev_dataset),
        ("test", test_dataset),
    )
    loaded = [
        (label, load_dataset(filename=path, preprocess_onehot=True))
        for label, path in splits
        if path
    ]
    if not loaded:
        return

    # The first supplied split is the fitting data.
    fit_data = loaded[0][1]
    knn = KNeighborsClassifier(n_neighbors=1, weights="uniform")
    knn.fit(fit_data.X, fit_data.y)

    for label, data in loaded:
        print("The accuracy over " + label + " dataset is " + str(knn.score(data.X, data.y) * 100) + " %")

