In [3]:
import pandas as pd
import numpy as np
from sklearn import *
from sklearn.preprocessing import LabelEncoder
import collections
from sklearn.metrics import accuracy_score

In [4]:
columns_name = ['age','workclass','fnlwgt','education','education-num','marital-status',
               'occupation','relationship','race','sex','capital-gain','capital-loss',
               'hours-per-week','native-country']
df = pd.read_csv('train-features.csv',names=columns_name,header=None,na_values=' ?')
df['income'] = pd.read_csv('train-output.csv') #Binary (0 means <=50K, 1 means >50K)

## Classification

https://medium.com/datadriveninvestor/k-fold-cross-validation-6b8518070833

In [70]:
class Classification:
    def __init__(self):
        
        self.model = None
        self.columns_name = ['age','workclass','fnlwgt','education','education-num',
                                       'marital-status','occupation','relationship','race','sex',
                                       'capital-gain','capital-loss','hours-per-week','native-country']
        self.train_data = pd.read_csv('train-features.csv',names=self.columns_name,header=None,na_values=' ?')
        self.train_data['income'] = pd.read_csv('train-output.csv') #Binary (0 means <=50K, 1 means >50K)
        self.train_data = self.train_data.dropna()
        self.test_data = pd.read_csv('test-features.csv',names=self.columns_name,header=None,na_values=' ?')
        
        self.x_train = None
        self.x_test  = None
        self.y_train = None
        self.y_test  = None
    def preprocess(self,df):
        '''
        Cleans df and performs feature engineering.
        '''
        # replace nan values with random values of it's column's value
        for i in self.columns_name:
            df[i] = df[i].fillna(np.random.choice(df[i].dropna().unique()))

        # category
        df['education'] = df['education'].str.replace('Preschool', 'dropout')
        df['education'] = df['education'].str.replace('10th', 'dropout')
        df['education'] = df['education'].str.replace('11th', 'dropout')
        df['education'] = df['education'].str.replace('12th', 'dropout')
        df['education'] = df['education'].str.replace('1st-4th', 'dropout')
        df['education'] = df['education'].str.replace('5th-6th', 'dropout')
        df['education'] = df['education'].str.replace('7th-8th', 'dropout')
        df['education'] = df['education'].str.replace('9th', 'dropout')
        df['education'] = df['education'].str.replace('HS-Grad', 'HighGrad')
        df['education'] = df['education'].str.replace('HS-grad', 'HighGrad')
        df['education'] = df['education'].str.replace('Some-college', 'CommunityCollege')
        df['education'] = df['education'].str.replace('Assoc-acdm', 'CommunityCollege')
        df['education'] = df['education'].str.replace('Assoc-voc', 'CommunityCollege')
        
        # [' Never-married', ' Married-civ-spouse', ' Divorced',' Married-spouse-absent', ' Separated', ' Married-AF-spouse',
        # ' Widowed']
        df['marital-status'] = df['marital-status'].str.replace('Never-married','notMarried')
        df['marital-status'] = df['marital-status'].str.replace('Seperated','notMarried')
        df['marital-status'] = df['marital-status'].str.replace('Divorced','Seperated')
        df['marital-status'] = df['marital-status'].str.replace('Seperated','Seperated')
        df['marital-status'] = df['marital-status'].str.replace('Married-AF-spouse','Married')
        df['marital-status'] = df['marital-status'].str.replace('Married-civ-spouse','Married')
        df['marital-status'] = df['marital-status'].str.replace('Married-spouse-absent','Married')
        
        # Binning
        
        df.loc[df['education-num']  < 9,'education-num']   = 0 # dropout
        df.loc[df['education-num'] == 9,'education-num']  = 1 # high school
        df.loc[df['education-num'] == 10,'education-num'] = 2 # Community College
        df.loc[df['education-num'] == 11,'education-num'] = 2 # Community College
        df.loc[df['education-num'] == 12,'education-num'] = 2 # Community College
        df.loc[df['education-num'] == 13,'education-num'] = 3 # Bachelor
        df.loc[df['education-num'] == 14,'education-num'] = 4 # Master
        df.loc[df['education-num'] == 15,'education-num'] = 5 # Prof-school
        df.loc[df['education-num'] == 16,'education-num'] = 6 # Doctorate
        
        # binary
        df.loc[df['capital-gain'] >= 2000,'capital-gain'] = 1
        df.loc[df['capital-gain']  < 2000,'capital-gain'] = 0
        
        df.loc[df['capital-loss']  <= 600,'capital-loss'] = 1
        df.loc[df['capital-loss']  > 600, 'capital-loss'] = 0
        
        return df
    def take_model(self,model):
        '''
        Takes in model.
        '''
        self.model = model
    def split_data(self,df):
        
        self.x_train,self.x_test, self.y_train, self.y_test = model_selection.train_test_split(df[self.columns_name],df['income'],test_size=0.2,random_state=2)

        
    def plain_training(self,train_df,test_df,selected):
        '''
        It trains model once and returns accuracy score
        '''
        self.x_train,self.x_test, self.y_train, self.y_test = model_selection.train_test_split(train_df[selected],train_df['income'],test_size=0.2,random_state=2)
        self.x_train = self.normalize_data(self.x_train)
        self.x_test = self.normalize_data(self.x_test)
        test = self.normalize_data(test_df[selected])
        clf = self.model()
        clf.fit(self.x_train,self.y_train)
        y_pred = clf.predict(test)
        
        return clf.score(self.x_test,self.y_test)
    def generate_submission(self, y_pred):
        '''
        Saves submission in the right format.
        '''
        pred_df = pd.DataFrame()
        pred_df['Id'] = np.arange(0,len(test_df[self.columns_name]))
        pred_df['Category'] = y_pred
        pred_df.to_csv('submission.csv',index=False)
        
    def encode_df(self,df):
        '''
        Encode string values in dataframe into numbers.
        '''
        
        return df.apply(LabelEncoder().fit_transform)
    def normalize_data(self, X):
        # TO DO: Normalize the feature values of dataset X using the mean and standard deviation of the respective features 
    
        return preprocessing.scale(X)
    def one_hotencoder(self,df):
        ''' Input data should be in number (encoded)'''
        from sklearn.preprocessing import OneHotEncoder
        onehotencoder = OneHotEncoder()
        data = onehotencoder.fit_transform(df).toarray()
        
        return data
    
    def kbins(self,df,bins):
        from sklearn.preprocessing import KBinsDiscretizer
        est = KBinsDiscretizer(n_bins=bins,encode='onehot-dense',strategy='kmeans')
        est.fit(df)
        Xt = est.transform(df)
        return Xt
    def kfold_cw(self,model,train_df,test_df,niter):
        from sklearn.model_selection import cross_val_score, cross_val_predict
        score = cross_val_score(model,train_df,test_df,cv=niter)
        pred  = cross_val_predict(model,train_df,test_df,cv=niter)
        return model,np.mean(score),pred
    def randomforest_cw(self,train_df,test_df):
        '''
        Cross-validation training for Random Forest method.
        '''
        from sklearn.model_selection import KFold
        
        kfold = KFold(n_splits=5)
        n_estimators = [int(x) for x in np.linspace(200,2000,10)]
        for i in n_estimators:
            # resample train and test data. Normalize test data in this step as well.
            self.x_train,self.x_test, self.y_train, self.y_test = model_selection.train_test_split(train_df[self.columns_name],train_df['income'],test_size=0.2,random_state=2)
            self.x_train = self.normalize_data(self.x_train)
            self.x_test = self.normalize_data(self.x_test)
            
            # k-fold split again
            
            # train and fit on random forest method
            randomforest = ensemble.RandomForestClassifier(max_depth=20, random_state=0)
            randomforest.fit(self.x_train,self.y_train)
            
            # store n_estimator and it's accuracy value
            
        
        test = self.normalize_data(test_df[selected])
        
        y_pred = randomforest.predict(test)
        return y_pred, randomforest.score(self.x_test,self.y_test)

In [71]:
clf = Classification()
df = clf.train_data
# df = clf.preprocess(df)

test_df = clf.test_data
test_df = clf.preprocess(test_df)

encoded_df = clf.encode_df(df)
encoded_test_df = clf.encode_df(test_df)

# kbins = clf.kbins(encoded_df,5)
test_columns = ['age','workclass','fnlwgt','education-num','marital-status',
               'occupation','relationship','race','sex','capital-gain','capital-loss',
               'hours-per-week']
# encoded_df.head()
# clf.take_model(ensemble.RandomForestClassifier)
# clf.model
# clf.plain_training(encoded_df,encoded_test_df,test_columns)

In [73]:
encoded_df[:10]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,22,5,2491,9,12,4,0,1,4,1,24,0,39,38,0
1,33,4,2727,9,12,2,3,0,4,1,0,0,12,38,0
2,21,2,13188,11,8,0,5,1,4,1,0,0,39,38,0
3,36,2,14354,1,6,2,5,0,2,1,0,0,39,38,0
4,11,2,18120,9,12,2,9,5,2,0,0,0,39,4,0
5,20,2,16567,12,13,2,3,5,4,0,0,0,39,38,0
6,32,2,7982,6,4,3,7,1,2,0,0,0,15,22,1
7,35,4,12746,11,8,2,3,0,4,1,0,0,44,38,1
8,14,2,1225,12,13,4,9,1,4,0,104,0,49,38,1
9,25,2,7908,9,12,2,3,0,4,1,78,0,39,38,1


In [None]:
# Random forest parameter
# https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# now we instantiate the random search and fit it like any Sklearn model

# Use the random grid to search for best hyperparameters
# First create the base model to tune
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(encoded_df[:100][columns_name], encoded_df[:100]['income'])
rf_random.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   20.3s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.4min


In [77]:
clf.split_data(encoded_df)
print(clf.x_test.shape)
clf.y_test.shape

(6033, 14)


(6033,)

In [84]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy,predictions

best_random = rf_random.best_estimator_
test_features = clf.x_test
test_features = clf.encode_df(test_features)
test_labels = clf.y_test
# test_labels = clf.encode_df(test_labels)
random_accuracy,pred = evaluate(best_random, test_features, test_labels)


Model Performance
Average Error: 0.3926 degrees.
Accuracy = -inf%.


In [85]:
pred

array([0.15496905, 0.37962651, 0.30631676, ..., 0.36953456, 0.31191919,
       0.29294345])

In [28]:
# %%time
# from sklearn.svm import SVC
# clf.take_model(SVC)
# clf.plain_training(encoded_df,encoded_test_df)


In [29]:
clf.take_model(linear_model.Lasso)
clf.plain_training(encoded_df,encoded_test_df,test_columns)


-0.00010095938492371559

In [30]:
# Naive Bayes
clf.take_model(naive_bayes.GaussianNB)
clf.plain_training(encoded_df,encoded_test_df,test_columns)
# NB = naive_bayes.GaussianNB()
# NB.fit(x_train,y_train)
# y_pred = NB.predict(x_test)
# accuracy_score(y_test,y_pred)

0.7566716393170894

## Random Forest Classifier

In [31]:
# Number of 
randomforest = ensemble.RandomForestClassifier(max_depth=3, random_state=0)
randomforest.get_params()

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 3,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 'warn',
 'n_jobs': None,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}

## Perceptron

In [41]:
from sklearn.linear_model import Perceptron
model = Perceptron(tol=1e-3,random_state=0,penalty='l2',max_iter=20)
model.fit(clf.x_train,clf.y_train)
model.score(clf.x_test,clf.y_test)

0.5701972484667661

In [42]:
model.score(clf.x_test,clf.y_test)

0.5701972484667661