In [3]:
import pandas as pd
import numpy as np
from sklearn import *
from sklearn.preprocessing import LabelEncoder
import collections
from sklearn.metrics import accuracy_score

In [4]:
columns_name = ['age','workclass','fnlwgt','education','education-num','marital-status',
               'occupation','relationship','race','sex','capital-gain','capital-loss',
               'hours-per-week','native-country']
df = pd.read_csv('train-features.csv',names=columns_name,header=None,na_values=' ?')
df['income'] = pd.read_csv('train-output.csv') #Binary (0 means <=50K, 1 means >50K)

## Classification

https://medium.com/datadriveninvestor/k-fold-cross-validation-6b8518070833

In [171]:
class Classification:
    def __init__(self):
        
        self.model = None
        self.columns_name = ['age','workclass','fnlwgt','education','education-num',
                                       'marital-status','occupation','relationship','race','sex',
                                       'capital-gain','capital-loss','hours-per-week','native-country']
        self.train_data = pd.read_csv('train-features.csv',names=self.columns_name,header=None,na_values=' ?')
        self.train_label = pd.read_csv('train-output.csv',names=['income'],header=None) #Binary (0 means <=50K, 1 means >50K)
        self.train_label = self.train_label.values.ravel()
        self.test_data = pd.read_csv('test-features.csv',names=self.columns_name,header=None,na_values=' ?')
        
        self.x_train = None
        self.x_test  = None
        self.y_train = None
        self.y_test  = None
    def preprocess(self,df):
        '''
        Cleans df and performs feature engineering.
        '''
        # replace nan values with random values of it's column's value
        for i in self.columns_name:
            df[i] = df[i].fillna(np.random.choice(df[i].dropna().unique()))

        # category
        df['education'] = df['education'].str.replace('Preschool', 'dropout')
        df['education'] = df['education'].str.replace('10th', 'dropout')
        df['education'] = df['education'].str.replace('11th', 'dropout')
        df['education'] = df['education'].str.replace('12th', 'dropout')
        df['education'] = df['education'].str.replace('1st-4th', 'dropout')
        df['education'] = df['education'].str.replace('5th-6th', 'dropout')
        df['education'] = df['education'].str.replace('7th-8th', 'dropout')
        df['education'] = df['education'].str.replace('9th', 'dropout')
        df['education'] = df['education'].str.replace('HS-Grad', 'HighGrad')
        df['education'] = df['education'].str.replace('HS-grad', 'HighGrad')
        df['education'] = df['education'].str.replace('Some-college', 'CommunityCollege')
        df['education'] = df['education'].str.replace('Assoc-acdm', 'CommunityCollege')
        df['education'] = df['education'].str.replace('Assoc-voc', 'CommunityCollege')
        
        # [' Never-married', ' Married-civ-spouse', ' Divorced',' Married-spouse-absent', ' Separated', ' Married-AF-spouse',
        # ' Widowed']
        df['marital-status'] = df['marital-status'].str.replace('Never-married','notMarried')
        df['marital-status'] = df['marital-status'].str.replace('Seperated','notMarried')
        df['marital-status'] = df['marital-status'].str.replace('Divorced','Seperated')
        df['marital-status'] = df['marital-status'].str.replace('Seperated','Seperated')
        df['marital-status'] = df['marital-status'].str.replace('Married-AF-spouse','Married')
        df['marital-status'] = df['marital-status'].str.replace('Married-civ-spouse','Married')
        df['marital-status'] = df['marital-status'].str.replace('Married-spouse-absent','Married')
        
        # Binning
        
        df.loc[df['education-num']  < 9,'education-num']   = 0 # dropout
        df.loc[df['education-num'] == 9,'education-num']  = 1 # high school
        df.loc[df['education-num'] == 10,'education-num'] = 2 # Community College
        df.loc[df['education-num'] == 11,'education-num'] = 2 # Community College
        df.loc[df['education-num'] == 12,'education-num'] = 2 # Community College
        df.loc[df['education-num'] == 13,'education-num'] = 3 # Bachelor
        df.loc[df['education-num'] == 14,'education-num'] = 4 # Master
        df.loc[df['education-num'] == 15,'education-num'] = 5 # Prof-school
        df.loc[df['education-num'] == 16,'education-num'] = 6 # Doctorate
        
        # binary
        df.loc[df['capital-gain'] >= 2000,'capital-gain'] = 1
        df.loc[df['capital-gain']  < 2000,'capital-gain'] = 0
        
        df.loc[df['capital-loss']  <= 600,'capital-loss'] = 1
        df.loc[df['capital-loss']  > 600, 'capital-loss'] = 0
        
        
        return df
    def take_model(self,model):
        '''
        Takes in model.
        '''
        self.model = model
    def split_data(self,df):
        
        self.x_train,self.x_test, self.y_train, self.y_test = model_selection.train_test_split(df_data,df_label,test_size=0.2,random_state=2)

        
    def plain_training(self,df_data,df_label,test_df):
        '''
        It trains model once and returns accuracy score
        '''
        self.x_train,self.x_test, self.y_train, self.y_test = model_selection.train_test_split(df_data,df_label,test_size=0.2,random_state=2)
        self.x_train = self.normalize_data(self.x_train)
        self.x_test = self.normalize_data(self.x_test)
        test = self.normalize_data(test_df)
        clf = self.model()
        clf.fit(self.x_train,self.y_train)
        y_pred = clf.predict(test)
        
        return clf.score(self.x_test,self.y_test)
    def generate_submission(self, y_pred):
        '''
        Saves submission in the right format.
        '''
        pred_df = pd.DataFrame()
        pred_df['Id'] = np.arange(0,len(y_pred))
        pred_df['Category'] = y_pred
        pred_df.to_csv('submission.csv',index=False)
        
    def encode_df(self,df):
        '''
        Encode string values in dataframe into numbers.
        '''
        
        return df.apply(LabelEncoder().fit_transform)
    def normalize_data(self, X):
        # TO DO: Normalize the feature values of dataset X using the mean and standard deviation of the respective features 
    
        return preprocessing.scale(X)
    def one_hotencoder(self,df):
        ''' Input data should be in number (encoded)'''
        from sklearn.preprocessing import OneHotEncoder
        onehotencoder = OneHotEncoder()
        data = onehotencoder.fit_transform(df).toarray()
        
        return data
    
    def kbins(self,df,bins):
        from sklearn.preprocessing import KBinsDiscretizer
        est = KBinsDiscretizer(n_bins=bins,encode='onehot-dense',strategy='kmeans')
        est.fit(df)
        Xt = est.transform(df)
        return Xt
    def kfold_cw(self,model,df_data,df_label,niter):
        from sklearn.model_selection import cross_val_score, cross_val_predict
        score = cross_val_score(model,df_data,df_label,cv=niter)
        pred  = cross_val_predict(model,df_data,df_label,cv=niter)
        
        return np.mean(score),pred
    def randomforest_cw(self,train_df,test_df):
        '''
        Cross-validation training for Random Forest method.
        '''
        from sklearn.model_selection import KFold
        
        kfold = KFold(n_splits=5)
        n_estimators = [int(x) for x in np.linspace(200,2000,10)]
        for i in n_estimators:
            # resample train and test data. Normalize test data in this step as well.
            self.x_train,self.x_test, self.y_train, self.y_test = model_selection.train_test_split(train_df[self.columns_name],train_df['income'],test_size=0.2,random_state=2)
            self.x_train = self.normalize_data(self.x_train)
            self.x_test = self.normalize_data(self.x_test)
            
            # k-fold split again
            
            # train and fit on random forest method
            randomforest = ensemble.RandomForestClassifier(max_depth=20, random_state=0)
            randomforest.fit(self.x_train,self.y_train)
            
            # store n_estimator and it's accuracy value
            
        
        test = self.normalize_data(test_df[selected])
        
        y_pred = randomforest.predict(test)
        return y_pred, randomforest.score(self.x_test,self.y_test)

In [172]:
# Naive Bayes
# clf.take_model(naive_bayes.GaussianNB)
# clf.plain_training(encoded_df,encoded_test_df,test_columns)
# x_train,x_test, y_train, y_test = model_selection.train_test_split(df_norm,clf.train_label,test_size=0.2,random_state=2)

# NB = naive_bayes.GaussianNB()
# NB.fit(x_train,y_train)
# y_pred = NB.predict(x_test)
# accuracy_score(y_test,y_pred)

In [173]:
clf = Classification()
df_data = clf.train_data
df_label = clf.train_label
df = clf.preprocess(df_data)

test_df = clf.test_data
test_df = clf.preprocess(test_df)

encoded_df = clf.encode_df(df_data)
encoded_test_df = clf.encode_df(test_df)

# kbins = clf.kbins(encoded_df,5)
test_columns = ['age','workclass','fnlwgt','education-num','marital-status',
               'occupation','relationship','race','sex','capital-gain','capital-loss',
               'hours-per-week']
encoded_df.head()
# print(encoded_df.head())
# print(clf.train_label.head())
df_norm = clf.normalize_data(encoded_df)
df_test_norm = clf.normalize_data(encoded_test_df)
df[:10]
clf.take_model(ensemble.RandomForestClassifier)
# clf.model
clf.plain_training(df_norm,clf.train_label,df_test_norm)



0.8134500230308613

In [177]:
clf.train_label.shape

(32561,)

In [165]:
from sklearn.model_selection import cross_val_score, cross_val_predict
# {'n_estimators': 1600,
#  'min_samples_split': 2,
#  'min_samples_leaf': 4,
#  'max_features': 'sqrt',
#  'max_depth': 10,
#  'bootstrap': True}
rdmforest = ensemble.RandomForestClassifier(n_estimators=1600,min_samples_split=2,min_samples_leaf=4,max_depth=10,bootstrap=True)

score = cross_val_score(rdmforest,df_norm,df_label,cv=3)
print('prediction')
pred  = cross_val_predict(rdmforest,df_norm,df_label,cv=3)

prediction


In [167]:
print(score)
np.unique(pred,return_counts=True)

[0.83618942 0.83812419 0.84059707]


(array([0, 1]), array([26802,  5759]))

https://stats.stackexchange.com/questions/411290/how-to-use-a-cross-validated-model-for-prediction

In [179]:
final_model = rdmforest.fit(df_norm,df_label)
new_pred = rdmforest.predict(df_test_norm)


In [182]:
new_pred.shape
clf.generate_submission(new_pred)

In [183]:
np.unique(new_pred,return_counts=True)

(array([0, 1]), array([13466,  2815]))

In [129]:
from sklearn.svm import SVR
best_svr = SVR(kernel='rbf',gamma='auto')
clf.kfold_cw(best_svr,df_norm,clf.train_label,3)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(0.30673797398967845,
 array([0.03965261, 0.22632972, 0.03198692, ..., 0.04392514, 0.07944159,
        0.26698445]))

In [86]:
# Random forest parameter
# https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
from sklearn.model_selection import RandomizedSearchCV

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# now we instantiate the random search and fit it like any Sklearn model

# Use the random grid to search for best hyperparameters
# First create the base model to tune
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(encoded_df[:100][columns_name], encoded_df[:100]['income'])
rf_random.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   20.3s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  2.7min finished


{'n_estimators': 1600,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 10,
 'bootstrap': True}

In [87]:
clf.split_data(encoded_df)
print(clf.x_test.shape)
clf.y_test.shape

(6033, 14)


(6033,)

In [88]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy,predictions

best_random = rf_random.best_estimator_
test_features = clf.x_test
test_features = clf.encode_df(test_features)
test_labels = clf.y_test
# test_labels = clf.encode_df(test_labels)
random_accuracy,pred = evaluate(best_random, test_features, test_labels)


Model Performance
Average Error: 0.2458 degrees.
Accuracy = -inf%.


In [92]:
pred = best_random.predict(encoded_test_df)

In [93]:
pred.shape

(16281,)

In [28]:
# %%time
from sklearn.svm import SVC
clf.take_model(SVC)
clf.plain_training(encoded_df,encoded_test_df)


In [29]:
clf.take_model(linear_model.Lasso)
clf.plain_training(encoded_df,encoded_test_df,test_columns)


-0.00010095938492371559

## Random Forest Classifier

In [31]:
# Number of 
randomforest = ensemble.RandomForestClassifier(max_depth=3, random_state=0)
randomforest.get_params()

{'bootstrap': True,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': 3,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 'warn',
 'n_jobs': None,
 'oob_score': False,
 'random_state': 0,
 'verbose': 0,
 'warm_start': False}

## Perceptron

In [41]:
from sklearn.linear_model import Perceptron
model = Perceptron(tol=1e-3,random_state=0,penalty='l2',max_iter=20)
model.fit(clf.x_train,clf.y_train)
model.score(clf.x_test,clf.y_test)

0.5701972484667661

In [42]:
model.score(clf.x_test,clf.y_test)

0.5701972484667661