In [28]:
import pandas as pd
import numpy as np
from sklearn import *
from sklearn.preprocessing import LabelEncoder
import collections
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import RandomizedSearchCV
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import RFE


In [29]:
columns_name = ['age','workclass','fnlwgt','education','education_num','marital-status',
               'occupation','relationship','race','sex','capital-gain','capital-loss',
               'hours-per-week','native-country']
df = pd.read_csv('train-features.csv',names=columns_name,header=None,na_values=' ?')
df['income'] = pd.read_csv('train-output.csv') #Binary (0 means <=50K, 1 means >50K)
df.columns = df.columns.str.replace('-','_')

In [30]:
df['workclass'].unique()

array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
       ' Local-gov', nan, ' Self-emp-inc', ' Without-pay',
       ' Never-worked'], dtype=object)

## Classification

https://medium.com/datadriveninvestor/k-fold-cross-validation-6b8518070833

In [31]:
# df.native_country.unique()

In [45]:
class Classification:
    def __init__(self):
        
        self.model = None
        self.columns_name = ['age','workclass','fnlwgt','education','education-num',
                                       'marital-status','occupation','relationship','race','sex',
                                       'capital-gain','capital-loss','hours-per-week','native-country']
        self.train_data = pd.read_csv('train-features.csv',names=self.columns_name,header=None,na_values=' ?')
        self.train_data.columns = self.train_data.columns.str.replace('-','_')
        self.train_label = pd.read_csv('train-output.csv',names=['income'],header=None) #Binary (0 means <=50K, 1 means >50K)
#         self.train_label = self.train_label.values.ravel()
        self.test_data = pd.read_csv('test-features.csv',names=self.columns_name,header=None,na_values=' ?')
        self.test_data.columns = self.test_data.columns.str.replace('-','_')
        self.x_train = None
        self.x_test  = None
        self.y_train = None
        self.y_test  = None
    def preprocess(self,df,bins):
        '''
        Cleans df and performs feature engineering.
        '''
        
        for i in df.columns:
            
            df[i] = df[i].fillna(list(dict(df[i].dropna().value_counts()))[0]) # replace nan values with most commone values of it's column
        # category
        df['education'] = df['education'].str.replace('Preschool', 'dropout')
        df['education'] = df['education'].str.replace('10th', 'dropout')
        df['education'] = df['education'].str.replace('11th', 'dropout')
        df['education'] = df['education'].str.replace('12th', 'dropout')
        df['education'] = df['education'].str.replace('1st-4th', 'dropout')
        df['education'] = df['education'].str.replace('5th-6th', 'dropout')
        df['education'] = df['education'].str.replace('7th-8th', 'dropout')
        df['education'] = df['education'].str.replace('9th', 'dropout')
        df['education'] = df['education'].str.replace('HS-Grad', 'HighGrad')
        df['education'] = df['education'].str.replace('HS-grad', 'HighGrad')
        df['education'] = df['education'].str.replace('Some-college', 'CommunityCollege')
        df['education'] = df['education'].str.replace('Assoc-acdm', 'CommunityCollege')
        df['education'] = df['education'].str.replace('Assoc-voc', 'CommunityCollege')
        
        # [' Never-married', ' Married-civ-spouse', ' Divorced',' Married-spouse-absent', ' Separated', ' Married-AF-spouse',
        # ' Widowed']
        df['marital_status'] = df['marital_status'].str.replace('Never-married','notMarried')
        df['marital_status'] = df['marital_status'].str.replace('Married-spouse-absent','notMarried')
        df['marital_status'] = df['marital_status'].str.replace('Seperated','notMarried')
        df['marital_status'] = df['marital_status'].str.replace('Divorced','notMarried')
        df['marital_status'] = df['marital_status'].str.replace('Widowed','notMarried')
        df['marital_status'] = df['marital_status'].str.replace('Married-AF-spouse','Married')
        df['marital_status'] = df['marital_status'].str.replace('Married-civ-spouse','Married')
        
        
        
#         # df['workclass'] doesn't effect performance
#         '''
#         ' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
#        ' Local-gov', nan, ' Self-emp-inc', ' Without-pay',
#        ' Never-worked']
#         '''
#         df['workclass'] = df['workclass'].str.replace('Never-worked','LowIncome')
#         df['workclass'] = df['workclass'].str.replace('Without-pay','LowIncome')
#         df['workclass'] = df['workclass'].str.replace('State-gov','Govt')
#         df['workclass'] = df['workclass'].str.replace('Federal-gov','Govt')
#         df['workclass'] = df['workclass'].str.replace('Local-gov','Govt')
        
        
        # Binning
#         education_num = pd.cut(df.education_num,bins=[0,8,9,12,13,14,15,16],labels=[0,1,2,3,4,5,6])
#         df['education_num'] = education_num
#         CG = pd.cut(df.capital_gain,bins=[-1,7700,1000000],labels=[1,2]) # [low gain,high gain]
#         df['capital_gain'] = CG
        
#         CL = pd.cut(df.capital_loss, bins=[-1,1900,1000000],labels= [1,0]) # [low loss,high loss]
#         df['capital_loss'] = CL
        
        # Create categories for continuous values. Run it only once
#         age = pd.cut(df.age,bins=[0,18,27,50,100],labels=[0,1,2,3]) # ['HighGrad','Bachelor','Grad','others'] ==> https://en.wikipedia.org/wiki/Education_in_the_United_States
        
#         age = pd.cut(df.age,bins)  
        hours = pd.cut(df.hours_per_week,bins=[0,20,40,49,100],labels=['PT','FT','OverTime','TwoJobs'])
        df['hours_per_week'] = hours
#         replace old column with new column with category values
#         df['age'] = age
        
        return df
    def take_model(self,model):
        '''
        Takes in model.
        '''
        self.model = model
        
        
    def plain_training(self,df_data,df_label,test_df):
        '''
        It trains model once and returns accuracy score
        '''
        self.x_train,self.x_test, self.y_train, self.y_test = model_selection.train_test_split(df_data,df_label,test_size=0.2,random_state=2)
        self.x_train = self.normalize_data(self.x_train)
        self.x_test = self.normalize_data(self.x_test)
        test = self.normalize_data(test_df)
        clf = self.model()
        clf.fit(self.x_train,self.y_train)
        y_pred = clf.predict(test)
        
        return clf.score(self.x_test,self.y_test)
    
    def generate_submission(self, y_pred):
        '''
        Saves submission in the right format.
        '''
        pred_df = pd.DataFrame()
        pred_df['Id'] = np.arange(0,len(y_pred))
        pred_df['Category'] = y_pred
        pred_df.to_csv('submission.csv',index=False)
        
    def encode_df(self,df):
        '''
        Encode string values in dataframe into numbers.
        '''
        
        return df.apply(LabelEncoder().fit_transform)
    
    def normalize_data(self, X):
        # TO DO: Normalize the feature values of dataset X using the mean and standard deviation of the respective features 
        # min_max has is better than robustscaler
#         min_max_scaler = preprocessing.MinMaxScaler() 
#         X = min_max_scaler.fit_transform(X)
        
#         scaler = preprocessing.RobustScaler()
#         X = scaler.fit_transform(X)
        # scale is better than min_max
        X = preprocessing.scale(X)
        # quantile_transformer did better than scale
#         quantile_transformer = preprocessing.QuantileTransformer(random_state=0)
#         X = quantile_transformer.fit_transform(X)
        
        return X

    def panda_onehotencode(self,train,test,selected):
        '''
        One hot encode train and test data. Then make sure they have same column amount.
        # https://stackoverflow.com/questions/58101126/using-scikit-learn-onehotencoder-with-a-pandas-dataframe
        https://towardsdatascience.com/encoding-categorical-features-21a2651a065c
        
        returns train, test
        '''
        tmp = pd.get_dummies(train[selected],prefix_sep='_',columns=selected)
        print("train data shape: ",tmp.shape)

        tmp1 = pd.get_dummies(test[selected],prefix_sep='_',columns=selected)
        print("test data shape: ",tmp1.shape)

        final_train, final_test = tmp.align(tmp1, join='inner', axis=1)  # inner join
        print("final train data : {}\nfinal test data: {}".format(final_train.shape,final_test.shape))
        return final_train,final_test
    
    def kfold_cw(self,model,df_data,df_label,df_test,niter):
        scores = cross_val_score(model,df_data,df_label.values.ravel(),cv=niter)
        print("Accuracy score: ",scores)
#         pred  = cross_val_predict(model,df_data,df_label,cv=niter)
        print('fitting now')
        final_model = model.fit(df_data,df_label.values.ravel())
        print('predicting now')
        new_pred = model.predict(df_test)
        print('acc: ',np.mean(scores))
        return np.mean(scores),new_pred
    

In [46]:
clf = Classification()

# convert train label to dataframe
label = clf.train_label.values.ravel()
df_label = pd.DataFrame(data=label,columns=['income'])
df_label.shape

# train data
df_data = clf.train_data
df_data = clf.preprocess(df_data,10)


# test data
test_df = clf.test_data
print(df_data.shape,test_df.shape)
test_df = clf.preprocess(test_df,10)


(32561, 14) (16281, 14)


In [47]:
# print(df_data.head() )
categorical_feature_mask = df_data.dtypes==object # gets features with string or object value
categorical_feature_mask
# filter categorical columns using mask and turn it into a list
categorical_cols = df_data.columns[categorical_feature_mask].tolist()
categorical_cols.append('hours_per_week')
# categorical_cols.append('education_num')
# categorical_cols.append('capital_gain')
# categorical_cols.append('capital_loss')

# categorical_cols
non_categorical_cols = list(set(df_data.columns) - set(categorical_cols))

# encode categorical data
encoded = clf.encode_df(df_data[categorical_cols])
test_encoded = clf.encode_df(test_df[categorical_cols])

# encoded,test_encoded = clf.panda_onehotencode(df_data,test_df,categorical_cols)
print(encoded.shape,test_encoded.shape)
encoded

# min_max has is better than robustscaler
# min_max_scaler = preprocessing.MinMaxScaler() 
# X = min_max_scaler.fit_transform(X)

# scale is better than min_max
# scale = preprocessing.scale(df_data[non_categorical_cols])
# x_test = s
# quantile_transformer did better than scale
# quantile_transformer = preprocessing.QuantileTransformer(random_state=0)
# X = quantile_transformer.fit_transform(X)
quantile_transformer = preprocessing.QuantileTransformer(random_state=0)
X_train_trans = quantile_transformer.fit_transform(df_data[non_categorical_cols])
X_test_trans = quantile_transformer.transform(test_df[non_categorical_cols])

# Normalized continous data
# normalized = clf.normalize_data(df_data[non_categorical_cols])
normalized = pd.DataFrame(data=X_train_trans,columns=non_categorical_cols)

# test_normalized = clf.normalize_data(test_df[non_categorical_cols])
test_normalized = pd.DataFrame(data=X_test_trans,columns=non_categorical_cols)

# Concat both data
final_train = pd.concat([encoded,normalized],axis=1)
final_test = pd.concat([test_encoded,test_normalized],axis=1)
print(final_train.shape,final_test.shape)
final_test

(32561, 9) (16281, 9)
(32561, 14) (16281, 14)


Unnamed: 0,workclass,education,marital_status,occupation,relationship,race,sex,native_country,hours_per_week,fnlwgt,age,education_num,capital_loss,capital_gain
0,3,6,2,6,3,2,1,37,0,0.721506,0.183684,0.099600,0.0,0.000000
1,3,3,0,4,0,4,1,37,3,0.146294,0.525025,0.291792,0.0,0.000000
2,1,1,0,10,0,4,1,37,0,0.908988,0.259760,0.735736,0.0,0.000000
3,3,1,0,6,0,2,1,37,0,0.407489,0.670671,0.565065,0.0,0.969469
4,3,1,2,9,3,4,0,37,0,0.188438,0.020521,0.565065,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16276,3,0,2,9,1,4,0,37,0,0.685686,0.550551,0.834334,0.0,0.000000
16277,3,3,2,9,2,2,1,37,0,0.891671,0.955956,0.291792,0.0,0.000000
16278,3,0,0,9,0,4,1,37,3,0.946008,0.525025,0.834334,0.0,0.000000
16279,3,0,2,0,3,1,1,37,0,0.133620,0.670671,0.834334,0.0,0.954687


In [35]:
# final_train,final_test = clf.panda_onehotencode(final_train,final_test,selected)

In [48]:
# # normalize train and test data
# df_norm = clf.normalize_data(encoded_df[selected])
# df_test_norm = clf.normalize_data(encoded_test_df[selected])

# # convert from numpy to dataframe
# df_norm = pd.DataFrame(data=df_norm,columns=selected)
# df_test_norm = pd.DataFrame(data=df_test_norm,columns=selected)

# print(df_norm)
clf.take_model(ensemble.GradientBoostingClassifier)
clf.plain_training(final_train,label,final_test)

0.8484569322892677

In [37]:
# unique count of prediction
# clf.generate_submission(gradient.predict(df_test_norm))

## Running CV with different method and in the order accuracy score

Below k-fold CV uses 

`cross_val_score`

`cross_val_pred`

In [38]:
# Gradient Boosting
gradient = ensemble.GradientBoostingClassifier(n_estimators=200,learning_rate=0.2,min_samples_split=10,max_features='auto',min_samples_leaf=1)
gdb_score,gdb_pred = clf.kfold_cw(gradient,final_train,df_label,final_test,3)


Accuracy score:  [0.86797494 0.87221301 0.87579471]
fitting now
predicting now
acc:  0.8719942200943166


In [61]:
feature = list(final_train.columns)
acc = []
f_pred = []
feat = []
for i in range(8,13):
    # run RFE
    selector = RFE(gradient, i, step=1)
    selector = selector.fit(final_train, df_label.values.ravel())

    rank = selector.ranking_
    
    # selected those top tank features
    f_feature = []
    for r,f in zip(rank,feature):
        if(r == 1):
            f_feature.append(f)
    feat.append(f_feature)
    # train model on new selected features
    gradient = ensemble.GradientBoostingClassifier(n_estimators=200,learning_rate=0.2,min_samples_split=10,max_features='auto',min_samples_leaf=1)
    gdb_score,gdb_pred = clf.kfold_cw(gradient,final_train[f_feature],df_label,final_test[f_feature],3)
    acc.append(gdb_score)
    f_pred.append(gdb_pred)

data_info = pd.DataFrame()
data_info['feature'] = feat
data_info['f_pred'] = f_pred
data_info['acc'] = acc
for f in data_info.feature:
    print(f)
data_info

Accuracy score:  [0.86484245 0.86944905 0.8725698 ]
fitting now
predicting now
acc:  0.8689537672684838
Accuracy score:  [0.86705362 0.87138382 0.87468903]
fitting now
predicting now
acc:  0.8710421561644272
Accuracy score:  [0.86742215 0.87193661 0.87514973]
fitting now
predicting now
acc:  0.8715028299775255
Accuracy score:  [0.86668509 0.87037037 0.87625541]
fitting now
predicting now
acc:  0.8711036255578052
Accuracy score:  [0.86981758 0.86963331 0.87422832]
fitting now
predicting now
acc:  0.8712264058818245
['marital_status', 'occupation', 'hours_per_week', 'fnlwgt', 'age', 'education_num', 'capital_loss', 'capital_gain']
['workclass', 'marital_status', 'occupation', 'hours_per_week', 'fnlwgt', 'age', 'education_num', 'capital_loss', 'capital_gain']
['workclass', 'marital_status', 'occupation', 'relationship', 'hours_per_week', 'fnlwgt', 'age', 'education_num', 'capital_loss', 'capital_gain']
['workclass', 'marital_status', 'occupation', 'relationship', 'native_country', 'hours_

Unnamed: 0,feature,f_pred,acc
0,"[marital_status, occupation, hours_per_week, f...","[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, ...",0.868954
1,"[workclass, marital_status, occupation, hours_...","[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, ...",0.871042
2,"[workclass, marital_status, occupation, relati...","[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, ...",0.871503
3,"[workclass, marital_status, occupation, relati...","[0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, ...",0.871104
4,"[workclass, marital_status, occupation, relati...","[0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, ...",0.871226


In [66]:
def compare(pred):
    df = pd.read_csv('submission1.csv') #last submission
    mis_classified = 0
    for i in range(len(pred)):
#         if(df['Category'][i] == 1):
        if(df['Category'][i] != pred[i]):
            mis_classified += 1 # last best was 290 (505 with above if statement) mis_classified or close to this
    return mis_classified
compare(gdb_pred)

1235

In [67]:
for i in data_info.f_pred:
    print(compare(i))

634
545
505
1211
1235


In [59]:
np.unique(data_info.f_pred[3],return_counts=True)

(array([0, 1]), array([12706,  3575]))

In [60]:
clf.generate_submission(data_info.f_pred[3])

In [18]:
# AdaBoostClassifier
adaboost = ensemble.AdaBoostClassifier(n_estimators=200,learning_rate= 1)

ada_score,ada_pred = clf.kfold_cw(adaboost,final_train,df_label,final_test,3)


Accuracy score:  [0.86355261 0.86668509 0.86906846]
fitting now
final prediction
acc:  0.8664353869068341


In [27]:
feature = list(final_train.columns)
acc = []
f_pred = []
feat = []
for i in range(10,14):
    # run RFE
    selector = RFE(gradient, i, step=1)
    selector = selector.fit(final_train, df_label.values.ravel())

    rank = selector.ranking_
    
    # selected those top tank features
    f_feature = []
    for r,f in zip(rank,feature):
        if(r == 1):
            f_feature.append(f)
    feat.append(f_feature)
    # train model on new selected features
    adaboost = ensemble.AdaBoostClassifier(n_estimators=200,learning_rate= 1)

    ada_score,ada_pred = clf.kfold_cw(adaboost,final_train[f_feature],df_label,final_test[f_feature],3)
    acc.append(ada_score)
    f_pred.append(ada_pred)

data_info = pd.DataFrame()
data_info['feature'] = feat
data_info['f_pred'] = f_pred
data_info['acc'] = acc
for f in data_info.feature:
    print(f)
data_info

Accuracy score:  [0.86014373 0.86733002 0.86888418]
fitting now
final prediction
acc:  0.8654526406295525
Accuracy score:  [0.8615257  0.86650083 0.86943702]
fitting now
final prediction
acc:  0.8658211853394148
Accuracy score:  [0.86134144 0.86594804 0.86943702]
fitting now
final prediction
acc:  0.86557550018494
Accuracy score:  [0.86318408 0.8664087  0.86952916]
fitting now
final prediction
acc:  0.8663739797666742
['workclass', 'marital_status', 'occupation', 'relationship', 'hours_per_week', 'education_num', 'fnlwgt', 'age', 'capital_gain', 'capital_loss']
['workclass', 'marital_status', 'occupation', 'relationship', 'native_country', 'hours_per_week', 'education_num', 'fnlwgt', 'age', 'capital_gain', 'capital_loss']
['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'native_country', 'hours_per_week', 'education_num', 'fnlwgt', 'age', 'capital_gain', 'capital_loss']
['workclass', 'education', 'marital_status', 'occupation', 'relationship', 'sex', 'native_c

Unnamed: 0,feature,f_pred,acc
0,"[workclass, marital_status, occupation, relati...","[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, ...",0.865453
1,"[workclass, marital_status, occupation, relati...","[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...",0.865821
2,"[workclass, education, marital_status, occupat...","[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, ...",0.865576
3,"[workclass, education, marital_status, occupat...","[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, ...",0.866374


In [None]:
compare(ada_pred)

In [None]:
# last submission
f = pd.read_csv('submission.csv')
compare(f.Category)

In [None]:
# clf.generate_submission(gdb_pred)

In [None]:
# feature_scores1 = pd.Series(adaboost.feature_importances_, index=final_train.columns).sort_values(ascending=False)
# list(dict(feature_scores1[feature_scores1 > 0.009]))

# final_train = final_train[list(dict(feature_scores1[feature_scores1 > 0.009]))]
# final_train

# feature_scores = pd.Series(adaboost.feature_importances_, index=final_test.columns).sort_values(ascending=False)
# list(dict(feature_scores[feature_scores > 0.009]))

# final_test = final_test[list(dict(feature_scores[feature_scores > 0.009]))]
# final_test

In [None]:
# Random forest
rdmforest = ensemble.RandomForestClassifier(n_estimators=200,min_samples_split=2,min_samples_leaf=4,max_depth=10,bootstrap=True)

rdfst_score,rdfst_pred = clf.kfold_cw(rdmforest,final_train,df_label,final_test,3)
compare(rdfst_pred)

In [None]:
rdmforest = ensemble.RandomForestClassifier(n_estimators=200,bootstrap=False)

rdfst_score1,rdfst_pred1 = clf.kfold_cw(rdmforest,final_train,df_label,final_test,10)
compare(rdfst_pred1)

In [None]:
# extra tree classifier
extratree = ensemble.ExtraTreesClassifier(n_estimators=100, max_depth=None,min_samples_split=2, random_state=0)
extra_score, extra_pred = clf.kfold_cw(extratree,final_train,df_label,final_test,3)


In [None]:
# # Naive Bayes
# NB = naive_bayes.GaussianNB()
# nb_score,nb_pred = clf.kfold_cw(NB,final_train,df_label,final_test,10)


In [None]:
# SVC
from sklearn.svm import SVC
svc = SVC(gamma='auto')
svc_score,svc_pred = clf.kfold_cw(svc,final_train,df_label,final_test,3)


In [None]:
# clf.generate_submission(gdb_pred)

In [None]:
# Decision Tree
decisiontree = tree.DecisionTreeClassifier(max_depth=None,min_samples_split=2,random_state=0)
tree_score,tree_pred = clf.kfold_cw(decisiontree,final_train,df_label,final_test,10)


In [None]:
# Lasso
# lasso = linear_model.Lasso(alpha=0.7)
# lasso_score, lasso_pred = clf.kfold_cw(lasso,final_train,df_label,final_test,3)


In [None]:
bagging = ensemble.BaggingClassifier(n_estimators=10,base_estimator=SVC(gamma='auto'),random_state=0)
score,pred = clf.kfold_cw(bagging,final_train,df_label,final_test,3)

# accuracy of [0.82725263 0.83185922 0.83165945]

With normalization and 3 kfold cv, the random forest with n_estimators=1600,min_samples_split=2,min_samples_leaf=4,max_depth=10,bootstrap=True parameters 

gave this accuracy scores

[0.83618942 0.83812419 0.84059707]

And with minmaxscaler and svr model ```best_svr = SVR(kernel='rbf',gamma='auto') ```, we got

[0.85185185 0.85756403 0.86289505]

In [None]:
np.unique(ada_pred,return_counts=True)

In [None]:
np.unique(extra_pred,return_counts=True)

In [None]:
np.unique(rdfst_pred,return_counts=True)

In [None]:
np.unique(gdb_pred,return_counts=True)

https://stats.stackexchange.com/questions/411290/how-to-use-a-cross-validated-model-for-prediction

In [None]:
# clf.generate_submission(gradient.predict(final_test))

## Random Forest Classifier

In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 400, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
random_grid

In [None]:
score = []
pred = []
step_size = [0.1,0.2,0.3,0.5,0.6,0.7,0.8,0.9,1,1.5,1.8]
actual_pred = []
mis_classified = []
for n in range(2,10,3):
    print(n)
    gradient = ensemble.GradientBoostingClassifier(n_estimators=200,learning_rate=0.2,min_samples_split=10,max_features='auto',min_samples_leaf=1)
    gdb_score,gdb_pred = clf.kfold_cw(gradient,final_train,df_label,final_test,n)
    score.append(gdb_score)
    pred.append(np.unique(gdb_pred,return_counts=True))
    actual_pred.append(gdb_pred)
    mis_classified.append(compare(gdb_pred))
  
for i in range(0,5,2):
    print(i)

In [None]:
data_info = pd.DataFrame()
# data_info['n_estimators'] = random_grid['n_estimators']
# data_info['max_features'] = random_grid['max_features']
# data_info['current'] = random_grid['n_estimators']
# data_info['step_size'] = step_size
data_info['score'] = score
data_info['pred_count'] = pred
data_info['mis_classified'] = mis_classified
data_info['actual_pred'] = actual_pred
data_info

In [None]:
data_info[data_info.score == data_info['score'].max()]mis_classified

In [None]:
score1 = []
pred1 = []
step_size = [0.1,0.3,0.5,0.6,0.7,0.8,0.9,1,1.5,1.8]
for n,s in zip(random_grid['n_estimators'],step_size):
    gradient = ensemble.GradientBoostingClassifier(n_estimators=n,learning_rate=s)
    gdb_score,gdb_pred = clf.kfold_cw(gradient,final_train,df_label,final_test,3)
    print(gdb_score)
    score1.append(gdb_score)
    pred1.append(np.unique(gdb_pred,return_counts=True))


In [None]:
# # https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
# # Number of trees in random forest
# n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# # Number of features to consider at every split
# max_features = ['auto', 'sqrt']
# # Maximum number of levels in tree
# max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
# max_depth.append(None)
# # Minimum number of samples required to split a node
# min_samples_split = [2, 5, 10]
# # Minimum number of samples required at each leaf node
# min_samples_leaf = [1, 2, 4]
# # Method of selecting samples for training each tree
# bootstrap = [True, False]

# # Create the random grid
# random_grid = {'n_estimators': n_estimators,
#                'max_features': max_features,
#                'max_depth': max_depth,
#                'min_samples_split': min_samples_split,
#                'min_samples_leaf': min_samples_leaf,
#                'bootstrap': bootstrap}

# # Use the random grid to search for best hyperparameters
# # First create the base model to tune
# rf = RandomForestRegressor()
# # Random search of parameters, using 3 fold cross validation, 
# # search across 100 different combinations, and use all available cores
# rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# # Fit the random search model
# rf_random.fit(final_train, df_label)