In [419]:
import pandas as pd
import numpy as np
from sklearn import *
from sklearn.preprocessing import LabelEncoder
import collections
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.model_selection import RandomizedSearchCV
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder


In [420]:
columns_name = ['age','workclass','fnlwgt','education','education_num','marital-status',
               'occupation','relationship','race','sex','capital-gain','capital-loss',
               'hours-per-week','native-country']
df = pd.read_csv('train-features.csv',names=columns_name,header=None,na_values=' ?')
df['income'] = pd.read_csv('train-output.csv') #Binary (0 means <=50K, 1 means >50K)
df.columns = df.columns.str.replace('-','_')

In [421]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0.0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0.0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0.0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0.0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0.0


## Classification

https://medium.com/datadriveninvestor/k-fold-cross-validation-6b8518070833

In [503]:
class Classification:
    def __init__(self):
        
        self.model = None
        self.columns_name = ['age','workclass','fnlwgt','education','education_num',
                                       'marital_status','occupation','relationship','race','sex',
                                       'capital_gain','capital_loss','hours_per_week','native_country']
        self.train_data = pd.read_csv('train-features.csv',names=self.columns_name,header=None,na_values=' ?')
        self.train_label = pd.read_csv('train-output.csv',names=['income'],header=None) #Binary (0 means <=50K, 1 means >50K)
#         self.train_label = self.train_label.values.ravel()
        self.test_data = pd.read_csv('test-features.csv',names=self.columns_name,header=None,na_values=' ?')
        
        self.x_train = None
        self.x_test  = None
        self.y_train = None
        self.y_test  = None
    def preprocess(self,df,bins):
        '''
        Cleans df and performs feature engineering.
        '''
        # replace nan values with most commone values of it's column
        for i in self.columns_name:
            df[i] = df[i].fillna(list(dict(df_data['education'].dropna().value_counts()))[0])

        # category
        df['education'] = df['education'].str.replace('Preschool', 'dropout')
        df['education'] = df['education'].str.replace('10th', 'dropout')
        df['education'] = df['education'].str.replace('11th', 'dropout')
        df['education'] = df['education'].str.replace('12th', 'dropout')
        df['education'] = df['education'].str.replace('1st-4th', 'dropout')
        df['education'] = df['education'].str.replace('5th-6th', 'dropout')
        df['education'] = df['education'].str.replace('7th-8th', 'dropout')
        df['education'] = df['education'].str.replace('9th', 'dropout')
        df['education'] = df['education'].str.replace('HS-Grad', 'HighGrad')
        df['education'] = df['education'].str.replace('HS-grad', 'HighGrad')
        df['education'] = df['education'].str.replace('Some-college', 'CommunityCollege')
        df['education'] = df['education'].str.replace('Assoc-acdm', 'CommunityCollege')
        df['education'] = df['education'].str.replace('Assoc-voc', 'CommunityCollege')
        
        # [' Never-married', ' Married-civ-spouse', ' Divorced',' Married-spouse-absent', ' Separated', ' Married-AF-spouse',
        # ' Widowed']
        df['marital_status'] = df['marital_status'].str.replace('Never-married','notMarried')
        df['marital_status'] = df['marital_status'].str.replace('Seperated','notMarried')
        df['marital_status'] = df['marital_status'].str.replace('Divorced','Seperated')
        df['marital_status'] = df['marital_status'].str.replace('Seperated','Seperated')
        df['marital_status'] = df['marital_status'].str.replace('Married-AF-spouse','Married')
        df['marital_status'] = df['marital_status'].str.replace('Married-civ-spouse','Married')
        df['marital_status'] = df['marital_status'].str.replace('Married-spouse-absent','Married')
        
        # Binning
        education_num = pd.cut(df.education_num,bins=[0,8,9,12,13,14,15,16],labels=[0,1,2,3,4,5,6])
        df['education_num'] = education_num
        
        # Create categories for continuous values. Run it only once
        age = pd.cut(df.age,bins)
        hours = pd.cut(df.hours_per_week,bins=[0,20,40,100],labels=['PT','FT','OverTime'])
        CG = pd.cut(df.capital_gain,bins)
        CL = pd.cut(df.capital_loss,bins)
        fnlwgt = pd.cut(df.fnlwgt,bins)
        
        # replace old column with new column with category values
        df['age'] = age
        df['hours_per_week'] = hours
        df['capital_gain'] = CG
        df['capital_loss'] = CL
        df['fnlwgt'] = fnlwgt
        
        return df
    def take_model(self,model):
        '''
        Takes in model.
        '''
        self.model = model
        
    def split_data(self,df_data,df_label):
        
        self.x_train,self.x_test, self.y_train, self.y_test = model_selection.train_test_split(df_data,df_label,test_size=0.2,random_state=2)

        
    def plain_training(self,df_data,df_label,test_df):
        '''
        It trains model once and returns accuracy score
        '''
        self.x_train,self.x_test, self.y_train, self.y_test = model_selection.train_test_split(df_data,df_label,test_size=0.2,random_state=2)
        self.x_train = self.normalize_data(self.x_train)
        self.x_test = self.normalize_data(self.x_test)
        test = self.normalize_data(test_df)
        clf = self.model()
        clf.fit(self.x_train,self.y_train)
        y_pred = clf.predict(test)
        
        return clf.score(self.x_test,self.y_test)
    
    def generate_submission(self, y_pred):
        '''
        Saves submission in the right format.
        '''
        pred_df = pd.DataFrame()
        pred_df['Id'] = np.arange(0,len(y_pred))
        pred_df['Category'] = y_pred
        pred_df.to_csv('submission.csv',index=False)
        
    def encode_df(self,df):
        '''
        Encode string values in dataframe into numbers.
        '''
        
        return df.apply(LabelEncoder().fit_transform)
    
    def normalize_data(self, X):
        # TO DO: Normalize the feature values of dataset X using the mean and standard deviation of the respective features 
#         min_max_scaler = preprocessing.MinMaxScaler() 
#         X = min_max_scaler.fit_transform(X)
        
        scaler = preprocessing.RobustScaler()
        X = scaler.fit_transform(X)
        return X
    
    def kbins(self,df,bins):
        from sklearn.preprocessing import KBinsDiscretizer
        est = KBinsDiscretizer(n_bins=bins,encode='onehot-dense',strategy='kmeans')
        est.fit(df)
        Xt = est.transform(df)
        
        return Xt
    def panda_onehotencode(self,train,test,selected):
        '''
        One hot encode train and test data. Then make sure they have same column amount.
        # https://stackoverflow.com/questions/58101126/using-scikit-learn-onehotencoder-with-a-pandas-dataframe
        https://towardsdatascience.com/encoding-categorical-features-21a2651a065c
        
        returns train, test
        '''
        tmp = pd.get_dummies(train[selected],prefix_sep='_',columns=selected)
        print("train data shape: ",tmp.shape)

        tmp1 = pd.get_dummies(test[selected],prefix_sep='_',columns=selected)
        print("train data shape: ",tmp1.shape)

#         for i in selected:
#             tmp[i] = pd.get_dummies(train,prefix=[i], prefix_sep='_',columns = [i], drop_first=True)
#         print("train data shape: ",tmp.shape)
#         for i in selected:
#             tmp1[i] = pd.get_dummies(test,prefix=[i], prefix_sep='_', columns = [i], drop_first=True)
#         print("test data shape: ",tmp.shape)
        final_train, final_test = tmp.align(tmp1, join='inner', axis=1)  # inner join
        print("final train data : {}\nfinal test data: {}".format(final_train.shape,final_test.shape))
        return final_train,final_test
    
    def category_labelencoder(self,data):
        '''
        convert categories into something.
        https://towardsdatascience.com/encoding-categorical-features-21a2651a065c
        
        But I don't know how to make test and train with same column shape.
        '''
        tmp = data
        # print(df_data.head() )
        categorical_feature_mask = tmp.dtypes==object # gets features with string or object value
        categorical_feature_mask
        # filter categorical columns using mask and turn it into a list
        categorical_cols = tmp.columns[categorical_feature_mask].tolist()
        categorical_cols

        # instantiate labelencoder object
        le = LabelEncoder()
        # apply le on categorical feature columns
        tmp[categorical_cols] = tmp[categorical_cols].apply(lambda col: le.fit_transform(col))
        tmp[categorical_cols].head(10)

        # instantiate OneHotEncoder
        ohe = OneHotEncoder(categorical_features = categorical_feature_mask, sparse=False ) 
        # categorical_features = boolean mask for categorical columns
        # sparse = False output an array not sparse matrix

        # apply OneHotEncoder on categorical feature columns
        test_ohe = ohe.fit_transform(tmp) # It returns an numpy array
        return test_ohe
    def kfold_cw(self,model,df_data,df_label,df_test,niter):
        scores = cross_val_score(model,df_data,df_label,cv=niter)
        print("Accuracy score: ",score)
#         pred  = cross_val_predict(model,df_data,df_label,cv=niter)
        print('fitting now')
        final_model = model.fit(df_data,df_label)
        print('final prediction')
        new_pred = model.predict(df_test)
        
#         # manual k-fold implementation 
#         # https://medium.com/datadriveninvestor/k-fold-cross-validation-6b8518070833

#         X = df_data.to_numpy() # feature with good correlation with class
#         y = df_label.values.ravel()

#         scores = []
#         acc = []
#         cv = KFold(n_splits=niter, random_state=42,shuffle=True)
#         i = 0
#         for train_index, test_index in cv.split(X):
#             print(i)
#             i +=1
#             X_train, X_test, y_train, y_test = X[train_index], X[test_index,:], y[train_index], y[test_index]
#             model.fit(X_train, y_train)
#             scores.append(model.score(X_test, y_test))
#             acc.append(accuracy_score(model.predict(X_test),y_test))
#         model.fit(X,y)
#         new_pred = model.predict(df_test)
#         print("acc ", np.mean(acc))
        return np.mean(scores),new_pred
    

In [504]:
clf = Classification()

# convert train label to dataframe
label = clf.train_label.values.ravel()
df_label = pd.DataFrame(data=label,columns=['income'])
df_label.shape

# train data
df_data = clf.train_data
df_data = clf.preprocess(df_data,10)


# test data
test_df = clf.test_data
test_df = clf.preprocess(test_df,10)



# encode both datas
encoded_df = clf.encode_df(df_data)
encoded_test_df = clf.encode_df(test_df)



# finds correlation between features
temp_df = pd.concat([encoded_df,clf.train_label],axis=1,sort=False)

# plt.figure(figsize=(12,10))
# corr = temp_df.corr()
# sns.heatmap(corr, annot=True, cmap=plt.cm.Reds)

#Correlation with output variable
cor_target = abs(corr["income"])

#Selecting highly correlated features
relevant_features = cor_target[cor_target>0.2]
selected = list(dict(relevant_features))
selected = selected[:-1]
print(selected)


for i in selected:
    print(i,df_data[i].unique())
    



['age', 'education_num', 'marital_status', 'relationship', 'sex', 'capital_gain']
age [(38.9, 46.2], (46.2, 53.5], (31.6, 38.9], (24.3, 31.6], (16.927, 24.3], (53.5, 60.8], (75.4, 82.7], (60.8, 68.1], (68.1, 75.4], (82.7, 90.0]]
Categories (10, interval[float64]): [(16.927, 24.3] < (24.3, 31.6] < (31.6, 38.9] < (38.9, 46.2] ... (60.8, 68.1] < (68.1, 75.4] < (75.4, 82.7] < (82.7, 90.0]]
education_num [3, 1, 0, 4, 2, 6, 5]
Categories (7, int64): [0 < 1 < 2 < 3 < 4 < 5 < 6]
marital_status [' notMarried' ' Married' ' Seperated' ' Separated' ' Widowed']
relationship [' Not-in-family' ' Husband' ' Wife' ' Own-child' ' Unmarried'
 ' Other-relative']
sex [' Male' ' Female']
capital_gain [(-99.999, 9999.9], (9999.9, 19999.8], (29999.7, 39999.6], (19999.8, 29999.7], (89999.1, 99999.0], (39999.6, 49999.5]]
Categories (6, interval[float64]): [(-99.999, 9999.9] < (9999.9, 19999.8] < (19999.8, 29999.7] < (29999.7, 39999.6] < (39999.6, 49999.5] < (89999.1, 99999.0]]


In [505]:
# df_data.head()

In [506]:
# encoded_df[selected]

In [507]:
# encoded_test_df[selected]

In [508]:
# tmp = pd.get_dummies(encoded_df[selected],prefix_sep='_',columns=selected)
# for i in selected:
#     tmp = pd.get_dummies(encoded_df,prefix=[i], prefix_sep='_',columns = [i], drop_first=True)
# print("train data shape: ",tmp.shape)
# for i in selected:
#     tmp1[i] = pd.get_dummies(encoded_test_df,prefix=[i], prefix_sep='_', columns = [i], drop_first=True)
# print("test data shape: ",tmp.shape)
# final_train, final_test = tmp.align(tmp1, join='inner', axis=1)  # inner join
# print("final train data : {}\nfinal test data: {}".format(final_train.shape,final_test.shape))
# print(encoded_df.capital_gain.unique())
# tmp

In [509]:
final_train,final_test = clf.panda_onehotencode(encoded_df,encoded_test_df,selected)

train data shape:  (32561, 36)
train data shape:  (16281, 36)
final train data : (32561, 36)
final test data: (16281, 36)


In [510]:
final_train

Unnamed: 0,age_0,age_1,age_2,age_3,age_4,age_5,age_6,age_7,age_8,age_9,...,relationship_4,relationship_5,sex_0,sex_1,capital_gain_0,capital_gain_1,capital_gain_2,capital_gain_3,capital_gain_4,capital_gain_5
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,1,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,0,1,0,0,0,0,0,0,0,0,...,0,1,1,0,1,0,0,0,0,0
32557,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0
32558,0,0,0,0,0,1,0,0,0,0,...,1,0,1,0,1,0,0,0,0,0
32559,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,0,0


In [511]:
# normalize train and test data
df_norm = clf.normalize_data(encoded_df[selected])
df_test_norm = clf.normalize_data(encoded_test_df[selected])

# convert from numpy to dataframe
df_norm = pd.DataFrame(data=df_norm,columns=selected)
df_test_norm = pd.DataFrame(data=df_test_norm,columns=selected)

# print(df_norm)
clf.take_model(ensemble.RandomForestClassifier)
clf.plain_training(final_train,label,final_test)



0.826040227237832

In [None]:
# score of above implementation
# print(scores)
print(np.mean(scores))

In [None]:
# unique count of prediction
# clf.generate_submission(gradient.predict(df_test_norm))

## Running CV with different method and in the order accuracy score

Below k-fold CV uses 

`cross_val_score`

`cross_val_pred`

In [None]:
def compare(pred):
    df = pd.read_csv('submission1.csv')
    mis_classified = 0
    for i in range(len(pred)):
        if(df['Category'][i] != pred[i]):
            mis_classified += 1
    return mis_classified


In [513]:
# Gradient Boosting
gradient = ensemble.GradientBoostingClassifier(n_estimators=500)
# cross_val_score(gradient,X_ohe,label,cv=3)
gdb_score,gdb_pred = clf.kfold_cw(gradient,final_train,label,final_test,3)
compare(gdb_pred)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Accuracy score:  [0.8513559229095312, 0.8571604723780971, 0.8596481024797534, 0.8615829041978493, 0.8628113752452897, 0.8641626690621252, 0.8646233372158401, 0.864623348534607, 0.8647769526906047, 0.8650533258518548]
fitting now


  y = column_or_1d(y, warn=True)


final prediction


1522

In [None]:
gradient.fit(X_ohe,label)
pred = gradient.predict(test_ohe)

In [None]:
# AdaBoostClassifier
adaboost = ensemble.AdaBoostClassifier(n_estimators=500,learning_rate= 1)

ada_score,ada_pred = clf.kfold_cw(adaboost,df_norm,df_label,df_test_norm,10)
print("adaboost ", compare(ada_pred))

In [None]:
# Random forest
rdmforest = ensemble.RandomForestClassifier(n_estimators=200,min_samples_split=2,min_samples_leaf=4,max_depth=10,bootstrap=True)

rdfst_score,rdfst_pred = clf.kfold_cw(rdmforest,df_norm,df_label,df_test_norm,3)
compare(rdfst_pred)

In [None]:
rdmforest = ensemble.RandomForestClassifier(n_estimators=200,bootstrap=True)

rdfst_score1,rdfst_pred1 = clf.kfold_cw(rdmforest,df_norm,df_label,df_test_norm,10)
compare(rdfst_pred)

In [None]:
# extra tree classifier
extratree = ensemble.ExtraTreesClassifier(n_estimators=100, max_depth=None,min_samples_split=2, random_state=0)
extra_score, extra_pred = clf.kfold_cw(extratree,df_norm,df_label,df_test_norm,3)
compare(extra_pred)

In [None]:
# Naive Bayes
NB = naive_bayes.GaussianNB()
nb_score,nb_pred = clf.kfold_cw(NB,df_norm,df_label,df_test_norm,10)
compare(nb_pred)

In [None]:
# SVC
from sklearn.svm import SVC
svc = SVC(gamma='auto')
svc_score,svc_pred = clf.kfold_cw(svc,df_norm,df_label,df_test_norm,3)
compare(svc_pred)

In [None]:
# Decision Tree
decisiontree = tree.DecisionTreeClassifier(max_depth=None,min_samples_split=2,random_state=0)
tree_score,tree_pred = clf.kfold_cw(decisiontree,df_norm,df_label,df_test_norm,10)
compare(tree_pred)

In [None]:
# Lasso
# lasso = linear_model.Lasso(alpha=0.7)
# lasso_score, lasso_pred = clf.kfold_cw(lasso,df_norm,df_label,df_test_norm,3)
# compare(lasso_pred)

In [None]:
# bagging = ensemble.BaggingClassifier(n_estimators=10,base_estimator=SVC(gamma='auto'),random_state=0)
# score,pred = clf.kfold_cw(bagging,df_norm,df_label,df_test_norm,3)

# accuracy of [0.82725263 0.83185922 0.83165945]

With normalization and 3 kfold cv, the random forest with n_estimators=1600,min_samples_split=2,min_samples_leaf=4,max_depth=10,bootstrap=True parameters 

gave this accuracy scores

[0.83618942 0.83812419 0.84059707]

And with minmaxscaler and svr model ```best_svr = SVR(kernel='rbf',gamma='auto') ```, we got

[0.85185185 0.85756403 0.86289505]

In [None]:
np.unique(ada_pred,return_counts=True)

In [None]:
np.unique(extra_pred,return_counts=True)

In [None]:
np.unique(rdfst_pred,return_counts=True)

In [None]:
np.unique(gdb_pred,return_counts=True)

https://stats.stackexchange.com/questions/411290/how-to-use-a-cross-validated-model-for-prediction

In [None]:
clf.generate_submission(gradient.predict(df_test_norm))

## Random Forest Classifier

In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
random_grid

In [None]:
score = []
pred = []
step_size = [0.1,0.3,0.5,0.6,0.7,0.8,0.9,1,1.5,1.8]
mis_classified = []
actual_pred = []
for n,s in zip(random_grid['n_estimators'],step_size):
    adaboost = ensemble.AdaBoostClassifier(n_estimators=n,learning_rate=s)

    ada_score,ada_pred = clf.kfold_cw(adaboost,df_norm,df_label,df_test_norm,3)
    score.append(ada_score)
    pred.append(np.unique(ada_pred,return_counts=True))
    mis_classified.append(compare(ada_pred))
    actual_pred.append(ada_pred)
    print(ada_score)
    

In [None]:
data_info = pd.DataFrame()
data_info['n_estimators'] = random_grid['n_estimators']
data_info['step_size'] = step_size
data_info['score'] = score
data_info['pred_count'] = pred
data_info['mis_classified'] = mis_classified
data_info['actual_pred'] = actual_pred
data_info

In [None]:
compare(actual_pred[1])

In [None]:
data_info[data_info.score == data_info['score'].max()]mis_classified

In [None]:
score1 = []
pred1 = []
step_size = [0.1,0.3,0.5,0.6,0.7,0.8,0.9,1,1.5,1.8]
for n,s in zip(random_grid['n_estimators'],step_size):
    gradient = ensemble.GradientBoostingClassifier(n_estimators=n,learning_rate=s)
    gdb_score,gdb_pred = clf.kfold_cw(gradient,df_norm,df_label,df_test_norm,3)
    print(gdb_score)
    score1.append(gdb_score)
    pred1.append(np.unique(gdb_pred,return_counts=True))


In [None]:
# # https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74
# # Number of trees in random forest
# n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# # Number of features to consider at every split
# max_features = ['auto', 'sqrt']
# # Maximum number of levels in tree
# max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
# max_depth.append(None)
# # Minimum number of samples required to split a node
# min_samples_split = [2, 5, 10]
# # Minimum number of samples required at each leaf node
# min_samples_leaf = [1, 2, 4]
# # Method of selecting samples for training each tree
# bootstrap = [True, False]

# # Create the random grid
# random_grid = {'n_estimators': n_estimators,
#                'max_features': max_features,
#                'max_depth': max_depth,
#                'min_samples_split': min_samples_split,
#                'min_samples_leaf': min_samples_leaf,
#                'bootstrap': bootstrap}

# # Use the random grid to search for best hyperparameters
# # First create the base model to tune
# rf = RandomForestRegressor()
# # Random search of parameters, using 3 fold cross validation, 
# # search across 100 different combinations, and use all available cores
# rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# # Fit the random search model
# rf_random.fit(df_norm, df_label)