# Ensemble Classifier Mode Data

# Import Libraries

In [1]:
import pandas as pd, numpy as np
from sklearn.ensemble import BaggingClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

from scipy import stats

In [2]:
df_train = pd.read_csv('train_rf_norm.csv', index_col = 0)
df_test = pd.read_csv('test_rf_norm.csv', index_col = 0)

In [3]:
len(df_train.columns) == len(df_test.columns)

True

In [4]:
print(len(df_train), len(df_test))

32561 16281


# Separate Data into X and y

In [5]:
X_train = df_train.iloc[:,:-1]
y_train = df_train.iloc[:,-1]

In [6]:
X_test = df_test.iloc[:,:-1]
y_test = df_test.iloc[:,-1]

# Individual Algorithms

In [7]:
def rforest(X_train, y_train, X_test):

    rforest = BaggingClassifier(base_estimator = RandomForestClassifier(n_estimators = 90))
    rforest.fit(X_train, y_train)
    y_pred = rforest.predict(X_test)
    
    return y_pred

In [8]:
def knn(X_train, y_train, X_test):
    
    knn = BaggingClassifier(base_estimator = KNeighborsClassifier(n_neighbors = 15))
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    
    return y_pred

In [9]:
def nb(X_train, y_train, X_test):
    
    nb = BaggingClassifier(base_estimator = GaussianNB())
    nb.fit(X_train,y_train)
    y_pred = nb.predict(X_test)
    
    return y_pred


In [10]:
def lr(X_train, y_train, X_test):

    lr = BaggingClassifier(base_estimator = LogisticRegression(C = 10))
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    
    return y_pred

In [11]:
def svm(X_train, y_train, X_test):
    svm = BaggingClassifier(base_estimator = SVC(C = 10, gamma = 0.01, kernel = 'rbf'))
    svm.fit(X_train,y_train)    
    y_pred = svm.predict(X_test)
    
    return y_pred

# Ensemble Function


In [12]:
def ensemble(X_train, y_train, X_test):
    
    r_y = rforest(X_train, y_train, X_test)
    k_y = knn(X_train, y_train, X_test)
    l_y = lr(X_train, y_train, X_test)
    n_y = nb(X_train, y_train, X_test)
    s_y = svm(X_train, y_train, X_test)
    
    
    final_y = []
    
    for i in range(len(X_test)):
        final_y.append(stats.mode([r_y[i], k_y[i], l_y[i], n_y[i], s_y[i]])[0][0])
        
    return final_y

# PCC

In [13]:
def pearson(x,y):
    
    sum_sq_x = 0
    sum_sq_y = 0 
    sum_coproduct = 0
    mean_x = 0
    mean_y = 0
    
    N = len(x)
    
    for i in range(N):
        
        sum_sq_x += x[i] * x[i]
        sum_sq_y += y[i] * y[i]
        sum_coproduct += x[i] * y[i]
        mean_x += x[i]
        mean_y += y[i]
        
    mean_x = mean_x / N
    mean_y = mean_y / N
    pop_sd_x = np.sqrt((sum_sq_x/N) - (mean_x * mean_x))
    pop_sd_y = np.sqrt((sum_sq_y / N) - (mean_y * mean_y))
    cov_x_y = (sum_coproduct / N) - (mean_x * mean_y)
    correlation = cov_x_y / (pop_sd_x * pop_sd_y)
    
    return correlation

In [14]:
def ppc_features(x,y):

    a = np.array(y)
    filt_feat = []

    for i in range(x.shape[1]):
        b = np.array(x.iloc[:,i])
        val = np.abs(pearson(a,b)) # absolute value of R
        filt_feat.append([val,i])

    filt_sort = sorted(filt_feat, reverse=True)
    
    rank = np.array(list(range(len(filt_sort))))
    rank = [x+1 for x in rank]
    feat_n = []
    r_score = []
    for i in range(len(filt_sort)):
        feat_n.append(filt_sort[i][1])
        r_score.append(filt_sort[i][0])
        
    Filter_Ranks = pd.DataFrame(data=np.column_stack((rank, feat_n, r_score)),columns=['Rank','Feature #','R_score'])
    Filter_Ranks["Feature #"] = Filter_Ranks["Feature #"].astype(int)
    Filter_Ranks["Rank"] = Filter_Ranks["Rank"].astype(int)
    
    return Filter_Ranks

In [15]:
filter_method = ppc_features(X_train, y_train)

In [16]:
filter_method.head()

Unnamed: 0,Rank,Feature #,R_score
0,1,13,0.444696
1,2,32,0.401035
2,3,10,0.335154
3,4,15,0.31844
4,5,0,0.234037


# Mode Filter Method 20

In [17]:
filter_20 = list(filter_method.iloc[:20, 1])

In [18]:
X_train.iloc[:, filter_20].head()

Unnamed: 0,marital-status_ Married-civ-spouse,relationship_ Husband,education-num,marital-status_ Never-married,Age,hours-per-week,relationship_ Own-child,capital-gain,occupation_ Exec-managerial,sex_ Female,sex_ Male,relationship_ Not-in-family,occupation_ Prof-specialty,occupation_ Other-service,capital-loss,WorkClass_ Self-emp-inc,relationship_ Unmarried,marital-status_ Divorced,relationship_ Wife,WorkClass_ Private
0,0,0,13,1,0.03067,-0.035429,0,0.148451,0,0,1,1,0,0,-0.216656,0,0,0,0,0
1,1,1,13,0,0.837096,-2.222119,0,-0.145918,1,0,1,0,0,0,-0.216656,0,0,0,0,0
2,0,0,9,0,-0.042641,-0.035429,0,-0.145918,0,0,1,1,0,0,-0.216656,0,0,1,0,1
3,1,1,7,0,1.057031,-0.035429,0,-0.145918,0,0,1,0,0,0,-0.216656,0,0,0,0,1
4,1,0,13,0,-0.775756,-0.035429,0,-0.145918,0,1,0,0,1,0,-0.216656,0,0,0,1,1


In [19]:
trial_run = ensemble(X_train.iloc[:, filter_20], y_train, X_test.iloc[:, filter_20])

In [20]:
for i in range(4):
    print(['tn', 'fp', 'fn', 'tp'][i],confusion_matrix(trial_run, y_test).ravel()[i])

tn 11519
fp 1470
fn 916
tp 2376


In [21]:
accuracy_score(trial_run, y_test)

0.85344880535593637

In [22]:
precision_score(trial_run, y_test)

0.61778471138845559

In [23]:
recall_score(trial_run, y_test)

0.72174969623329288

In [24]:
f1_score(trial_run, y_test)

0.66573269823479975

# Mode Filter Method 40

In [25]:
filter_40 = list(filter_method.iloc[:40, 1])

In [26]:
X_train.iloc[:, filter_40].head()

Unnamed: 0,marital-status_ Married-civ-spouse,relationship_ Husband,education-num,marital-status_ Never-married,Age,hours-per-week,relationship_ Own-child,capital-gain,occupation_ Exec-managerial,sex_ Female,...,WorkClass_ Federal-gov,occupation_ Priv-house-serv,marital-status_ Married-spouse-absent,native-country_ United-States,race_ Other,WorkClass_ Local-gov,race_ Amer-Indian-Eskimo,occupation_ Tech-support,WorkClass_ Self-emp-not-inc,native-country_ India
0,0,0,13,1,0.03067,-0.035429,0,0.148451,0,0,...,0,0,0,1,0,0,0,0,0,0
1,1,1,13,0,0.837096,-2.222119,0,-0.145918,1,0,...,0,0,0,1,0,0,0,0,1,0
2,0,0,9,0,-0.042641,-0.035429,0,-0.145918,0,0,...,0,0,0,1,0,0,0,0,0,0
3,1,1,7,0,1.057031,-0.035429,0,-0.145918,0,0,...,0,0,0,1,0,0,0,0,0,0
4,1,0,13,0,-0.775756,-0.035429,0,-0.145918,0,1,...,0,0,0,0,0,0,0,0,0,0


In [27]:
trial_40 = ensemble(X_train.iloc[:, filter_40], y_train, X_test.iloc[:, filter_40])

In [28]:
for i in range(4):
    print(['tn', 'fp', 'fn', 'tp'][i],confusion_matrix(trial_40, y_test).ravel()[i])

tn 11520
fp 1422
fn 915
tp 2424


In [29]:
accuracy_score(trial_40, y_test)

0.85645844849824948

In [30]:
precision_score(trial_40, y_test)

0.63026521060842433

In [31]:
recall_score(trial_40, y_test)

0.72596585804132974

In [32]:
f1_score(trial_40, y_test)

0.67473903966597071

# Mode Filter Method 60

In [33]:
filter_60 = list(filter_method.iloc[:60, 1])

In [34]:
X_train.iloc[:, filter_60].head()

Unnamed: 0,marital-status_ Married-civ-spouse,relationship_ Husband,education-num,marital-status_ Never-married,Age,hours-per-week,relationship_ Own-child,capital-gain,occupation_ Exec-managerial,sex_ Female,...,native-country_ Philippines,native-country_ Iran,native-country_ Nicaragua,native-country_ Jamaica,native-country_ Haiti,native-country_ Peru,occupation_ Craft-repair,native-country_ Germany,WorkClass_ State-gov,WorkClass_ Without-pay
0,0,0,13,1,0.03067,-0.035429,0,0.148451,0,0,...,0,0,0,0,0,0,0,0,1,0
1,1,1,13,0,0.837096,-2.222119,0,-0.145918,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,9,0,-0.042641,-0.035429,0,-0.145918,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,7,0,1.057031,-0.035429,0,-0.145918,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,13,0,-0.775756,-0.035429,0,-0.145918,0,1,...,0,0,0,0,0,0,0,0,0,0


In [35]:
trial_60 = ensemble(X_train.iloc[:, filter_60], y_train, X_test.iloc[:, filter_60])

In [36]:
for i in range(4):
    print(['tn', 'fp', 'fn', 'tp'][i],confusion_matrix(trial_60, y_test).ravel()[i])

tn 11451
fp 1391
fn 984
tp 2455


In [37]:
accuracy_score(trial_60, y_test)

0.85412443953074135

In [38]:
precision_score(trial_60, y_test)

0.63832553302132089

In [39]:
recall_score(trial_60, y_test)

0.71387031113695842

In [40]:
f1_score(trial_60, y_test)

0.67398764584763216