# Ensemble Classifier Mode Data

# Import Libraries

In [6]:
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

from scipy import stats

In [7]:
# Load the data from mode_train.csv; the first CSV column becomes the DataFrame index.
df_train = pd.read_csv('mode_train.csv', index_col = 0)

In [8]:
# Sanity check: both frames are expected to have the same number of columns.
# NOTE(review): df_test is never assigned in any visible cell -- it must be
# loaded (presumably from a matching test CSV) before this cell can run; confirm.
len(df_train.columns) == len(df_test.columns)

True

In [9]:
# Print the row counts of df_train and df_test side by side.
print(len(df_train), len(df_test))

32561 16281


# Separate Data into X and y

In [10]:
# Every column except the last is used as a feature; the last column is the label vector.
X_train = df_train.iloc[:,:-1]
y_train = df_train.iloc[:,-1]

In [11]:
# Hold out part of the training data for evaluation. X_train / y_train are
# rebound to the reduced training portion, and X_test / y_test are carved out
# of df_train (they do not come from df_test). No test_size is passed, so
# scikit-learn's default split proportion applies; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, random_state = 42)

# Individual Algorithms

In [12]:
def rforest(X_train, y_train, X_test):
    """Fit a RandomForestClassifier with default settings and predict X_test.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels aligned with ``X_train``.
        X_test: Feature matrix to predict labels for.

    Returns:
        The labels predicted for ``X_test``.
    """
    # A fresh, unseeded forest is built on every call.
    model = RandomForestClassifier()
    return model.fit(X_train, y_train).predict(X_test)

In [13]:
def knn(X_train, y_train, X_test):
    """Fit a KNeighborsClassifier with default settings and predict X_test.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels aligned with ``X_train``.
        X_test: Feature matrix to predict labels for.

    Returns:
        The labels predicted for ``X_test``.
    """
    classifier = KNeighborsClassifier()
    return classifier.fit(X_train, y_train).predict(X_test)

In [14]:
def nb(X_train, y_train, X_test):
    """Fit a GaussianNB model with default settings and predict X_test.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels aligned with ``X_train``.
        X_test: Feature matrix to predict labels for.

    Returns:
        The labels predicted for ``X_test``.
    """
    classifier = GaussianNB()
    return classifier.fit(X_train, y_train).predict(X_test)


In [15]:
def lr(X_train, y_train, X_test):
    """Fit a LogisticRegression model with default settings and predict X_test.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels aligned with ``X_train``.
        X_test: Feature matrix to predict labels for.

    Returns:
        The labels predicted for ``X_test``.
    """
    model = LogisticRegression()
    return model.fit(X_train, y_train).predict(X_test)

In [16]:
def svm(X_train, y_train, X_test):
    """Fit an SVC with default settings and predict X_test.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels aligned with ``X_train``.
        X_test: Feature matrix to predict labels for.

    Returns:
        The labels predicted for ``X_test``.
    """
    classifier = SVC()
    return classifier.fit(X_train, y_train).predict(X_test)

# Pearson Correlation Coefficient

In [23]:
def pearson(x,y):
    """Return the Pearson correlation coefficient of two 1-D numeric sequences.

    The coefficient is computed from mean-centred values, which avoids the
    catastrophic cancellation of the ``E[x^2] - E[x]^2`` shortcut when the
    values are large relative to their spread.

    Args:
        x: 1-D sequence of numbers (list, tuple, numpy array, pandas Series...).
            Values are read positionally, so a pandas Series with a
            non-default index is handled correctly.
        y: 1-D sequence of numbers with the same length as ``x``.

    Returns:
        The correlation as a float in [-1, 1], or ``nan`` when either input
        has zero variance (the coefficient is undefined in that case).

    Raises:
        ValueError: If the inputs are not 1-D, differ in length, or are empty.
    """
    xs = np.asarray(x, dtype=float)
    ys = np.asarray(y, dtype=float)

    if xs.ndim != 1 or ys.ndim != 1:
        raise ValueError("pearson() expects 1-D inputs")
    if xs.size != ys.size:
        # Previously a longer y was silently truncated to len(x).
        raise ValueError(
            "pearson() inputs must have the same length, got %d and %d"
            % (xs.size, ys.size)
        )
    if xs.size == 0:
        raise ValueError("pearson() requires at least one observation")

    dev_x = xs - xs.mean()
    dev_y = ys - ys.mean()

    # The 1/N factors of the covariance and both standard deviations cancel,
    # so the raw sums of products are sufficient.
    denominator = np.sqrt(np.sum(dev_x * dev_x) * np.sum(dev_y * dev_y))
    if denominator == 0:
        # Constant input: return nan explicitly instead of dividing 0 by 0.
        return float("nan")

    return float(np.sum(dev_x * dev_y) / denominator)

# Ensemble Function


In [17]:
def ensemble(X_train, y_train, X_test):
    """Train each individual classifier and return their separate predictions.

    Despite the name, no voting or combination happens here: every model is
    fitted on the same training data and its predictions for ``X_test`` are
    returned untouched, so the caller can compare or combine them.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels aligned with ``X_train``.
        X_test: Feature matrix to predict labels for.

    Returns:
        A 5-tuple of prediction vectors in the order
        (random forest, k-nearest neighbours, logistic regression,
        naive Bayes, SVM).
    """
    # Order matters: callers unpack the tuple positionally.
    predictors = (rforest, knn, lr, nb, svm)
    return tuple(predict(X_train, y_train, X_test) for predict in predictors)

#  Correlation Function

# Mode Ensemble

In [18]:
# Collect each classifier's predictions for X_test; the unpacking order
# (RF, KNN, LR, NB, SVM) matches the tuple order returned by ensemble().
rf_pred, knn_pred, lr_pred, nb_pred, svm_pred = ensemble(X_train, y_train, X_test)

# Ensemble A

RF & LR, LR & KNN, RF & KNN

In [29]:
# Pairwise Pearson correlation of the prediction vectors: RF-LR, LR-KNN, RF-KNN.
print(pearson(rf_pred, lr_pred), pearson(lr_pred, knn_pred), pearson(rf_pred, knn_pred))

0.581548953252 0.620631186324 0.673533256085


# Ensemble B

RF & NB, NB & KNN, RF & KNN

In [27]:
# Pairwise Pearson correlation of the prediction vectors: RF-NB, NB-KNN, RF-KNN.
print(pearson(rf_pred, nb_pred), pearson(nb_pred, knn_pred), pearson(rf_pred, knn_pred))

0.406179781165 0.432476764906 0.673533256085


# Ensemble C

SVM & NB, SVM & LR, SVM & KNN, SVM & RF

In [32]:
# Pearson correlation of the SVM predictions against each other model: NB, LR, KNN, RF.
print(pearson(svm_pred, nb_pred), pearson(svm_pred, lr_pred),pearson(svm_pred, knn_pred), pearson(svm_pred, rf_pred))

0.464021168008 0.843888560319 0.665344964606 0.605990667541
