In this notebook, the power of the scikit-learn library is explored on a sample dataset in a binary classification setting. The analysis consists of the following steps:
1. Data Pre Processing/data description
2. Dimensional Analysis
3. Applying non-parametric learning methods - KNN, tree-based models
4. Classification with NaiveBayes with assumptions about data


In [None]:
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn import neighbors
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import ComplementNB,MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
import seaborn as sns
sns.set(style="ticks", color_codes=True)
from sklearn.feature_selection import RFECV

In [None]:
import  matplotlib.pyplot as plt
import seaborn as sn
from sklearn.decomposition import PCA
import scipy.stats as stats
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

import numpy as np
import pandas as pd 
import os
# Print the path of every file found under the Kaggle input directory.
# NOTE(review): `dirname` and `filenames` stay bound after the loop finishes and are read
# by later cells, so this loop has to run before the data-loading cell.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



In [None]:
# Inspect the values left over from the directory walk.
# Only the last expression of a cell is displayed, so `dirname` on its own shows nothing.
dirname
filenames

# Data Pre-Processing

In [None]:
# Load the three CSV files located by the directory walk.
# NOTE(review): the positional indices depend on the order in which os.walk returned the
# file names (presumably labels, train, test) - confirm against the printed listing.
train_df=pd.read_csv(os.path.join(dirname, filenames[1]))
train_labels=pd.read_csv(os.path.join(dirname, filenames[0]))
print(train_labels.head(3))
test_df=pd.read_csv(os.path.join(dirname, filenames[2]))
print(test_df.head(3))

# Placeholder submission frame. The 'Solution' column is sized from the data instead of a
# hard-coded 9000 so that both columns always have the same length.
my_submission = pd.DataFrame({'id':test_df.index,'Solution':list(range(len(test_df)))})
my_submission

In [None]:
# The files were read with their first data line taken as the header; put that line back
# as row 0 of each frame.
# NOTE: not idempotent - re-running this cell prepends the header row again.
def _header_to_first_row(frame):
    """Return `frame` with its column labels prepended as the first row (index reset)."""
    header_row = frame.columns.to_frame().T
    return pd.concat([header_row, frame], ignore_index=True)

train_df=_header_to_first_row(train_df)
print(train_df.shape)
test_df=_header_to_first_row(test_df)
print(test_df.shape)
train_labels=_header_to_first_row(train_labels)
print(train_labels.shape)

In [None]:
# Stack the train and test rows into one frame.
# The frames are concatenated on column labels; before the columns are renamed, each frame
# is still labelled with its own file's first line, so the test labels are aligned to the
# train labels first. Otherwise concat puts the two sets in separate, NaN-padded columns.
test_aligned=test_df.copy()
test_aligned.columns=train_df.columns
whole_data=pd.concat([train_df,test_aligned],ignore_index=True)
whole_data.head()

In [None]:
# Cast the label column to integers (presumably object dtype after the header row was
# folded back in - check with `train_labels.dtypes`).
train_labels=train_labels.astype(int)

In [None]:
train_labels.dtypes

In [None]:
# Give every feature a positional name (f_0, f_1, ...) shared by the train and test frames,
# and name the single label column 'label'.
col=[f'f_{idx}' for idx in range(train_df.shape[1])]
train_df.columns=col
test_df.columns=col
train_labels.columns=['label']

In [None]:
train_df.columns

In [None]:
# Cast every feature column of both frames to float.
train_df, test_df = (frame.astype(float) for frame in (train_df, test_df))

In [None]:
## Checking for missing values
train_df.isna().sum()

# Exploratory Data Analysis

Variance measures the spread or dispersion of data points in a dataset. It quantifies how much individual data points deviate from the mean (average) of the dataset. A high variance indicates that data points are more spread out, while a low variance suggests that data points are closer to the mean.


In [None]:
# Per-feature variance followed by per-feature skewness.
for summary in (train_df.var(), train_df.skew()):
    print(summary)

Visualising with boxplots


In [None]:
# One box per feature, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(15,6))
train_df.boxplot(ax=ax)
ax.set_title('Boxplot of Variables')
ax.set_ylabel('Value')
plt.show()

f_4,f_12,f_23 features have high number of outliers.

In [None]:
train_labels

In [None]:
# Attach the 'label' column to the features; axis=1 places the frames side by side,
# matched on the row index.
train_df_1=pd.concat([train_df, train_labels],axis=1)
train_df_1

In [None]:
train_df.hist(figsize=(20,10))
plt.show()

In [None]:
# Pairwise correlation between the features, reshaped from the wide matrix into long
# format: one row per (index, Variable) feature pair with its coefficient in 'corr'.
correlation_matrix=(
    train_df.corr()
    .reset_index()
    .melt(id_vars=['index'], var_name='Variable', value_name='corr')
)


In [None]:
# Feature pairs whose absolute correlation exceeds 0.5.
# Self-pairs are excluded by comparing the two feature names instead of testing `corr < 1`,
# which would also hide two distinct features that are perfectly correlated.
is_strong=correlation_matrix['corr'].abs() > 0.5
is_distinct_pair=correlation_matrix['index'] != correlation_matrix['Variable']
correlation_matrix[is_strong & is_distinct_pair]

In [None]:
# Scatter plot for each of the three feature pairs, one panel per pair.
fig,axes=plt.subplots(1,3,figsize=(10,5))
feature_pairs=[('f_28','f_12'),('f_28','f_4'),('f_23','f_4')]
for ax,(x_name,y_name) in zip(axes,feature_pairs):
    ax.scatter(train_df[x_name],train_df[y_name])
    ax.set_xlabel(x_name)
    ax.set_ylabel(y_name)
    ax.set_title(f'{x_name} vs {y_name}')

From the scatter plots and the correlation matrix it can be inferred that these features are strongly correlated with each other. This redundancy in the information could introduce noise. Linear dimensionality reduction techniques can be evaluated to check how well the labels separate.

In [None]:

# z-score every feature column (scipy.stats.zscore is applied column by column).
# NOTE(review): the statistics come from all labelled rows, i.e. before the later
# train/test split, and test_df is not transformed here.
train_df_z = train_df.apply(stats.zscore)
X=train_df_z
y=train_df_1['label']

### PCA plot of the features

In [None]:
# Project the standardised features onto 4 principal components and plot the first two,
# one scatter series per class label.
pca = PCA(n_components=4, random_state=42)
X_pca = pca.fit_transform(X)
target_names=train_df_1.label.unique()
print(X.shape)
print(X_pca.shape)
# Report the number of components actually fitted (the old message always said "five").
print("explained variance ratio (first %d components): %s"
    % (pca.n_components_, pca.explained_variance_ratio_))
# Previously computed but discarded mid-cell; print it so it is visible.
print("cumulative explained variance: %s" % np.cumsum(pca.explained_variance_ratio_))

plt.figure()
lw = 2
for i in target_names:
    # `cmap` is dropped: without `c` matplotlib ignores it and emits a warning;
    # each call already gets its own colour from the default cycle.
    plt.scatter(
        X_pca[y == i, 0], X_pca[y == i, 1], marker='.', label=i,
        alpha=0.8, lw=lw)
plt.legend(loc="best")
plt.title("PCA of London dataset")
plt.axhline(y=0, color='black', linestyle='-')
plt.axvline(x=0, color='black', linestyle='-')

In [None]:
pca.explained_variance_ratio_

Linear models might not be a great choice.

###  Produce a scree-plot to look at the cumulative variance represented by the PCA eigenvectors.


In [None]:
# Scree plot: share of variance explained by each of the first 5 principal components.
# The PCA is fitted once (it used to be fitted twice) and seeded like the earlier PCA cell
# so repeated runs give the same components.
pca = PCA(n_components=5, random_state=42)  ## 5 components
pca_m=pca.fit(X)
X_pca = pca_m.transform(X)
PC_values = np.arange(pca.n_components_) + 1
plt.plot(PC_values, pca.explained_variance_ratio_, 'o-', linewidth=2, color='blue')
# Title now names the same dataset as the PCA scatter plot.
plt.title('Scree Plot for PCA Analysis on London dataset')
plt.xlabel('Principal Component')
plt.ylabel('Variance Explained')
plt.show()

In [None]:
train_df_1.groupby('label').size() ## The classes are balanced

The classes 0 and 1 are balanced

# Building models with no feature transformation

### Function definitions

In [None]:
# Function for KNN model for range of neighbours. 
def knn_model(n, train_x, train_y, test_x, test_y, name,p):
    """Evaluate distance-weighted KNN classifiers for K = 1, 6, 11, ... (< n).

    For every K the classifier is scored twice: mean 5-fold cross-validated accuracy on
    the training data, and accuracy on the held-out test data after fitting on the full
    training data.

    Parameters
    ----------
    n : exclusive upper bound for K (step 5, starting at 1).
    train_x, train_y : training features and labels.
    test_x, test_y : held-out features and labels.
    name : suffix appended to the two accuracy column names.
    p : Minkowski power parameter passed to the classifier.

    Returns
    -------
    DataFrame with columns 'K', 'Training Accuracy <name>' (the mean CV accuracy) and
    'Test Accuracy <name>'.
    """
    rows = []
    for k in range(1, n, 5):
        clf = neighbors.KNeighborsClassifier(n_neighbors=k, weights='distance', p=p)
        # cross_val_score works on clones, so it is independent of the fit below.
        cv_accuracy = cross_val_score(clf, train_x, train_y, cv=5, scoring='accuracy').mean()
        clf.fit(train_x, train_y)
        holdout_accuracy = accuracy_score(test_y, clf.predict(test_x))
        rows.append((k, cv_accuracy, holdout_accuracy))
    columns = ['K', 'Training Accuracy '+name, 'Test Accuracy '+name]
    return pd.DataFrame(rows, columns=columns)


# K vs accuracy plot
def plot_accuracy_k(name, df):
    """Plot test and training accuracy against the number of neighbours.

    Parameters
    ----------
    name : suffix used in the accuracy column names of `df` and in the labels.
    df : frame with columns 'K', 'Test Accuracy <name>' and 'Training Accuracy <name>'.
    """
    k_values = df['K'].values
    plt.title('kNN: ' + name +' parameters, K vs accuracy')
    plt.plot(k_values, df['Test Accuracy '+name].values, label = 'Test accuracy '+name)
    plt.plot(k_values, df['Training Accuracy '+name].values, label = 'Training accuracy '+name)
    # A single legend call is enough; the original called it twice.
    plt.legend()
    plt.xlabel('Number of Neighbors')
    plt.ylabel('Accuracy')
    plt.show()

Splitting the training data into train and test set for validating the model

In [None]:
## Hold out 20% of the labelled rows for validation (80/20 split, fixed seed)
random_state=42
X=train_df_z.to_numpy()  # standardised features
y=train_df_1['label'].to_numpy()
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)

In [None]:
# Shapes of the four arrays produced by the split, in the order train X, test X, train y, test y.
for split_part in (x_train, x_test, y_train, y_test):
    print(split_part.shape)

### KNN function


In [None]:
# NOTE(review): n_classes and n_neighbors are not read by any other visible cell.
n_classes = len(np.unique(y))
n_neighbors=175
# Evaluate K = 1, 6, 11, ... (< 200) with p=2 and keep the per-K accuracy table.
df_res =knn_model(200,  x_train, y_train, x_test, y_test, 'minkowski',2) # Storing the result after calling the function


It can be observed from the plot below that increasing the number of neighbors beyond 6 does not have much impact on the model performance.

In [None]:
plot_accuracy_k('minkowski',df_res)

In [None]:
df_res.loc[df_res['Test Accuracy minkowski']==df_res['Test Accuracy minkowski'].max()]

With 126 neighbors, the KNN model reaches an accuracy of about 85%.

### Multinomial Naive Bayes

In [None]:
def naive_bayes(model, x_train, y_train, x_test, y_test):
    """Score a (Naive Bayes) estimator by cross-validation and on the hold-out set.

    Parameters
    ----------
    model : estimator exposing fit / predict; it is left fitted on the full training data.
    x_train, y_train : training features and labels.
    x_test, y_test : held-out features and labels.

    Returns
    -------
    (mean 5-fold CV accuracy on the training data, accuracy on the held-out data)
    """
    # cross_val_score evaluates clones of `model`, so it does not depend on the fit below.
    fold_scores = cross_val_score(model, x_train, y_train, cv = 5, scoring='accuracy')
    model.fit(x_train, y_train)
    holdout_accuracy = accuracy_score(y_test, model.predict(x_test))
    return fold_scores.mean(), holdout_accuracy


In [None]:
# The Naive Bayes variants used below need non-negative features, so rescale with
# min-max normalisation. The scaler is fitted on the training split only and then
# applied to both splits.
scaler=MinMaxScaler().fit(x_train)
x_train_mm, x_test_mm = scaler.transform(x_train), scaler.transform(x_test)

In [None]:
# Complement Naive Bayes on the min-max scaled features.
cv_score_cnb, test_acc_cnb = naive_bayes(ComplementNB(), x_train_mm, y_train, x_test_mm, y_test)
print(f"cross-val_score: {cv_score_cnb}")
print(f"test_acc_cnb: {test_acc_cnb}")

In [None]:
# Multinomial Naive Bayes on the min-max scaled features.
cv_score_mnb, test_acc_mnb = naive_bayes(MultinomialNB(), x_train_mm, y_train, x_test_mm, y_test)
print("cross-val_score:",cv_score_mnb)
# Label fixed: this is the MultinomialNB result, it was printed as "test_acc_cnb".
print("test_acc_mnb:",test_acc_mnb)

## Tree based Algorithms

### Random Forest

Starting with evaluation of important features using decision trees

In [None]:
##Data normlisation is not required
# Function to caculate the best set of parameters and the cross val score of that best model
def decsion_tree_gs(x_train, y_train,x_test):
    """Grid-search the maximum depth of a decision tree with 5-fold cross-validation.

    Parameters
    ----------
    x_train, y_train : training features and labels handed to ``GridSearchCV.fit``.
    x_test : unused; kept so existing calls keep working.

    Returns
    -------
    (fitted GridSearchCV, DataFrame) - the frame has one row per candidate ``max_depth``
    with its mean training score ("Train Accuracy") and mean validation score
    ("Valid Accuracy") across the folds.
    """
    params = {'max_depth': list(range(2,20))}
    gs_dtc = GridSearchCV(DecisionTreeClassifier(random_state=27), params, verbose=1, cv=5, return_train_score=True, n_jobs=-1)

    gs_results_dtc = gs_dtc.fit(x_train, y_train)
    print('The best classifer is for the values - ')
    # best_score_ is the mean validation score over the 5 folds, not a leave-one-out
    # estimate, so the label says so.
    print('5-fold CV accuracy of the best model - ', gs_results_dtc.best_score_)
    print(gs_results_dtc.best_estimator_)
    print(gs_results_dtc.best_params_)
    results_df = pd.DataFrame(gs_results_dtc.cv_results_['params'])
    results_df["Train Accuracy"] = gs_results_dtc.cv_results_['mean_train_score']
    results_df["Valid Accuracy"] = gs_results_dtc.cv_results_['mean_test_score']
    return gs_results_dtc, results_df
    
# Function to calculate the test accuracy of the model using the best parameters ?R remove the function
def decision_tree_classifier(max_depth, x_train, y_train, x_test, y_test):
    """Fit one decision tree of the given depth and score it.

    Returns
    -------
    (fitted tree, mean 5-fold CV accuracy on the training data, accuracy on the
    held-out data)
    """
    tree = DecisionTreeClassifier(max_depth=max_depth, random_state=27)
    # Cross-validation runs on clones of `tree`; the fit below is independent of it.
    fold_scores = cross_val_score(tree, x_train, y_train, cv=5, scoring='accuracy')
    tree.fit(x_train, y_train)
    holdout_accuracy = accuracy_score(y_test, tree.predict(x_test))
    return tree, fold_scores.mean(), holdout_accuracy

# Plot of accuracy vs the depth for the Decision Tree Classifier
def plot_depth_accuracy(df,mod):
    """Plot mean train and validation accuracy against tree depth.

    Parameters
    ----------
    df : frame with columns 'max_depth', 'Train Accuracy' and 'Valid Accuracy'.
    mod : text inserted into the plot title.
    """
    depths = df['max_depth'].values
    plt.title('Decison Tree: '+mod +' Max Depth vs Accuracy (Train and Test)')
    plt.plot(depths, df['Train Accuracy'].values, label = 'Mean Train Accuracy')
    plt.plot(depths, df['Valid Accuracy'].values, label = 'Valid Accuracy')
    # A single legend call is enough; the original called it twice.
    plt.legend()
    plt.xlabel('Max_Depth')
    plt.ylabel('Accuracy')
    plt.show()
    
def RF_gs(x_train, y_train,x_test):
    """Grid-search a random forest over depth, tree count and max_features (5-fold CV).

    Parameters
    ----------
    x_train, y_train : training features and labels handed to ``GridSearchCV.fit``.
    x_test : unused; kept so existing calls keep working.

    Returns
    -------
    (fitted GridSearchCV, DataFrame) - the frame has one row per parameter combination
    with its mean training score ("Train Accuracy") and mean validation score
    ("Valid Accuracy") across the folds.

    Changes from the previous version: the extra ``cross_val_score`` over the whole grid
    search (which re-ran the full search five more times and discarded the result) and
    the unused test accuracy computed from the global ``y_test`` are removed, and the
    forest is seeded so repeated runs select the same model.
    """
    d_l=2
    d_u=20
    tree_para = {'max_depth':list(range(d_l,d_u,2)),'n_estimators':[100,120,150, 175, 200, 220, 250,300],'max_features':['sqrt','log2']}
    clf_rf =  GridSearchCV(RandomForestClassifier(random_state=27),tree_para,cv=5,return_train_score=True,n_jobs=-1)
    clf_rf = clf_rf.fit(x_train, y_train)
    print('The best classifer is for the values - ')
    # best_score_ is the mean validation score over the 5 folds, not a leave-one-out estimate.
    print('5-fold CV accuracy of the best model - ', clf_rf.best_score_)
    print(clf_rf.best_estimator_)
    print(clf_rf.best_params_)
    results_df = pd.DataFrame(clf_rf.cv_results_['params'])
    results_df["Train Accuracy"] = clf_rf.cv_results_['mean_train_score']
    results_df["Valid Accuracy"] = clf_rf.cv_results_['mean_test_score']
    return clf_rf, results_df
    

In [None]:
def _heatplot_accuracy(df, model_n, value_col):
    """Draw an annotated max_depth x n_estimators heat map of `value_col`."""
    # pivot_table instead of pivot: a results frame that also varies other parameters
    # (e.g. max_features) contains each (max_depth, n_estimators) pair several times,
    # which makes DataFrame.pivot raise. The best value per cell is shown; with unique
    # pairs the result is the same as a plain pivot.
    ac_df=df.pivot_table(index='max_depth',columns='n_estimators',values=value_col,aggfunc='max')
    plt.figure(figsize=(15,8))
    sns.heatmap(data=ac_df,annot=True)
    plt.title("heat plot of accuracy with "+ model_n)
    plt.show()

def heatplot_depth_estimators_accuracy_valid(df ,model_n):
    """Heat map of 'Valid Accuracy' by max_depth (rows) and n_estimators (columns).

    Parameters
    ----------
    df : frame with columns 'max_depth', 'n_estimators' and 'Valid Accuracy'.
    model_n : text appended to the plot title.
    """
    _heatplot_accuracy(df, model_n, 'Valid Accuracy')

def heatplot_depth_estimators_accuracy_train(df,model_n):
    """Heat map of 'Train Accuracy' by max_depth (rows) and n_estimators (columns).

    Parameters
    ----------
    df : frame with columns 'max_depth', 'n_estimators' and 'Train Accuracy'.
    model_n : text appended to the plot title.
    """
    _heatplot_accuracy(df, model_n, 'Train Accuracy')

In [None]:
best_model_dt, df_dtc = decsion_tree_gs(x_train, y_train,x_test) ## call the function and pass the parameters

In [None]:
plot_depth_accuracy(df_dtc,"")

It can be observed that beyond a depth of about 6 the model starts overfitting: the training accuracy keeps rising significantly while the validation accuracy from the cross-validation exercise does not.

In [None]:
# Fit a single tree at a fixed depth and collect its scores.
# NOTE(review): `mean_error` receives the mean 5-fold CV accuracy returned by
# decision_tree_classifier, not an error rate.
max_depth=5
dtc, mean_error, test_accuracy=decision_tree_classifier(max_depth, x_train, y_train, x_test, y_test)

In [None]:
# Unnormalised impurity-based importance of every feature in the best tree found by the
# decision-tree grid search; features with importance 0 were never used for a split.
feat_importance = best_model_dt.best_estimator_.tree_.compute_feature_importances(normalize=False)
#print("feature importance = " + str(feat_importance))
# Pair each importance with its feature name (assumes the tree was fitted on all
# columns of train_df in their original order - TODO confirm).
data = {'Feature':list(train_df.columns), 'feat_importance': feat_importance}  
df_feature_imp=pd.DataFrame(data)
df_feature_imp.sort_values(by='feat_importance',ascending=False).head()

Gini Importance or Mean Decrease in Impurity (MDI) calculates each feature's importance as the sum over the splits (across all trees) that use the feature, weighted by the number of samples each split separates. From the table it can be inferred that the contribution of the features towards classifying the target variable is poor.

In [None]:
len(list(df_feature_imp[df_feature_imp['feat_importance']>0]['Feature']))

In [None]:
## Rebuild the 80/20 train/test split using only the features with non-zero importance
selected_features=df_feature_imp.loc[df_feature_imp['feat_importance']>0,'Feature'].tolist()
X=train_df[selected_features].to_numpy()  # raw (not z-scored) values of the selected features
y=train_df_1['label'].to_numpy()
random_state=42
x_train_rf, x_test_rf, y_train_rf, y_test_rf = train_test_split(X, y, test_size=0.2, random_state=random_state)

In [None]:
best_model, df_rf = RF_gs(x_train_rf, y_train_rf,x_test_rf) ## call the function and pass the parameters (25 minutes run time)

In [None]:
#df_rf.abshead()

The heatmap is plotted for both the training and the validation accuracy, based on K-fold cross-validation.

In [None]:
#model_n="Random forest Raw Dataset Training"
#heatplot_depth_estimators_accuracy_train(df_rf,model_n)


In [None]:
#model_n="Random forest Raw Dataset Validation"
#heatplot_depth_estimators_accuracy_valid(df_rf,model_n)


### Gradient Boosting Classifier

In [None]:
def gbt(x_train, y_train,x_test):
    """Grid-search the number of estimators of a gradient boosting classifier (5-fold CV).

    Parameters
    ----------
    x_train, y_train : training features and labels handed to ``GridSearchCV.fit``.
    x_test : unused; kept so existing calls keep working.

    Returns
    -------
    (fitted GridSearchCV, DataFrame) - the frame has one row per candidate
    ``n_estimators`` with its mean training score ("Train Accuracy") and mean
    validation score ("Valid Accuracy") across the folds.

    Changes from the previous version: the extra ``cross_val_score`` over the whole grid
    search (which re-ran the full search five more times and discarded the result) and
    the unused test accuracy computed from the global ``y_test`` are removed, and the
    model is seeded so repeated runs give the same scores.
    """
    tree_para = {'n_estimators': [150,200,220,300,350,400,450,500,550]}
    clf =  GridSearchCV(GradientBoostingClassifier(learning_rate=0.01, random_state=27),tree_para,cv=5,return_train_score=True,n_jobs=-1)
    clf = clf.fit(x_train, y_train)
    print('The best classifer is for the values - ')
    # best_score_ is the mean validation score over the 5 folds, not a leave-one-out estimate.
    print('5-fold CV accuracy of the best model - ', clf.best_score_)
    print(clf.best_estimator_)
    print(clf.best_params_)
    results_df = pd.DataFrame(clf.cv_results_['params'])
    results_df["Train Accuracy"] = clf.cv_results_['mean_train_score']
    results_df["Valid Accuracy"] = clf.cv_results_['mean_test_score']
    return clf, results_df

def plot_depth_accuracy_gb(df,mod):
    """Plot mean train and validation accuracy against the number of estimators.

    Parameters
    ----------
    df : frame with columns 'n_estimators', 'Train Accuracy' and 'Valid Accuracy'.
    mod : text inserted into the plot title.
    """
    estimator_counts = df['n_estimators'].values
    plt.title('Gradient Boosted Tree: '+mod +' Estimators vs Accuracy (Train and Test)')
    plt.plot(estimator_counts, df['Train Accuracy'].values, label = 'Mean Train Accuracy')
    plt.plot(estimator_counts, df['Valid Accuracy'].values, label = 'Valid Accuracy')
    # A single legend call is enough; the original called it twice.
    plt.legend()
    plt.xlabel('n_estimators')
    plt.ylabel('Accuracy')
    plt.show()

In [None]:
#Passing the training array with the selcted features
#best_model_gbt, df_gbt_raw = gbt(x_train_rf, y_train_rf,x_test_rf) #Assessed with the tress  20,50,70,90,100,150,200,220

In [None]:
#Passing the training array with the selcted features
best_model_gbt, df_gbt_raw = gbt(x_train_rf, y_train_rf,x_test_rf) 

In [None]:
plot_depth_accuracy_gb(df_gbt_raw,"Gradient Boosted Trees")

After testing the performance of each model with cross-validation, it can be concluded that the best validation accuracy is reported for the Random Forest and Gradient Boosting models.

The best classifer is for the values - 

One leave out Accuracy of the best model -  0.89

RandomForestClassifier(max_depth=18)

{'max_depth': 18, 'n_estimators': 100}

In [None]:
## Random forest with the depth / tree count reported by the RF grid search,
## trained and scored on the selected-feature split
rf_f=RandomForestClassifier(n_estimators=100,max_depth=18,random_state=27)
rf_f.fit(x_train_rf,y_train_rf)
# Fold-wise CV accuracy on the training split (computed on clones of rf_f).
score=cross_val_score(rf_f,x_train_rf,y_train_rf,scoring='accuracy',cv=5)
y_pred=rf_f.predict(x_test_rf)
test_accuracy=accuracy_score(y_test_rf,y_pred)

In [None]:
print("test_accuracy:",test_accuracy)

Testing results of the tuned RF
'max_depth': 18, 'max_features': 'sqrt', 'n_estimators': 175

In [None]:
## Same evaluation with the entropy criterion and 150 trees, on the selected-feature split
# NOTE(review): the text above this cell quotes n_estimators=175 while the code uses 150 - confirm.
rf_f=RandomForestClassifier(criterion='entropy',n_estimators=150,max_depth=18,random_state=27)
rf_f.fit(x_train_rf,y_train_rf)
# Fold-wise CV accuracy on the training split (computed on clones of rf_f).
score=cross_val_score(rf_f,x_train_rf,y_train_rf,scoring='accuracy',cv=5)
y_pred=rf_f.predict(x_test_rf)
test_accuracy=accuracy_score(y_test_rf,y_pred)

In [None]:
print("test_accuracy:",test_accuracy)

Extracting features after GMM fit 

In [None]:
# Hyper-parameter grid for the random forest search.
# min_samples_split must be an integer >= 2 (or a fraction), so the invalid value 1 is
# replaced; with 1 in the grid a third of the candidate fits fail.
grid_space={'max_depth':[3,5,10,None],
              'n_estimators':[10,100,200],
              'min_samples_leaf':[1,2,3],
              'min_samples_split':[2,3,4]
           }

In [None]:
# 10-fold grid search over `grid_space`; the forest is seeded so the search is repeatable.
rf=RandomForestClassifier(random_state=27)
grid_search_rf = GridSearchCV(rf, param_grid=grid_space, verbose=3,scoring='accuracy',cv=10).fit(x_train,y_train)
# Report the cross-validated score of the winning candidate. The previous version printed
# the refit model's accuracy on its own training data, which overstates performance.
print('best estimator RandomForest:',grid_search_rf.best_estimator_,'Best Score', grid_search_rf.best_score_)
rf_best = grid_search_rf.best_estimator_

In [None]:
## Final random forest used for the submission
# Train on the selected-feature, unscaled split. The model is later applied to `test_ext`,
# which holds the raw test values of those same selected features; fitting on the z-scored,
# all-feature `x_train` gave the model a different feature set and scale than the data it
# predicts on.
rf_f=RandomForestClassifier(max_depth=18,n_estimators=20,random_state=27,criterion='entropy')
rf_f.fit(x_train_rf,y_train_rf)
# Fold-wise CV accuracy on the training split (computed on clones of rf_f).
score=cross_val_score(rf_f,x_train_rf,y_train_rf,cv=5,scoring='accuracy')
y_pred=rf_f.predict(x_test_rf)
test_accuracy=accuracy_score(y_test_rf,y_pred)

In [None]:
print(test_accuracy)

In [None]:
# Competition test rows restricted to the features with non-zero importance.
kept_features=df_feature_imp.loc[df_feature_imp['feat_importance']>0,'Feature'].tolist()
test_ext=test_df[kept_features].to_numpy()
test_ext.shape

In [None]:
# Names of the features with non-zero importance (shown once; the repeated line had no effect).
df_feature_imp.loc[df_feature_imp['feat_importance']>0,'Feature'].tolist()

In [None]:
# Predict a label for every competition test row, using the selected-feature matrix.
pred_test_label=rf_f.predict(test_ext)
pred_test_label

In [None]:
pred_test_label[pred_test_label>0].shape

In [None]:
# Submission ids: the row index of the test frame shifted by one.
Id=(test_df.index+1).tolist()
Id

In [None]:
# Build the final submission table: one row per test id with its predicted label.
my_submission = pd.DataFrame({'Id':Id,'Solution': pred_test_label})
# Write the predictions to submission.csv in the working directory, without the row index.
my_submission.to_csv('submission.csv', index=False)
