# Soft Voting Classification

## Importing necessary libraries

In [28]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_theme()
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, mutual_info_classif, f_classif, SelectPercentile, chi2
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

### Loading the modified dataset

In [29]:
df = pd.read_csv('dataset/data_merged.csv')
del df['mode'] ## Deleted unnecessary attributes
del df['key_6_yr']

## Drppoing the "popularity" attribute from the data frame and declaring it as the dependant variable or the  output
X = df[df.columns.difference(['popularity'])]
y = pd.cut(x=df["popularity"], bins=[-1,42,100], labels=[0,1])

In [30]:
from sklearn.base import TransformerMixin, BaseEstimator

# custom transformer for sklearn pipeline
class ColumnExtractor(TransformerMixin, BaseEstimator):
    def __init__(self, cols):
        self.cols = cols

    def transform(self, X):
        X_new = df[df.columns.difference(self.cols)]
        return X_new[:10000]

    def fit(self, X, y=None):
        return self

### Splitting the data into training and testing sets

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y, random_state=42)

### SVM with pipeline

In [32]:
select = SelectPercentile(chi2, percentile=10)
scaler = StandardScaler()

## Hyperparameters are tuned according to gridsearch results
svm = SVC(kernel="linear", C=1000, random_state=42, probability=True)

svm_pipe = Pipeline(
    [
        (
            "extract_cols",
            ColumnExtractor(cols=["loudness", "loudness_ar", "loudness_yr"]),
        ),
        ("select", select),
        ("scaler", scaler),
        ("svm", svm),
    ]
)

### Logistic Regressor with pipeline

In [33]:
## number of features selected through previous runs of logistic regression
select = SelectKBest(f_classif, k=27)
log_clf = LogisticRegression(random_state=42)

log_pipe = Pipeline([('select', select), ('log_clf', log_clf)])

### Decision Tree Classifier with pipeline

## Hyperparameters are tuned according to gridsearch results

In [34]:
tree_clf = DecisionTreeClassifier(max_depth=10, max_features='auto', criterion='gini', min_samples_leaf=1,
                               min_samples_split=12)
tree_pipe = Pipeline([('tree_clf',tree_clf)])

### Voting Classifier

## Defining the Voting Classifier

In [35]:
vote_clf = VotingClassifier(
    estimators=[
        ("svm_pipe", svm_pipe),
        ("tree_pipe", tree_pipe),
        ("log_pipe", log_pipe),
    ],
    voting="soft",
    n_jobs=-1,
)

## n_jobs=-1 uses all the cores
## Soft voting gives more weightage to the more confident classifiers
vote_clf.fit(X_train, y_train)

ValueError: Found input variables with inconsistent numbers of samples: [10000, 137784]

In [31]:
vote_pred = vote_clf.predict(X_test)

### Performance

In [32]:
print(classification_report(y_test, vote_pred))

              precision    recall  f1-score   support

           0       0.90      0.96      0.93     25942
           1       0.84      0.67      0.75      8504

    accuracy                           0.89     34446
   macro avg       0.87      0.81      0.84     34446
weighted avg       0.88      0.89      0.88     34446

