In [11]:
## Importing necessary libraries

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_theme()
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, mutual_info_classif, f_classif
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

### Loading the modified dataset

In [12]:
## Load the merged dataset and discard the attributes that are not needed
df = pd.read_csv('data_merged.csv').drop(columns=['mode', 'key_6_yr'])

## Every column except "popularity" is a feature; Index.difference returns the
## remaining column names in sorted order, which fixes the feature order of X
feature_cols = df.columns.difference(['popularity'])
X = df[feature_cols]

## "popularity" is the dependent variable (the output). It is binned into two
## classes: (-1, 42] -> 0 and (42, 100] -> 1
y = pd.cut(x=df["popularity"], bins=[-1, 42, 100], labels=[0, 1])

### Splitting the data into training and testing sets

In [13]:
## Hold out 20% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

### SVM with pipeline

In [21]:
## Hyperparameters are tuned according to gridsearch results
## probability=True lets the SVC expose class probabilities, which the
## soft-voting ensemble defined later in this notebook relies on
svm = SVC(probability=True, kernel='rbf', random_state=42)

## Features are standardised before they reach the SVM
scaler = StandardScaler()

svm_pipe = Pipeline(steps=[
    ('scaler', scaler),
    ('svm', svm),
])

### Logistic Regressor with pipeline

In [22]:
## Hyperparameters are tuned according to gridsearch results
## Keep the 27 features with the highest ANOVA F-score, then fit the logistic model on them
log_clf = LogisticRegression(random_state=42)
select = SelectKBest(score_func=f_classif, k=27)

log_pipe = Pipeline(steps=[
    ('select', select),
    ('log_clf', log_clf),
])

### Decision Tree Classifier with pipeline

In [23]:
## Hyperparameters are tuned according to gridsearch results.
## random_state is pinned because max_features='sqrt' makes the tree draw a random
## feature subset at every split; without a seed each run fits a different tree.
## The seed matches the one used by the split, the SVC and the logistic regressor.
## NOTE(review): the saved VotingClassifier repr further down shows max_depth=20 and
## min_samples_leaf=4 for this step, so that output came from an older version of this
## cell -- restart the kernel and re-run everything before trusting the reported scores.
tree_clf = DecisionTreeClassifier(max_depth=8, max_features='sqrt', criterion='gini', min_samples_leaf=3,
                               min_samples_split=14, random_state=42)
## Single-step pipeline so the tree is wrapped the same way as the other estimators
tree_pipe = Pipeline([('tree_clf',tree_clf)])

### Voting Classifier

In [28]:
## The pipelines of the defined classifiers are the voters of the ensemble
voters = [
    ('svm_pipe', svm_pipe),
    ('tree_pipe', tree_pipe),
    ('log_pipe', log_pipe),
]

## voting='soft' combines the predicted class probabilities of the voters, so the
## more confident classifiers carry more weight in the final decision
## n_jobs=-1 uses all the cores
vote_clf = VotingClassifier(estimators=voters, n_jobs=-1, voting='soft')

## Kept as the last expression so the cell displays the fitted estimator
vote_clf.fit(X_train, y_train)

VotingClassifier(estimators=[('svm_pipe',
                              Pipeline(steps=[('scaler', StandardScaler()),
                                              ('svm',
                                               SVC(probability=True,
                                                   random_state=42))])),
                             ('tree_pipe',
                              Pipeline(steps=[('tree_clf',
                                               DecisionTreeClassifier(max_depth=20,
                                                                      max_features='sqrt',
                                                                      min_samples_leaf=4))])),
                             ('log_pipe',
                              Pipeline(steps=[('select', SelectKBest(k=27)),
                                              ('log_clf',
                                               LogisticRegression(random_state=42))]))],
                 n_jobs=-1, voting='soft')

In [31]:
## Class labels predicted by the ensemble for the held-out test rows
vote_pred = vote_clf.predict(X=X_test)

### Performance

In [32]:
## Per-class precision / recall / f1 of the ensemble on the test set
report = classification_report(y_true=y_test, y_pred=vote_pred)
print(report)

              precision    recall  f1-score   support

           0       0.90      0.96      0.93     25942
           1       0.84      0.67      0.75      8504

    accuracy                           0.89     34446
   macro avg       0.87      0.81      0.84     34446
weighted avg       0.88      0.89      0.88     34446

