In [1]:
## Importing necessary libraries

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
#sns.set_theme()
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, mutual_info_classif, f_classif
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import Perceptron
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline


  import pandas.util.testing as tm


### Loading the modified dataset

In [2]:
## Load the merged dataset and discard the attributes that are not needed
df = pd.read_csv('data_merged.csv').drop(columns=['mode', 'key_6_yr'])

## Every column except "popularity" is an input feature
## (Index.difference also returns the remaining column names in sorted order)
X = df.loc[:, df.columns.difference(['popularity'])]

## "popularity" is the dependent variable (the output), binarized into two classes:
## values in (-1, 42] get label 0 and values in (42, 100] get label 1
y = pd.cut(df["popularity"], bins=[-1, 42, 100], labels=[0, 1])

### Standardizing the data and splitting it into training and testing sets

In [3]:
## Split BEFORE scaling so the held-out rows never influence the scaler's statistics.
## Fitting the scaler on the full data set leaks test-set information into the training features.
## The split arguments are unchanged, so the same rows end up in each partition.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, stratify=y
)

## Learn the mean / standard deviation from the training rows only,
## then apply that same transformation to the test rows.
input_scaler = StandardScaler()
X_train = input_scaler.fit_transform(X_train_raw)
X_test = input_scaler.transform(X_test_raw)

## Full standardized matrix (using the train-fitted statistics), kept under its original
## name in case other cells refer to it. It contains the training rows, so it must not
## be used for evaluation.
X_normalized = input_scaler.transform(X)

### SVM with pipeline

In [4]:
## Hyperparameters are tuned according to gridsearch results
## NOTE(review): probability=True makes SVC run an additional internal cross-validation
## during fit to calibrate probabilities; hard voting presumably never calls predict_proba,
## so this may be avoidable cost -- confirm no other cell needs it before dropping it.
scaler = StandardScaler()
svm = SVC(kernel='rbf', probability=True, random_state=42)

## Two-stage pipeline: standardize the inputs, then classify with the RBF-kernel SVM
svm_pipe = Pipeline(steps=[
    ('scaler', scaler),
    ('svm', svm),
])

### Logistic Regressor with pipeline

In [5]:
## Hyperparameters are tuned according to gridsearch results
## Univariate selection: keep the 27 features with the best ANOVA F-score (f_classif)
select = SelectKBest(score_func=f_classif, k=27)
log_clf = LogisticRegression(random_state=42)

## Two-stage pipeline: feature selection, then logistic regression on the kept features
log_pipe = Pipeline(steps=[
    ('select', select),
    ('log_clf', log_clf),
])

### Decision Tree Classifier with pipeline

In [6]:
## Hyperparameters are tuned according to gridsearch results
## max_features='sqrt' replaces the former 'auto' alias: for classification trees 'auto'
## meant sqrt(n_features), and it was deprecated in scikit-learn 1.1 and removed in 1.3.
## random_state is fixed because the tree samples a random feature subset at every split;
## without a seed the fitted tree (and the ensemble's scores) change from run to run.
tree_clf = DecisionTreeClassifier(
    criterion='gini',
    max_depth=10,
    max_features='sqrt',
    min_samples_leaf=1,
    min_samples_split=12,
    random_state=42,
)
## Single-step pipeline so the tree is wrapped the same way as the other estimators
tree_pipe = Pipeline([('tree_clf', tree_clf)])

### Perceptron with pipeline

In [7]:
## Hyperparameters are tuned according to gridsearch results
## Perceptron with learning rate eta0=1, at most 10000 passes over the data, fixed seed
ppn_clf = Perceptron(eta0=1, max_iter=10000, random_state=0)

## Single-step pipeline wrapping the perceptron
ppn_pipe = Pipeline(steps=[('ppn_clf', ppn_clf)])

### Voting Classifier

In [None]:
## Ensemble of the four pipelines defined above; each pipeline casts one vote.
## voting='hard' -> the predicted class is the majority vote of the classifiers
## n_jobs=-1     -> use all available cores
vote_clf = VotingClassifier(
    estimators=[
        ('svm_pipe', svm_pipe),
        ('tree_pipe', tree_pipe),
        ('log_pipe', log_pipe),
        ('ppn_pipe', ppn_pipe),
    ],
    voting='hard',
    n_jobs=-1,
)

## Train every member pipeline on the training split
vote_clf.fit(X_train, y_train)

In [27]:

## Class predictions of the fitted voting ensemble for the held-out test split
y_pred = vote_clf.predict(X_test)

In [28]:
## Fraction of test samples whose predicted class equals the true class
accuracy_score(y_test, y_pred)

0.8826762662331379

### Confusion matrix

In [29]:
## confusion_matrix is already imported in the first cell, so it is not re-imported here.
## Rows are the true classes (0, 1); columns are the predicted classes (0, 1).
print(confusion_matrix(y_test, y_pred))


[[37857   954]
 [ 5108  7750]]


### Classification report

In [30]:
## Per-class precision / recall / F1 on the test split, printed under a heading
## (sep="\n" keeps the blank line between the heading and the table)
print("Classification report\n", classification_report(y_test, y_pred), sep="\n")

Classification report

              precision    recall  f1-score   support

           0       0.88      0.98      0.93     38811
           1       0.89      0.60      0.72     12858

    accuracy                           0.88     51669
   macro avg       0.89      0.79      0.82     51669
weighted avg       0.88      0.88      0.87     51669

