#Getting ready to estimation!

It's used **sklearn version 1.0** in this script. The version of the library defines models and their hyperparameters to estimate. But you can use any version, just put suitable models and parameters or update your sklearn with below command

In [1]:
!pip install -U scikit-learn



# Importing dependencies

In [1]:
import itertools
import numpy as np
import pandas as pd
from tqdm import tqdm
import time

from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

# Handle with input data

Choose models and parameters. Put it into model list

In [3]:
params_dt = {'criterion' : ['gini', 'entropy'],
             'splitter': ['best', 'random'],
             'max_depth': [None, 2, 3, 5, 7, 9],
             'max_features':[None, 'auto', 'sqrt', 'log2']}

params_knn = {'n_neighbors': [3, 4, 5, 7],
              'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}

params_svc = {'kernel':['linear', 'poly', 'rbf', 'sigmoid', 'precomputed'],
              'degree':[2, 3, 4, 5, 6],
              'gamma':['scale', 'auto']}

params_lr = {'penalty' : ['l1', 'l2', 'elasticnet', 'none'],
             'fit_intercept': [True, False],
             'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}

params_gnb = {'var_smoothing':[1e-9, 1e-8, 1e-7]}

params_rc = {'n_estimators': [100, 150, 200, 250],
             'criterion': ['gini', 'entropy'],
             'max_depth': [None, 5, 7, 9],
             'max_features': ['auto', 'sqrt', 'log2']}

In [4]:
# models = [(DecisionTreeClassifier, params_dt), (KNeighborsClassifier, params_knn), 
#           (SVC, params_svc), (LogisticRegression, params_lr), 
#           (GaussianNB, params_gnb), (RandomForestClassifier, params_rc)]
models = [(DecisionTreeClassifier, params_dt), (KNeighborsClassifier, params_knn), 
          (LogisticRegression, params_lr), 
          (GaussianNB, params_gnb), (RandomForestClassifier, params_rc)]          

In [6]:
dataset = pd.read_csv("messages_group_db.csv", sep="\t")

In [7]:
# Ensure label balance
dataset.sort_values(by='label', ascending=True, inplace=True)
dataset.reset_index(inplace=True, drop=True)
dataset = dataset.iloc[len(dataset['label'])-dataset[dataset['label']==1].count().values[0]*2:,:]

In [8]:
X = dataset['message']
tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1,2))
X = tfidf.fit_transform(X)
y = dataset['label']

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Main function

In [10]:
def params_model_selection(model, parameters) -> pd.DataFrame:
    """
    Function to run DS model with different hyperparameters in order to 
    estimate it and choose the one with the highest accuracy
    """
    def combination_params(*params):
        return itertools.product(*params)
    accs = []
    params_final = []
    params_values = list(combination_params(*list(parameters.values())))
    print(f"Estimation parameters of {model.__name__} model")

    for c in tqdm(params_values, position=0, leave=False):
        # print(c)
        try:
            params = dict(zip(tuple(parameters.keys()), c))
            classifier = model(**params)


            classifier.fit(X_train, y_train)

            y_pred = classifier.predict(X_test)

            accs.append(accuracy_score(y_test, y_pred))
            params_final.append(params)
            # print(f' Params {params_values.index(c)+1}/{len(params_values)} of {model.__name__} model is estimated')
        except ValueError:
            pass
        
    print(f'Estimation {model.__name__} model done!')
    df_res = pd.DataFrame({'Model':model.__name__, 'Accuracy': accs, 'params_final': params_final})
    max_acc = df_res['Accuracy'].max()
    print(f"Max value of accuracy is {max_acc}")
    return df_res


# Create result in table

In [11]:
df_result = pd.DataFrame(columns=['Model', 'Accuracy', 'params_final'])

In [12]:
for model in models:
    try:
        df_result = pd.concat([df_result, params_model_selection(model[0], model[1])])
        print('\n-----------------------')
    except:
        pass

  3%|▎         | 3/96 [00:00<00:03, 29.81it/s]

Estimation parameters of DecisionTreeClassifier model


  0%|          | 0/40 [00:00<?, ?it/s]          

Estimation DecisionTreeClassifier model done!
Max value of accuracy is 0.7414965986394558

-----------------------
Estimation parameters of KNeighborsClassifier model
Estimation KNeighborsClassifier model done!
Max value of accuracy is 0.4965986394557823

-----------------------
Estimation parameters of LogisticRegression model


  0%|          | 0/96 [00:00<?, ?it/s]         

Estimation LogisticRegression model done!
Max value of accuracy is 0.7959183673469388

-----------------------
Estimation parameters of GaussianNB model
Estimation parameters of RandomForestClassifier model


                                               

Estimation RandomForestClassifier model done!
Max value of accuracy is 0.7959183673469388

-----------------------




In [13]:
df_result

Unnamed: 0,Model,Accuracy,params_final
0,DecisionTreeClassifier,0.741497,"{'criterion': 'gini', 'splitter': 'best', 'max..."
1,DecisionTreeClassifier,0.673469,"{'criterion': 'gini', 'splitter': 'best', 'max..."
2,DecisionTreeClassifier,0.687075,"{'criterion': 'gini', 'splitter': 'best', 'max..."
3,DecisionTreeClassifier,0.673469,"{'criterion': 'gini', 'splitter': 'best', 'max..."
4,DecisionTreeClassifier,0.646259,"{'criterion': 'gini', 'splitter': 'best', 'max..."
...,...,...,...
91,RandomForestClassifier,0.755102,"{'n_estimators': 250, 'criterion': 'entropy', ..."
92,RandomForestClassifier,0.673469,"{'n_estimators': 250, 'criterion': 'entropy', ..."
93,RandomForestClassifier,0.775510,"{'n_estimators': 250, 'criterion': 'entropy', ..."
94,RandomForestClassifier,0.761905,"{'n_estimators': 250, 'criterion': 'entropy', ..."


In [14]:
df_result[df_result['Accuracy']==df_result['Accuracy'].max()]

Unnamed: 0,Model,Accuracy,params_final
17,LogisticRegression,0.795918,"{'penalty': 'none', 'fit_intercept': True, 'so..."
40,RandomForestClassifier,0.795918,"{'n_estimators': 150, 'criterion': 'entropy', ..."


In [15]:
list(df_result[df_result['Accuracy']==df_result['Accuracy'].max()]['params_final'])

[{'penalty': 'none', 'fit_intercept': True, 'solver': 'saga'},
 {'n_estimators': 150,
  'criterion': 'entropy',
  'max_depth': 5,
  'max_features': 'sqrt'}]