In [40]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
#Models
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from xgboost import plot_importance
from sklearn.metrics import classification_report
import sys
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, TomekLinks

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

def load_dataset(file_path):
    # Misalkan dataset adalah file CSV dengan kolom 'text' dan 'label'
    data = pd.read_csv(file_path)
    return data['Indikator'], data['Tipe']

In [41]:
#  Daftar model yang akan diuji
models = [
    ('Logistic Regression', LogisticRegression()),
    ('Stochastic Gradient Descent', SGDClassifier()),
    ('Random Forest', RandomForestClassifier()),
    ('Gradient Boosting', GradientBoostingClassifier()),
    ('XGBoost', XGBClassifier()),
    ('K-Nearest Neighbors', KNeighborsClassifier()),
    ('Naive Bayes', MultinomialNB()),
    ('Support Vector Machine', SVC())
]


In [42]:
resampling_techniques = [
    ('None', None),
    ('SMOTE', SMOTE()),
    ('ADASYN', ADASYN()),
    ('RandomUnderSampler', RandomUnderSampler()),
    ('TomekLinks', TomekLinks())
]

In [43]:
for name, model in models:
    pipe = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('model', model)
    ])

In [44]:
param_grid = {
    'Logistic Regression': {
        'model__penalty': ['l1', 'l2', 'elasticnet', 'none'],
        'model__C': [0.01, 0.1, 1, 10, 100],
        'model__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
        'model__max_iter': [100, 200, 300, 400, 500]
    },
    'Stochastic Gradient Descent': {
        'model__loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
        'model__penalty': ['l2', 'l1', 'elasticnet'],
        'model__alpha': [0.0001, 0.001, 0.01, 0.1, 1],
        'model__max_iter': [1000, 2000, 3000],
        'model__learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive']
    },
    'Random Forest': {
        'model__n_estimators': [100, 200, 300, 400, 500],
        'model__criterion': ['gini', 'entropy'],
        'model__max_depth': [None, 10, 20, 30, 40, 50],
        'model__min_samples_split': [2, 5, 10],
        'model__min_samples_leaf': [1, 2, 4],
        'model__bootstrap': [True, False]
    },
    'Gradient Boosting': {
        'model__loss': ['deviance', 'exponential'],
        'model__learning_rate': [0.01, 0.1, 0.2, 0.3],
        'model__n_estimators': [100, 200, 300, 400, 500],
        'model__subsample': [0.6, 0.8, 1.0],
        'model__criterion': ['friedman_mse', 'mse', 'mae'],
        'model__max_depth': [3, 4, 5, 6, 7, 8],
        'model__min_samples_split': [2, 5, 10],
        'model__min_samples_leaf': [1, 2, 4],
        'model__max_features': ['auto', 'sqrt', 'log2', None]
    },
    'XGBoost': {
        'model__n_estimators': [100, 200, 300, 400, 500],
        'model__learning_rate': [0.01, 0.1, 0.2, 0.3],
        'model__max_depth': [3, 4, 5, 6, 7, 8],
        'model__min_child_weight': [1, 3, 5],
        'model__subsample': [0.6, 0.8, 1.0],
        'model__colsample_bytree': [0.6, 0.8, 1.0],
        'model__gamma': [0, 0.1, 0.2, 0.3]
    },
    'K-Nearest Neighbors': {
        'model__n_neighbors': [3, 5, 7, 9, 11],
        'model__weights': ['uniform', 'distance'],
        'model__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        'model__leaf_size': [20, 30, 40, 50],
        'model__p': [1, 2]
    },
    'Naive Bayes': {
        'model__alpha': [0.01, 0.1, 0.5, 1.0, 2.0]
    },
    'Support Vector Machine': {
        'model__C': [0.1, 1, 10, 100, 1000],
        'model__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'model__gamma': ['scale', 'auto'],
        'model__degree': [3, 4, 5],
        'model__probability': [True]
    }
}

In [45]:
# Daftar file path dataset
dataset_files = ['/Users/tirtarumy/Documents/Data scientist porto/structured MBTI Classifiers/Research/Data/train_data/non_cum trainset/A1.csv',
                    '/Users/tirtarumy/Documents/Data scientist porto/structured MBTI Classifiers/Research/Data/train_data/non_cum trainset/A2.csv',
                    '/Users/tirtarumy/Documents/Data scientist porto/structured MBTI Classifiers/Research/Data/train_data/non_cum trainset/A3.csv',
                    '/Users/tirtarumy/Documents/Data scientist porto/structured MBTI Classifiers/Research/Data/train_data/non_cum trainset/P1.csv',
                    '/Users/tirtarumy/Documents/Data scientist porto/structured MBTI Classifiers/Research/Data/train_data/non_cum trainset/P2.csv',
                    '/Users/tirtarumy/Documents/Data scientist porto/structured MBTI Classifiers/Research/Data/train_data/non_cum trainset/P3.csv',
                    '/Users/tirtarumy/Documents/Data scientist porto/structured MBTI Classifiers/Research/Data/train_data/non_cum trainset/J1.csv',
                    '/Users/tirtarumy/Documents/Data scientist porto/structured MBTI Classifiers/Research/Data/train_data/non_cum trainset/J2.csv',
                    '/Users/tirtarumy/Documents/Data scientist porto/structured MBTI Classifiers/Research/Data/train_data/non_cum trainset/J3.csv',
                    '/Users/tirtarumy/Documents/Data scientist porto/structured MBTI Classifiers/Research/Data/train_data/non_cum trainset/C1.csv',
                    '/Users/tirtarumy/Documents/Data scientist porto/structured MBTI Classifiers/Research/Data/train_data/non_cum trainset/C2.csv',
                    '/Users/tirtarumy/Documents/Data scientist porto/structured MBTI Classifiers/Research/Data/train_data/non_cum trainset/C3.csv',
                    ]

In [50]:
best_models = {}

for file_path in dataset_files:
    # Memuat dataset
    texts, labels = load_dataset(file_path)
    
    # Membagi data menjadi training dan test set
    X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.3, random_state=42)
    
    for name, model in models:
        for resample_name, resampler in resampling_techniques:
            # Pipeline dengan resampling, embedding, dan model
                pipe = Pipeline([
                ('embed', TfidfVectorizer()),
                ('model', model)
            ])
                
            # Melakukan grid search dengan cross-validation
                grid = GridSearchCV(pipe, param_grid=param_grid[name], cv=5, scoring='accuracy')
                grid.fit(X_train, y_train)
                
            # Menyimpan model terbaik
                best_model = grid.best_estimator_
                best_models[(file_path, name, TfidfVectorizer())] = best_model
                
            # Evaluasi model terbaik pada test set
                y_pred = best_model.predict(X_test)
                print(f"Dataset: {file_path}")
                print(f"Model: {name}")
                print(f"Embedding: TF-IDF")
                print(classification_report(y_test, y_pred))
                print(f"Best Model: {grid.best_params_}")
                print("\n")

Dataset: /Users/tirtarumy/Documents/Data scientist porto/structured MBTI Classifiers/Research/Data/train_data/non_cum trainset/A1.csv
Model: Logistic Regression
Embedding: TF-IDF
              precision    recall  f1-score   support

           E       0.74      1.00      0.85        23
           I       0.00      0.00      0.00         8

    accuracy                           0.74        31
   macro avg       0.37      0.50      0.43        31
weighted avg       0.55      0.74      0.63        31

Best Model: {'model__C': 0.01, 'model__max_iter': 100, 'model__penalty': 'l1', 'model__solver': 'liblinear'}


Dataset: /Users/tirtarumy/Documents/Data scientist porto/structured MBTI Classifiers/Research/Data/train_data/non_cum trainset/A1.csv
Model: Logistic Regression
Embedding: TF-IDF
              precision    recall  f1-score   support

           E       0.74      1.00      0.85        23
           I       0.00      0.00      0.00         8

    accuracy                           0

KeyboardInterrupt: 