In [59]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
#Models
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from xgboost import plot_importance
from sklearn.metrics import classification_report
import sys
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, TomekLinks

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

# Contoh fungsi untuk memuat dataset (Anda perlu menyesuaikan sesuai dengan format dataset Anda)
def load_dataset(file_path):
    # Misalkan dataset adalah file CSV dengan kolom 'text' dan 'label'
    data = pd.read_csv(file_path)
    return data['Indikator'], data['Tipe']

In [79]:
#  Daftar model yang akan diuji
models = [
    ('Logistic Regression', LogisticRegression()),
    ('Stochastic Gradient Descent', SGDClassifier()),
    ('Random Forest', RandomForestClassifier()),
    ('Gradient Boosting', GradientBoostingClassifier()),
    ('XGBoost', XGBClassifier()),
    ('K-Nearest Neighbors', KNeighborsClassifier()),
    ('Naive Bayes', MultinomialNB()),
    ('Support Vector Machine', SVC())
]


In [80]:
resampling_methods = [
    ('None', None),
    ('SMOTE', SMOTE()),
    ('ADASYN', ADASYN()),
    ('RandomUnderSampler', RandomUnderSampler()),
    ('TomekLinks', TomekLinks())
]

In [81]:
# Pipeline untuk preprocessing dan model
def create_pipeline(model):
    return Pipeline([
        ('tfidf', TfidfVectorizer()),  # Preprocessing teks dengan TF-IDF
        # ('Resample', resampling_methods), # Resampling
        ('model', model)               # Model yang akan diuji
    ])

In [82]:
# Hyperparameter grid untuk grid search
param_grid = [
    {
        'model': [LogisticRegression()],
        'model__C': [0.1, 1, 10]
    },
    {
        'model': [MultinomialNB()],
    },
    {
        'model': [SVC()],
        'model__C': [0.1, 1, 10],
        'model__kernel': ['linear', 'rbf']
    },
    {
        'model': [RandomForestClassifier()],
        'n_estimators': [25, 50, 100, 150, 200],
        'max_features': ['sqrt', 'log2', None],
        'max_depth': [3, 6, 9, None],
        'max_leaf_nodes': [3, 6, 9, None],
    },
    {
        'model': [GradientBoostingClassifier()],
        'model__n_estimators': [50, 100, 200]
    },
    {
        'model': [XGBClassifier()],
        'model__n_estimators': [50, 100, 200]
    },
    {
        'model': [KNeighborsClassifier()],
        "n_neighbors" : [3,5,7],
        "weights" :['uniform', 'distance'],
        "leaf_size" : range(1,60),
        "p" : [1, 2],
        'metric' : ['minkowski','euclidean','manhattan']
    },
    {
        'model': [SGDClassifier()],
        "loss" : ['hinge','log_loss','modified_huber','perceptron','squared_error','squared_hinge','huber','psilon_insensitive','squared_psilon_insensitive'],
        "penalty" : ['l2','l1','elasticnet',None],
        "alpha" : [0.0001, 0.001, 0.01, 0.1, 1.0],
        "max_iter" : np.arange(0, 1000, 100),
    }
]

In [83]:
# Daftar file path dataset
dataset_files = ['/Users/tirtarumy/Documents/Data scientist porto/structured MBTI Classifiers/Research/Data/train_data/non_cum trainset/A1.csv',
                    '/Users/tirtarumy/Documents/Data scientist porto/structured MBTI Classifiers/Research/Data/train_data/non_cum trainset/A2.csv',
                    '/Users/tirtarumy/Documents/Data scientist porto/structured MBTI Classifiers/Research/Data/train_data/non_cum trainset/A3.csv',
                    '/Users/tirtarumy/Documents/Data scientist porto/structured MBTI Classifiers/Research/Data/train_data/non_cum trainset/P1.csv',
                    '/Users/tirtarumy/Documents/Data scientist porto/structured MBTI Classifiers/Research/Data/train_data/non_cum trainset/P2.csv',
                    '/Users/tirtarumy/Documents/Data scientist porto/structured MBTI Classifiers/Research/Data/train_data/non_cum trainset/P3.csv',
                    '/Users/tirtarumy/Documents/Data scientist porto/structured MBTI Classifiers/Research/Data/train_data/non_cum trainset/J1.csv',
                    '/Users/tirtarumy/Documents/Data scientist porto/structured MBTI Classifiers/Research/Data/train_data/non_cum trainset/J2.csv',
                    '/Users/tirtarumy/Documents/Data scientist porto/structured MBTI Classifiers/Research/Data/train_data/non_cum trainset/J3.csv',
                    '/Users/tirtarumy/Documents/Data scientist porto/structured MBTI Classifiers/Research/Data/train_data/non_cum trainset/C1.csv',
                    '/Users/tirtarumy/Documents/Data scientist porto/structured MBTI Classifiers/Research/Data/train_data/non_cum trainset/C2.csv',
                    '/Users/tirtarumy/Documents/Data scientist porto/structured MBTI Classifiers/Research/Data/train_data/non_cum trainset/C3.csv',
                    ]

In [84]:
# Menyimpan hasil model terbaik untuk setiap dataset
best_models = {}

for file_path in dataset_files:
    # Memuat dataset
    texts, labels = load_dataset(file_path)
    
    # Membagi data menjadi training dan test set
    X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.3, random_state=42)
    
    # Melakukan grid search dengan cross-validation
    grid_search = GridSearchCV(create_pipeline(LogisticRegression()), param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    
    # Menyimpan model terbaik
    best_model = grid_search.best_estimator_
    best_models[file_path] = best_model
    
    # Evaluasi model terbaik pada test set
    y_pred = best_model.predict(X_test)
    print(classification_report(y_test, y_pred))
    print(f"Best Model: {grid_search.best_params_}")
    print("\n")

ValueError: Invalid parameter 'max_depth' for estimator Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('model', RandomForestClassifier())]). Valid parameters are: ['memory', 'steps', 'verbose'].