In [1]:
# Import the libraries used in this notebook (notebook reading, data handling, modelling, evaluation)
import nbformat
from nbformat import read
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.dummy import DummyClassifier
from sklearn.metrics import f1_score, classification_report
from sklearn.preprocessing import LabelEncoder
from tqdm.notebook import tqdm
import pickle

In [3]:
# Restore the pre-split training/validation inputs and labels (four objects pickled together).
# pickle.load can execute arbitrary code, so only open a file this project produced itself.
with open('split_data_v2.pkl', mode='rb') as split_file:
    X_train, X_val, y_train, y_val = pickle.load(split_file)

In [4]:
# Text variants to compare: the question alone, the best answer alone,
# and both joined by a single space
feature_sets = dict(
    question=X_train['question'],
    best_answer=X_train['best_answer'],
    combined=X_train['question'] + ' ' + X_train['best_answer'],
)

In [5]:
# Step 1: TF-IDF Encoding for Different Feature Sets and TF-IDF Methods
def _feature_texts(frame):
    """Return the three text variants (question, best_answer, combined) of one data split."""
    return {
        'question': frame['question'],
        'best_answer': frame['best_answer'],
        'combined': frame['question'] + ' ' + frame['best_answer'],
    }


def tfidf_transform(X_train, X_val):
    """Encode every text variant with every TF-IDF configuration and pickle the results.

    Both splits' texts are derived from the arguments themselves, so the function
    no longer depends on a notebook-global ``feature_sets`` having been built from
    the same training data beforehand.

    Args:
        X_train: Training split, indexable by 'question' and 'best_answer'
            (presumably a pandas DataFrame of strings -- confirm against the
            cell that creates the split). Each vectorizer is fitted on it.
        X_val: Validation split with the same two columns. It is only
            transformed, never fitted on.

    Returns:
        None. A dict mapping (feature_name, tfidf_name) to
        (train_matrix, validation_matrix) is written to 'encoded_sets_v2.pkl'.
    """
    tfidf_configs = {
        'TF-IDF (ngram_range=(1,1))': {'max_features': 10000, 'ngram_range': (1, 1)},
        'TF-IDF (ngram_range=(1,2))': {'max_features': 10000, 'ngram_range': (1, 2)},
        'TF-IDF (sublinear_tf=True)': {'max_features': 10000, 'sublinear_tf': True},
    }

    # Build train and validation texts with the same rule so they always line up
    train_texts = _feature_texts(X_train)
    val_texts = _feature_texts(X_val)

    encoded_sets = {}
    for feature_name, X_train_text in train_texts.items():
        X_val_text = val_texts[feature_name]

        for tfidf_name, tfidf_kwargs in tfidf_configs.items():
            # Fresh vectorizer per combination: the vocabulary is learned on the
            # training text only, then reused to encode the validation text
            tfidf = TfidfVectorizer(**tfidf_kwargs)
            X_train_tfidf = tfidf.fit_transform(X_train_text)
            X_val_tfidf = tfidf.transform(X_val_text)

            # Store encoded sets
            encoded_sets[(feature_name, tfidf_name)] = (X_train_tfidf, X_val_tfidf)

    # Save encoded sets to a file for later use
    with open('encoded_sets_v2.pkl', 'wb') as f:
        pickle.dump(encoded_sets, f)

In [6]:
# Encode both splits with every TF-IDF configuration; the result is written to disk
tfidf_transform(X_train=X_train, X_val=X_val)

In [7]:
# Step 2: Classifier Training and Evaluation Function
def classifier_training_and_evaluation(encoded_file, y_train, y_validation, random_state=42):
    """Train each classifier on every encoded feature set and print validation scores.

    Args:
        encoded_file: Path to a pickle file holding a dict that maps
            (feature_name, tfidf_name) to (train_matrix, validation_matrix).
        y_train: Training labels passed to each estimator's ``fit``.
        y_validation: Validation labels the predictions are scored against.
        random_state: Seed handed to the stochastic estimators (random baseline,
            saga logistic regression, random forest) so repeated runs print the
            same scores. Pass ``None`` for the previous unseeded behaviour.

    Returns:
        None. Classification reports and one F1 table per classifier are printed.
    """
    # NOTE: pickle.load can execute arbitrary code; only pass files this notebook wrote.
    with open(encoded_file, 'rb') as f:
        encoded_sets = pickle.load(f)

    # Factories instead of shared instances: every (classifier, feature set) run
    # starts from a fresh, identically configured estimator.
    classifier_factories = {
        'Random Classifier': lambda: DummyClassifier(strategy='uniform', random_state=random_state),
        'Naive Bayes': lambda: MultinomialNB(),
        'Logistic Regression': lambda: LogisticRegression(max_iter=1000, solver='saga',
                                                          random_state=random_state),
        'Random Forest': lambda: RandomForestClassifier(random_state=random_state)
    }

    # Rows of the report dict that summarise across classes rather than describe one class
    aggregate_rows = {'micro avg', 'macro avg', 'weighted avg', 'samples avg'}

    for classifier_name, make_classifier in classifier_factories.items():
        results = []
        for (feature_name, tfidf_name), (X_train_tfidf, X_validation_tfidf) in encoded_sets.items():
            print(
                f'Starting training for {classifier_name} with Feature Set: {feature_name}, TF-IDF Method: {tfidf_name}')

            # Train directly on the pre-encoded matrices (no extra pipeline steps are needed)
            model = make_classifier()
            model.fit(X_train_tfidf, y_train)

            # Make predictions
            y_pred = model.predict(X_validation_tfidf)

            # Detailed Classification Report: dict form for extraction, text form for display
            report = classification_report(y_validation, y_pred, output_dict=True)
            report_text = classification_report(y_validation, y_pred)
            print(
                f'Classification Report for {classifier_name} with Feature Set: {feature_name}, TF-IDF Method: {tfidf_name}:\n{report_text}')

            # Extract F1 scores for each class. Per-class rows are the dict-valued
            # entries that are not aggregate rows; this keeps non-numeric class
            # labels, which a digit-only check would silently drop.
            f1_scores_per_class = {
                f'F1 Score (Class {label})': metrics['f1-score']
                for label, metrics in report.items()
                if isinstance(metrics, dict) and label not in aggregate_rows
            }
            f1_scores_per_class['Feature Set'] = feature_name
            f1_scores_per_class['TF-IDF Method'] = tfidf_name
            f1_scores_per_class['Model'] = classifier_name
            results.append(f1_scores_per_class)

            print(
                f'{feature_name}, {tfidf_name}, {classifier_name} finished: Weighted F1 score is {report.get("weighted avg", {}).get("f1-score")}')

        # Display results as a DataFrame for each classifier
        results_df = pd.DataFrame(results)
        print(f'F1 Scores for {classifier_name}:')
        print(results_df)

In [None]:
# Fit every classifier on each saved (feature set, TF-IDF method) encoding and print its scores
classifier_training_and_evaluation(
    encoded_file='encoded_sets_v2.pkl',
    y_train=y_train,
    y_validation=y_val,
)

Starting training for Random Classifier with Feature Set: question, TF-IDF Method: TF-IDF (ngram_range=(1,1))
Classification Report for Random Classifier with Feature Set: question, TF-IDF Method: TF-IDF (ngram_range=(1,1)):
              precision    recall  f1-score   support

           0       0.10      0.10      0.10     27495
           1       0.10      0.10      0.10     27713
           2       0.10      0.10      0.10     27209
           3       0.10      0.10      0.10     27038
           4       0.09      0.10      0.10     26189
           5       0.10      0.10      0.10     27562
           6       0.10      0.10      0.10     27104
           7       0.10      0.10      0.10     27005
           8       0.10      0.10      0.10     26571
           9       0.10      0.10      0.10     27434

    accuracy                           0.10    271320
   macro avg       0.10      0.10      0.10    271320
weighted avg       0.10      0.10      0.10    271320

question, TF-IDF