In [4]:
# Uncomment the following lines to install required packages if they are not already installed
# !pip install pandas
# !pip install numpy
# !pip install scikit-learn

# Pandas for handling CSV files
import pandas as pd

# For printing confusion matrix/coefficients
import numpy as np

# For writing results to a CSV file
import csv

# Import scikit learn and necessary modules
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import GridSearchCV

# Verify version numbers
# print(f"Pandas version: {pd.__version__}")
# print(f"NumPy version: {np.__version__}")
# print(f"Scikit-learn version: {sklearn.__version__}")

# import sys
# print("Python version:", sys.version)

Pandas version: 2.0.3
NumPy version: 1.24.3
Scikit-learn version: 1.3.0
Python version: 3.11.5 | packaged by Anaconda, Inc. | (main, Sep 11 2023, 13:26:23) [MSC v.1916 64 bit (AMD64)]


In [None]:
# Function to load data into pandas
def loaddata(filename, column):
    """Read one column of a CSV file into a plain Python list.

    Args:
        filename: Path of the CSV file, passed straight to ``pd.read_csv``.
        column: Name of the column to extract.

    Returns:
        A list with the column's values in file order; missing (NaN)
        entries are replaced by the empty string.
    """
    frame = pd.read_csv(filename)
    # Blank out NaN cells so downstream consumers never see a float NaN
    cleaned = frame[column].fillna("")
    return list(cleaned)

# Load the columns needed from each CSV file.
# Training file: raw document text and its label.
train_txt, train_labels = (loaddata("train.csv", col) for col in ("TEXT", "LABEL"))
# Test file: row identifiers (for the results file) and raw document text.
test_ids, test_txt = (loaddata("test.csv", col) for col in ("ID", "TEXT"))

# # count labels per class
# from collections import Counter
# label_counts = Counter(train_labels)
# print(label_counts)

In [None]:
# Tunable constants for this cell, gathered in one place
RANDOM_STATE = 42    # shared seed so the split and the solver are repeatable
DEV_FRACTION = 0.2   # share of the labelled data held out as the development set
K_BEST = 215000      # upper bound on the number of chi2-selected features

# Encode labels as integers (label_encoder is reused by later cells to decode)
label_encoder = LabelEncoder()
y_all = label_encoder.fit_transform(train_labels)

# Split BEFORE fitting any transformer, so the development documents and their
# labels never influence the vocabulary, the IDF weights or the chi2 feature
# ranking. Raw texts and labels are split in a single call, so the rows of
# train_txt_dev / y_dev line up by construction (needed when printing
# misclassifications later).
train_txt_train, train_txt_dev, y_train, y_dev = train_test_split(
    train_txt, y_all, test_size=DEV_FRACTION, random_state=RANDOM_STATE
)

# Initialize TF-IDF Vectorizer for getting features
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 5),
    binary=True,
    max_df=0.7,
    min_df=3,
    norm='l2',
    sublinear_tf=True,
    lowercase=True,
#     token_pattern=r'(?u)\b(?!\bbr\b)\w\w+\b' #this removes br left over from html tags, but seems to decrease F1 score
)

# Fit the vectorizer on the training split only, then apply it to dev and test
X_train_tfidf = tfidf_vectorizer.fit_transform(train_txt_train)
X_dev_tfidf = tfidf_vectorizer.transform(train_txt_dev)
X_test_tfidf = tfidf_vectorizer.transform(test_txt)

# Find K Best Features (most statistically relevant features with respect to labels).
# k is capped at the vocabulary size because asking for more features than
# exist is rejected by SelectKBest in some scikit-learn releases.
k_best = SelectKBest(score_func=chi2, k=min(K_BEST, X_train_tfidf.shape[1]))
X_train_kbest = k_best.fit_transform(X_train_tfidf, y_train)
X_dev_kbest = k_best.transform(X_dev_tfidf)
X_test_kbest = k_best.transform(X_test_tfidf)

# Short names used by the rest of the notebook
X_train, X_dev = X_train_kbest, X_dev_kbest

# Sanity checks: features, labels and raw texts must describe the same rows
assert X_train.shape[0] == len(y_train) == len(train_txt_train)
assert X_dev.shape[0] == len(y_dev) == len(train_txt_dev)

# Train logistic regression model on training data
model = LogisticRegression(
    solver='saga',
    penalty='l2',
    C=5,
    max_iter=2000,
    class_weight='balanced',    # more 0 labels, so makes sense to balance classes
    random_state=RANDOM_STATE,  # saga shuffles the data; seed it for repeatable fits
)

# Train the logistic regression model using the selected features and corresponding labels
model.fit(X_train, y_train)

# Evaluate model on both splits
y_train_pred = model.predict(X_train)
y_dev_pred = model.predict(X_dev)

# Get F1 Scores (weighted by class support)
train_f1 = f1_score(y_train, y_train_pred, average='weighted')
dev_f1 = f1_score(y_dev, y_dev_pred, average='weighted')
print("Development F1 Score:", dev_f1)
print("Training F1 Score:", train_f1)

# Predict on test data and map the integer predictions back to the original labels
test_predictions = model.predict(X_test_kbest)
decoded_test_predictions = label_encoder.inverse_transform(test_predictions)

In [None]:
# # Save predictions to a CSV file
# decoded_test_predictions_list = decoded_test_predictions.tolist()  # Convert numpy array to list
# with open('results.csv', 'w', newline='') as csv_file:
#     csv_writer = csv.writer(csv_file)
#     csv_writer.writerow(['ID', 'LABEL'])
#     for idnum, prediction in zip(test_ids, decoded_test_predictions_list):
#         csv_writer.writerow([idnum, prediction])


In [None]:
# Print misclassifications
def print_misclassified(texts, labels, predictions, set_name):
    """Print every example whose predicted label differs from its true label.

    Args:
        texts: Raw documents, in the same order as ``labels`` and ``predictions``.
        labels: Encoded true labels.
        predictions: Encoded predicted labels.
        set_name: Name of the data split, used only in the printed header.

    Uses the notebook-level ``label_encoder`` to turn encoded labels back into
    their original values before comparing and printing them.
    """
    actual = label_encoder.inverse_transform(labels)
    guessed = label_encoder.inverse_transform(predictions)
    print(f"\nMisclassified examples in {set_name} set:")

    # Lazily keep only the rows where the model disagreed with the truth
    mistakes = (
        (document, gold, guess)
        for document, gold, guess in zip(texts, actual, guessed)
        if gold != guess
    )
    for document, gold, guess in mistakes:
        print(f"Text: {document} \nTrue label: {gold}, \nPredicted: {guess}", "\n")
        print()

# Show the development-set mistakes; the training-set call is kept for reference.
# print_misclassified(train_txt_train, y_train, y_train_pred, "training")
print_misclassified(
    texts=train_txt_dev,
    labels=y_dev,
    predictions=y_dev_pred,
    set_name="development",
)

In [None]:
# Print features before SelectKBest
print("Total number of features before SelectKBest:", X_train_tfidf.shape[1], "\n")

# Get class labels (original, un-encoded values, in encoder order)
class_labels = label_encoder.classes_

# Confusion matrix.
# Pass every encoded label explicitly: without `labels=` the matrix only covers
# classes that actually occur in y_dev / y_dev_pred, so a class missing from the
# development set would make the matrix smaller than the row/column names below
# and the DataFrame construction would fail.
cm = confusion_matrix(y_dev, y_dev_pred, labels=list(range(len(class_labels))))

# Print confusion matrix with row and column labels
cm_df = pd.DataFrame(cm,
                     index=[f"Actual {label}" for label in class_labels],
                     columns=[f"Predicted {label}" for label in class_labels])

print("Confusion Matrix for Development Set:")
print(cm_df, "\n")

# Get feature names from TF-IDF Vectorizer
feature_names = np.array(tfidf_vectorizer.get_feature_names_out())

# Keep only the names of the features SelectKBest retained. The model's
# coefficient columns refer to these, in this order. Computed once here rather
# than once per class inside the loop.
selected_feature_names = feature_names[k_best.get_support()]

# Get coefficients from the Logistic Regression model
coefficients = model.coef_

# With exactly two classes, coef_ has a single row that describes the second
# class; the first class is its mirror image. Expand to one row per class so
# every class is reported under its own name.
if coefficients.shape[0] == 1:
    class_coefficients = np.array([-coefficients[0], coefficients[0]])
else:
    class_coefficients = coefficients

# Find top features for classes
top_n = 10  # Number of top features to show
print("Top Features:")
for class_index in range(class_coefficients.shape[0]):

    # Indices of the top_n largest coefficients, ordered weakest to strongest
    top_features_indices = np.argsort(class_coefficients[class_index])[-top_n:]

    # Get feature names
    top_features = selected_feature_names[top_features_indices]

    # Print top features for class
    print(f"Class {label_encoder.inverse_transform([class_index])[0]}:")
    for feature in top_features:
        print(feature)
    print()

In [None]:
# GridSearchCV for Parameter Testing

# from sklearn.pipeline import Pipeline
# from sklearn.model_selection import GridSearchCV

# # Define pipeline with some of my existing params
# pipeline = Pipeline([
#     ('tfidf', TfidfVectorizer(lowercase=True, ngram_range=(1,3), binary=True, max_df=0.7, min_df=3, norm='l2', sublinear_tf=True)),
#     ('kbest', SelectKBest(score_func=chi2, k=215000)),
#     ('logreg', LogisticRegression(class_weight='balanced'))
# ])

# # Create parameter grid
# param_grid = {
#     'tfidf__norm': ['l1', 'l2'],
# #     'kbest__k': [200000, 215000, 230000, 250000],
#     'logreg__solver': ['sag', 'saga', 'lbfgs'],
#     'logreg__C': [3,5,7],
#     'logreg__max_iter': [1000, 2000, 3000]
# }

# # Initialize GridSearchCV
# grid_search = GridSearchCV(pipeline, param_grid, scoring='f1_weighted', cv=3, verbose=1)

# # Fit GridSearchCV on training data
# grid_search.fit(train_txt, train_labels)

# # Print best parameters and best score
# print("Best parameters:", grid_search.best_params_)
# print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

# # Get best model
# best_pipeline = grid_search.best_estimator_

# # Split the data again
# X_train, X_dev, y_train, y_dev = train_test_split(train_txt, train_labels, test_size=0.2, random_state=42)

# # Evaluate using the best model from GridSearch
# y_dev_pred = best_pipeline.predict(X_dev)
# dev_f1 = f1_score(label_encoder.transform(y_dev), label_encoder.transform(y_dev_pred), average='weighted')

# print("Development F1 Score:", dev_f1)

# # Predict on test data
# test_predictions = best_pipeline.predict(test_txt)
# decoded_test_predictions = label_encoder.inverse_transform(label_encoder.transform(test_predictions)) 
