In [3]:
# Presumably the notebook author's full name — confirm how it is consumed.
NAME = "Ahmed Alkuraydis"
# University of Arizona email address
# NOTE(review): the local part ("alkuraydsi") orders its final letters differently
# from the surname in NAME ("Alkuraydis") — confirm which spelling is correct.
EMAIL = "alkuraydsi@arizona.edu"

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from joblib import dump

# Loading and preprocessing the data

In [5]:
def load_and_preprocess_data(train_file, test_file):
    """Read the training and test CSV files and blank out missing text.

    Both files are parsed with ``pd.read_csv``; afterwards every missing
    value in the ``TEXT`` column of each frame is replaced by an empty string.

    Args:
        train_file: Input accepted by ``pd.read_csv`` for the training set.
        test_file: Input accepted by ``pd.read_csv`` for the test set.

    Returns:
        A ``(train_data, test_data)`` tuple of DataFrames.

    Raises:
        KeyError: If either file has no ``TEXT`` column.
    """
    # Parse both files (training first) before modifying either frame.
    frames = [pd.read_csv(csv_source) for csv_source in (train_file, test_file)]
    for frame in frames:
        frame['TEXT'] = frame['TEXT'].fillna('')
    train_frame, test_frame = frames
    return train_frame, test_frame

# Feature extraction and selection

In [6]:
def extract_features(train_data, test_data):
    """Turn the ``TEXT`` column of both frames into TF-IDF matrices.

    One ``TfidfVectorizer`` is fitted on the training text only and then
    applied as-is to the test text, so both matrices share one vocabulary.

    Args:
        train_data: Object whose ``['TEXT']`` entry holds the training documents.
        test_data: Object whose ``['TEXT']`` entry holds the test documents.

    Returns:
        ``(X_train, X_test, vectorizer)``: the transformed training matrix,
        the transformed test matrix, and the fitted vectorizer.
    """
    tfidf_settings = {
        'ngram_range': (1, 3),  # n-grams of length one through three
        'min_df': 5,
        'max_df': 0.95,
        'sublinear_tf': True,
        'stop_words': 'english',
    }
    tfidf = TfidfVectorizer(**tfidf_settings)

    # Vocabulary and weights come from the training text only.
    train_matrix = tfidf.fit_transform(train_data['TEXT'])
    test_matrix = tfidf.transform(test_data['TEXT'])
    return train_matrix, test_matrix, tfidf

def select_features(X_train, X_test, y_train, k=20000):
    """Keep the ``k`` features ranked highest by the chi-squared statistic.

    The selector is fitted on the training matrix and labels, then the same
    column subset is applied to the test matrix.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix with the same columns as ``X_train``.
        y_train: Training labels used to score the features.
        k: Number of features to keep, or ``'all'``. A number larger than
            the column count of ``X_train`` is capped to that count.

    Returns:
        ``(X_train, X_test, selector)``: the reduced training matrix, the
        reduced test matrix, and the fitted ``SelectKBest`` instance.
    """
    # Requesting more features than exist makes some scikit-learn releases
    # raise ValueError (others only warn). Cap the request so a small
    # vocabulary does not abort the pipeline. Inputs without a 2-D ``shape``
    # are passed through untouched, as before.
    shape = getattr(X_train, 'shape', None)
    if not isinstance(k, str) and shape is not None and len(shape) == 2:
        k = min(k, shape[1])

    selector = SelectKBest(chi2, k=k)
    X_train = selector.fit_transform(X_train, y_train)
    X_test = selector.transform(X_test)
    return X_train, X_test, selector

# Model training and evaluation

In [7]:
def train_and_evaluate_model(X_train, y_train):
    """Fit a logistic-regression classifier and print hold-out metrics.

    The data is split 80/20 with a fixed seed; the model is fitted on the
    80% portion and scored on the remaining 20%. Accuracy and macro-averaged
    F1 are printed to stdout.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels aligned with the rows of ``X_train``.

    Returns:
        The ``LogisticRegression`` instance, fitted on the 80% portion only.
    """
    # The 'saga' solver shuffles the data, so the estimator needs its own
    # seed; seeding only the split leaves the metrics and the fitted
    # coefficients varying from run to run.
    # NOTE(review): newer scikit-learn releases deprecate `multi_class` —
    # confirm against the installed version before upgrading.
    model = LogisticRegression(C=1.0, multi_class='multinomial', solver='saga', max_iter=1000,
                               random_state=42)
    X_train_split, X_val, y_train_split, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )
    model.fit(X_train_split, y_train_split)

    # NOTE(review): if X_train was produced by a selector/vectorizer fitted on
    # all training rows (as the main program in this notebook does), the
    # validation rows already influenced the features, so these scores may be
    # optimistic.
    y_pred_val = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred_val)
    f1 = f1_score(y_val, y_pred_val, average='macro')
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(f"Validation F1-score: {f1:.4f}")
    return model

# Prediction and Results

In [8]:
def predict_and_save_results(model, X_train, y_train, X_test, test_data, output_path='submission.csv'):
    """Refit ``model`` on all training data and write test predictions to CSV.

    The written file has two columns, ``ID`` (copied from ``test_data``) and
    ``LABEL`` (the model's predictions), and no index column.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            refitted in place, so the caller's object is modified.
        X_train: Training feature matrix used for the refit.
        y_train: Training labels used for the refit.
        X_test: Test feature matrix to predict on.
        test_data: Object whose ``['ID']`` entry provides the row identifiers.
        output_path: Destination of the CSV file. Defaults to
            ``'submission.csv'`` in the current working directory.

    Returns:
        None.
    """
    # Refit on everything: the incoming model may have seen only a subset.
    model.fit(X_train, y_train)
    y_pred_test = model.predict(X_test)
    submission = pd.DataFrame({'ID': test_data['ID'], 'LABEL': y_pred_test})
    submission.to_csv(output_path, index=False)

# Main program
# Pipeline: load CSVs -> TF-IDF -> chi-squared selection -> validate -> predict.
train_data, test_data = load_and_preprocess_data('train.csv', 'test.csv')
train_labels = train_data['LABEL']  # label column reused by every later step

# X_train / X_test first hold the TF-IDF matrices and are then rebound to the
# reduced matrices returned by the selector.
X_train, X_test, vectorizer = extract_features(train_data, test_data)
X_train, X_test, selector = select_features(X_train, X_test, train_labels)

# Print hold-out metrics, then refit on all rows and write the submission file.
model = train_and_evaluate_model(X_train, train_labels)
predict_and_save_results(model, X_train, train_labels, X_test, test_data)

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is

Validation Accuracy: 0.9238
Validation F1-score: 0.9119
