In [3]:
# Author's full name
NAME = "Ahmed Alkuraydis"
# University of Arizona email address
# NOTE(review): the local part is spelled "alkuraydsi" while the surname in NAME is "Alkuraydis" — confirm the address
EMAIL = "alkuraydsi@arizona.edu"

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from joblib import dump

# Loading and preprocessing the data

In [5]:
def load_and_preprocess_data(train_file, test_file):
    """Read the training and test CSV files and blank out missing text.

    Both files are read first; afterwards every missing value in the
    'TEXT' column of each frame is replaced by an empty string.

    Args:
        train_file: Path of the training CSV (must contain a 'TEXT' column).
        test_file: Path of the test CSV (must contain a 'TEXT' column).

    Returns:
        Tuple ``(train_data, test_data)`` of pandas DataFrames.
    """
    # Read both files before touching either frame (training file first).
    train_data, test_data = (pd.read_csv(path) for path in (train_file, test_file))
    for frame in (train_data, test_data):
        # Downstream vectorization needs a string in every row.
        frame['TEXT'] = frame['TEXT'].fillna('')
    return train_data, test_data

# Data extraction and selection

In [6]:
def extract_features(train_data, test_data):
    """Turn the 'TEXT' column of both frames into TF-IDF feature matrices.

    The vectorizer is fitted on the training text only and then applied
    unchanged to the test text.

    Args:
        train_data: Frame with a 'TEXT' column; used to fit the vectorizer.
        test_data: Frame with a 'TEXT' column; only transformed.

    Returns:
        Tuple ``(X_train, X_test, vectorizer)`` where ``vectorizer`` is the
        fitted ``TfidfVectorizer``.
    """
    tfidf_options = {
        'ngram_range': (1, 3),    # unigrams, bigrams and trigrams
        'min_df': 5,              # drop terms seen in fewer than 5 documents
        'max_df': 0.95,           # drop terms seen in more than 95% of documents
        'sublinear_tf': True,
        'stop_words': 'english',
    }
    vectorizer = TfidfVectorizer(**tfidf_options)
    train_matrix = vectorizer.fit_transform(train_data['TEXT'])
    test_matrix = vectorizer.transform(test_data['TEXT'])
    return train_matrix, test_matrix, vectorizer

def select_features(X_train, X_test, y_train, k=20000):
    """Keep the ``k`` features with the highest chi-squared score.

    The selector is fitted on the training matrix and labels, then applied
    to the test matrix so both end up with the same columns.

    Args:
        X_train: Training feature matrix (non-negative values, as chi2 requires).
        X_test: Test feature matrix with the same columns as ``X_train``.
        y_train: Training labels.
        k: Number of features to keep, or ``'all'``. A value larger than the
            number of available columns is clamped to that number, so a small
            vocabulary does not trigger an error (older scikit-learn) or a
            warning (newer scikit-learn).

    Returns:
        Tuple ``(X_train_selected, X_test_selected, selector)`` where
        ``selector`` is the fitted ``SelectKBest``.
    """
    # Only clamp when the column count is known; plain array-likes without a
    # 2-D ``shape`` are passed through exactly as before.
    shape = getattr(X_train, 'shape', None)
    if shape is not None and len(shape) == 2 and k != 'all' and k > shape[1]:
        k = shape[1]

    selector = SelectKBest(chi2, k=k)
    X_train_selected = selector.fit_transform(X_train, y_train)
    X_test_selected = selector.transform(X_test)
    return X_train_selected, X_test_selected, selector

# Model training and evaluation

In [7]:
def train_and_evaluate_model(X_train, y_train, test_size=0.2, random_state=42):
    """Fit a logistic-regression classifier and report hold-out metrics.

    The data is split into a fitting part and a validation part; the model is
    trained on the fitting part and scored on the validation part. Accuracy
    and macro-averaged F1 are printed.

    Args:
        X_train: Feature matrix for all labelled examples.
        y_train: Labels aligned with the rows of ``X_train``.
        test_size: Fraction of the data held out for validation.
        random_state: Seed used for both the split and the solver.

    Returns:
        The ``LogisticRegression`` model, fitted on the fitting part only
        (not on the held-out validation rows).
    """
    # The 'saga' solver shuffles the data, so without a seed the fitted
    # coefficients (and the printed metrics) can differ between runs.
    # NOTE(review): `multi_class` is deprecated in recent scikit-learn
    # releases — confirm against the installed version before upgrading.
    model = LogisticRegression(C=1.0, multi_class='multinomial', solver='saga', max_iter=1000,
                               random_state=random_state)
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=test_size, random_state=random_state)
    model.fit(X_fit, y_fit)

    y_pred_val = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred_val)
    f1 = f1_score(y_val, y_pred_val, average='macro')
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(f"Validation F1-score: {f1:.4f}")
    return model

# Prediction and Results

In [8]:
def predict_and_save_results(model, X_train, y_train, X_test, test_data, output_path='submission.csv'):
    """Refit the model on all training data and write test predictions to CSV.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            refitted in place on the full training set.
        X_train: Feature matrix for all labelled examples.
        y_train: Labels aligned with the rows of ``X_train``.
        X_test: Feature matrix for the test examples.
        test_data: Frame with an 'ID' column aligned with the rows of ``X_test``.
        output_path: Destination of the CSV file; defaults to
            'submission.csv' in the current working directory.

    Returns:
        None. The file at ``output_path`` has the columns 'ID' and 'LABEL'.
    """
    # Train on every labelled row before predicting the test set.
    model.fit(X_train, y_train)
    y_pred_test = model.predict(X_test)
    submission = pd.DataFrame({'ID': test_data['ID'], 'LABEL': y_pred_test})
    submission.to_csv(output_path, index=False)

# Main program
# Read both CSVs from the working directory; missing TEXT values become ''.
train_data, test_data = load_and_preprocess_data('train.csv', 'test.csv')
# Build TF-IDF features; the vectorizer is fitted on the training text only.
X_train, X_test, vectorizer = extract_features(train_data, test_data)
# NOTE: X_train / X_test are rebound to the chi2-selected matrices here, so the
# full TF-IDF matrices are no longer reachable by name after this line.
X_train, X_test, selector = select_features(X_train, X_test, train_data['LABEL'])
# Fits on a hold-out split and prints validation accuracy and macro F1.
model = train_and_evaluate_model(X_train, train_data['LABEL'])
# Refits the same model on all training rows and writes the test predictions.
predict_and_save_results(model, X_train, train_data['LABEL'], X_test, test_data)

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is

Validation Accuracy: 0.9238
Validation F1-score: 0.9119


# Task
The task I'm addressing in this code is a text classification problem where the goal is to classify each document into one of three categories:
- Not a movie/TV show review
- Positive movie/TV show review
- Negative movie/TV show review
The provided datasets, which include a training set and a test set, contain text documents that need to be classified into the appropriate categories.

# Approach 

I begin by importing the necessary libraries for the task at hand. These include pandas for data manipulation, scikit-learn's TfidfVectorizer for feature extraction, SelectKBest and chi2 from scikit-learn for feature selection, LogisticRegression for model training, train_test_split for splitting the data into training and validation sets, accuracy_score and f1_score for model evaluation, and dump from joblib for saving the trained model (dump is imported, but the code shown here never calls it, so the model is not actually written to disk).

Next, I define a function called load_and_preprocess_data that takes the file paths of the training and test data as input. This function reads the CSV files using pandas, and fills any missing values in the 'TEXT' column with empty strings. It returns the preprocessed training and test data.

I then define two functions for feature extraction and selection. The extract_features function takes the preprocessed training and test data as input and applies the TF-IDF vectorizer to convert the text data into numerical features. It uses n-grams ranging from 1 to 3, a minimum document frequency of 5, a maximum document frequency of 95%, sublinear term frequency scaling, and removes English stop words. The function returns the transformed training and test features, as well as the fitted vectorizer.

The select_features function takes the extracted features, the training labels, and the number of top features to select (default is 20,000) as input. It uses the chi-squared statistical test to select the most informative features and returns the selected features for both the training and test sets, along with the fitted selector object.

I define a function called train_and_evaluate_model that takes the selected training features and labels as input. It initializes a logistic regression model with specific hyperparameters, such as the regularization strength (C=1.0), the multi-class strategy ('multinomial'), the solver ('saga'), and the maximum number of iterations (1000). The function then splits the training data into a smaller training set and a validation set using train_test_split with a test size of 20% and a fixed random state for reproducibility. It trains the model on the smaller training set and evaluates its performance on the validation set using accuracy and macro-averaged F1-score. The function prints the validation accuracy and F1-score and returns the trained model. Finally, the predict_and_save_results function refits the model on the full training set, predicts labels for the test set, and writes the IDs and predicted labels to submission.csv.

# Results
The model achieves a validation accuracy of 92.38% and a macro-averaged F1-score of 91.19%, indicating its effectiveness in distinguishing between the three categories of documents.
# Code
https://github.com/uazhlt-ms-program/grad-level-term-project-kaggle-competition-Ahmed8M
