In [1]:
NAME = "Poojitha Pasala"
EMAIL = "poojithapasala@arizona.edu"
Topic = "STAT-NLP Kaggle Class Competition 2024"

In [None]:
## Steps for Reproducibility:

### The process involved containerizing the model using Docker.
### First, Docker was utilized to build the image, followed by launching the container to execute the code.
### Below are the commands used:
### docker build -t myimage
### docker run -it -p 7777:9999 -v "$PWD:/app/" myimage:latest

### Dependencies: Run below commands before running the below python cells.
### pip install pandas
### pip install sklearn

### Import Required libraries

In [2]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score
from sklearn.utils import class_weight 

### Pre-processing Text

In [3]:
def custom_preprocessor(text):
    """
    Custom text preprocessing function that performs pattern substitutions using regular expressions.

    Parameters:
    text (str): Input text to be preprocessed.

    Returns:
    str: Preprocessed text after applying pattern substitutions.
    """
        
    # Define regular expressions for specific pattern substitutions
    url_exp = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))'
    url_exp2 = r'\b(?:www\.)[a-zA-Z0-9]+\b'    
    number_exp = r'\b\d{5}\b'
    currency_exp = r'£\d+(,\d+)*'
    msg_exp = r'\b(?:[a-zA-Z])*\d+/\w+\b'
    
    # Perform pattern substitutions using regular expressions
    text = text.lower()
    text = re.sub(url_exp, '<URL>', text)
    text = re.sub(url_exp2, '<URL1>', text)    
    text = re.sub(number_exp, '<NUMBER1>', text)
    text = re.sub(currency_exp, '<CURR>', text)
    text = re.sub(msg_exp, '<MSG>', text)
    
    return text        

# Load the train and test data
train_df = pd.read_csv("/app/data/train.csv")
test_df = pd.read_csv("/app/data/test.csv")

# Handle missing values in the 'TEXT' column
train_df['TEXT'] = train_df['TEXT'].fillna('')
test_df['TEXT'] = test_df['TEXT'].fillna('')

# Apply custom pre-processing to the text data
train_df['TEXT'] = train_df['TEXT'].apply(custom_preprocessor)
test_df['TEXT'] = test_df['TEXT'].apply(custom_preprocessor)

## Feature Engineering with TF-IDF Vectorization

In [4]:
# Initialize TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=50000)

# Transform training text data into TF-IDF features
X_train = vectorizer.fit_transform(train_df['TEXT'])

# Transform test text data into TF-IDF features
X_test = vectorizer.transform(test_df['TEXT'])

# Assign labels from training data to y_train
y_train = train_df['LABEL']


## Model Selection

In [5]:
# Calculate class weights for imbalanced dataset
class_weights = class_weight.compute_class_weight('balanced', classes=pd.unique(y_train), y=y_train)

# Train the logistic regression model with optimized hyperparameters
model = LogisticRegression(max_iter=3000, class_weight=dict(zip(pd.unique(y_train), class_weights)))

## Cross-Validation & Model Evaluation

In [6]:
# Evaluate model using cross-validation with accuracy scoring
cv_accuracy = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')

# Evaluate model using cross-validation with F1-score weighting
cv_f1 = cross_val_score(model, X_train, y_train, cv=5, scoring='f1_weighted')

# Print mean cross-validation scores
print("Cross-Validation Accuracy:", cv_accuracy.mean())
print("Cross-Validation F1-score:", cv_f1.mean())

Cross-Validation Accuracy: 0.9298178046830866
Cross-Validation F1-score: 0.9294968814556201


## Making Predictions and Saving Results

In [7]:
# Train the model on the entire training set -Fit the logistic regression model on the entire training set
model.fit(X_train, y_train)

# Make predictions on the test set using the trained model
y_test_pred = model.predict(X_test)

# Update the 'LABEL' column in the test DataFrame with the predicted labels
test_df['LABEL'] = y_test_pred

# Save predictions to a submission CSV file
test_df[['ID', 'LABEL']].to_csv('submission.csv', index=False)

# Print confirmation that test set predictions have been saved
print('Test set predictions saved to submission.csv')

Test set predictions saved to submission.csv
