In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
import warnings

In [2]:
def preprocess_data(train_file_path, test_file_path):
    """
    Load the training and test CSV files and extract the modelling columns.

    Missing values in the 'TEXT' column of either file are replaced with an
    empty string. The training file is read and processed before the test
    file is opened.

    Parameters:
    train_file_path (string): Path to the training CSV; the code reads its
        'TEXT' and 'LABEL' columns.
    test_file_path (string): Path to the test CSV; the code reads its
        'TEXT' and 'ID' columns.

    Returns:
    tuple: A 4-tuple, in this order:
        - X_train -> Series of training text with missing entries as "".
        - y_train -> Series holding the 'LABEL' column of the training file.
        - X_test -> Series of test text with missing entries as "".
        - test_ids -> Series holding the 'ID' column of the test file.
    """

    # Training split: text first, then labels.
    train_frame = pd.read_csv(train_file_path)
    train_text = train_frame['TEXT'].fillna("")
    train_labels = train_frame['LABEL']

    # Test split: text first, then the row identifiers.
    test_frame = pd.read_csv(test_file_path)
    test_text = test_frame['TEXT'].fillna("")
    test_identifiers = test_frame['ID']

    return train_text, train_labels, test_text, test_identifiers

In [3]:
def train_model(X_train, y_train, *, max_features=35000, ngram_range=(1, 2),
                C=4, max_iter=100):
    """
    Train a logistic regression model using CountVectorizer for feature engineering.

    Parameters:
    - X_train (array): Training data containing text inputs.
    - y_train (array): Training labels corresponding to X_train.
    - max_features (int, keyword-only): Vocabulary size cap passed to
      CountVectorizer. Defaults to 35000.
    - ngram_range (tuple, keyword-only): (min_n, max_n) word n-gram range passed
      to CountVectorizer. Defaults to (1, 2).
    - C (float, keyword-only): Value passed as LogisticRegression's C argument.
      Defaults to 4.
    - max_iter (int, keyword-only): Iteration limit passed to LogisticRegression.
      Defaults to 100; raise it if the solver does not converge.

    Returns:
    - trained_model (Pipeline): Trained pipeline containing CountVectorizer and LogisticRegression.
    """

    # Imported locally so this function does not rely on a module-level import
    # beyond the ones the file already has.
    from sklearn.exceptions import ConvergenceWarning

    vectorizer = CountVectorizer(
        analyzer='word',
        max_features=max_features,
        ngram_range=ngram_range,
    )
    classifier = LogisticRegression(C=C, max_iter=max_iter)
    pipeline = make_pipeline(vectorizer, classifier)

    with warnings.catch_warnings():
        # Silence only the solver's convergence notice. A blanket "ignore"
        # would also hide unrelated warnings (deprecations, data issues)
        # raised while fitting.
        warnings.simplefilter("ignore", category=ConvergenceWarning)
        pipeline.fit(X_train, y_train)

    return pipeline

In [4]:
def predict_and_save_submission(model, X_test, test_ids, submission_file_path):
    """
    Run the model on the test inputs and write the results to a CSV file.

    The output file has two columns, 'ID' and 'LABEL', and is written without
    the DataFrame index.

    Parameters:
        model (object): Any object exposing a ``predict(X_test)`` method.
        X_test (pd.Series): The test inputs handed to ``model.predict``.
        test_ids (pd.Series): The identifiers written to the 'ID' column.
        submission_file_path (string): Destination path of the CSV file.

    Returns:
        None
    """

    predicted_labels = model.predict(X_test)

    # Build the two-column table and write it out in one expression.
    pd.DataFrame(
        data={'ID': test_ids, 'LABEL': predicted_labels},
    ).to_csv(submission_file_path, index=False)

In [5]:
def main(train_file_path, test_file_path, submission_file_path):
    """
    Run the full workflow: load the data, fit the model, write the submission.

    Parameters:
    - train_file_path (string): Path to the CSV file containing training data.
    - test_file_path (string): Path to the CSV file containing test data.
    - submission_file_path (string): Path to save the submission CSV file.

    Returns:
    None
    """

    # Step 1: load both CSV files.
    train_text, train_labels, test_text, test_ids = preprocess_data(
        train_file_path,
        test_file_path,
    )

    # Steps 2 and 3: fit on the training split, then predict and save.
    predict_and_save_submission(
        train_model(train_text, train_labels),
        test_text,
        test_ids,
        submission_file_path,
    )

# Script entry point. The guard keeps the training run (and its file I/O on
# these machine-specific paths) from firing when this file is imported; the
# call still executes when the file is run directly or as a notebook cell.
if __name__ == "__main__":
    main(
        train_file_path='C:/Users/ual-laptop/Downloads/train.csv/train.csv',
        test_file_path='C:/Users/ual-laptop/Downloads/test.csv/test.csv',
        submission_file_path='C:/Users/ual-laptop/Downloads/submission_final.csv',
    )