In [1]:
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV


class DataLoader:
    @staticmethod
    def load_data(train_file_path, test_file_path):
        train_df = pd.read_csv(train_file_path)
        test_df = pd.read_csv(test_file_path)
        return train_df, test_df


class Preprocessor:
    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()

    def preprocess_text(self, text):
        text = text.lower()
        text = text.replace('[^\w\s]', '')
        text = text.replace('\d+', '')
        words = nltk.word_tokenize(text)
        lemmatized_words = [self.lemmatizer.lemmatize(word) for word in words]
        return ' '.join(lemmatized_words)

    def preprocess(self, train_df, test_df):
        train_df['TEXT'] = train_df['TEXT'].fillna('')
        train_df['TEXT'] = train_df['TEXT'].apply(self.preprocess_text)
        test_df['TEXT'] = test_df['TEXT'].fillna('')
        test_df['TEXT'] = test_df['TEXT'].apply(self.preprocess_text)
        return train_df, test_df


class FeatureExtractor:
    def __init__(self):
        self.vectorizer = TfidfVectorizer()

    def vectorize(self, train_df, test_df):
        X_train = self.vectorizer.fit_transform(train_df['TEXT'])
        y_train = train_df['LABEL']
        X_test = self.vectorizer.transform(test_df['TEXT'])
        return X_train, y_train, X_test


class Classifier:
    def __init__(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    def train_naive_bayes(self):
        nb = MultinomialNB()
        nb_params = {'alpha': [0.5, 1.0, 1.5]}
        nb_gs = GridSearchCV(nb, nb_params, cv=5, n_jobs=-1)
        nb_gs.fit(self.X_train, self.y_train)
        return nb_gs.best_estimator_

    def train_logistic_regression(self):
        lr = LogisticRegression()
        lr_params = {'C': [0.1, 1.0, 10.0]}
        lr_gs = GridSearchCV(lr, lr_params, cv=5, n_jobs=-1)
        lr_gs.fit(self.X_train, self.y_train)
        return lr_gs.best_estimator_


if __name__ == '__main__':
    train_file_path = 'train.csv'
    test_file_path = 'test.csv'
    submission_file_path = 'output2.csv'

    # Load data
    train_df, test_df = DataLoader.load_data(train_file_path, test_file_path)

    # Preprocess data
    preprocessor = Preprocessor()
    train_df, test_df = preprocessor.preprocess(train_df, test_df)

    # Extract features
    feature_extractor = FeatureExtractor()
    X_train, y_train, X_test = feature_extractor.vectorize(train_df, test_df)

    # Split data
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

    # Train and evaluate classifiers
    classifier = Classifier(X_train, y_train)

    # Naive Bayes
    nb_best = classifier.train_naive_bayes()
    y_pred_nb = nb_best.predict(X_val)
    accuracy_nb = accuracy_score(y_val, y_pred_nb)
    print("Naive Bayes accuracy:", accuracy_nb)

    # Logistic Regression
    lr_best = classifier.train_logistic_regression()
    y_pred_lr = lr_best.predict(X_val)
    accuracy_lr = accuracy_score(y_val, y_pred_lr)
    print("Logistic Regression accuracy:", accuracy_lr)

    # Make predictions on the test data using the best classifier
    y_pred = lr_best.predict(X_test)

    # Save the predictions to a CSV file in the correct format
    submission_df = pd.read_csv(submission_file_path)
    submission_df['LABEL'] = y_pred
    submission_df.to_csv('submission17.csv', index=False)


Naive Bayes accuracy: 0.8786262798634812


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Logistic Regression accuracy: 0.9241325369738339


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
