In [24]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mmira\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mmira\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [25]:
# Load data from CSV files
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [26]:
def preprocess_text(text):
    if pd.isna(text) or not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Apply preprocessing to text data
train_df['processed_text'] = train_df['TEXT'].apply(preprocess_text)
test_df['processed_text'] = test_df['TEXT'].apply(preprocess_text)

In [27]:
# Use TF-IDF vectorization with unigrams and bigrams
vectorizer = TfidfVectorizer(max_features=10000, min_df=5, max_df=0.5, ngram_range=(1,2))
X_train = vectorizer.fit_transform(train_df['processed_text'])
y_train = train_df['LABEL']

# Split the data into training and validation sets
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42)

In [28]:
# Initialize and train the Logistic Regression model
model = LogisticRegression(max_iter=1000, C=1.0, random_state=42)
model.fit(X_train_split, y_train_split)

# Evaluate the model on the validation set
y_pred = model.predict(X_val_split)
print("Classification Report:\n", classification_report(y_val_split, y_pred))
print("F1 Score:", f1_score(y_val_split, y_pred, average='macro'))

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.98      0.97      6454
           1       0.87      0.86      0.87      3856
           2       0.89      0.86      0.88      3754

    accuracy                           0.92     14064
   macro avg       0.91      0.90      0.91     14064
weighted avg       0.92      0.92      0.92     14064

F1 Score: 0.9053481717848816


In [29]:
# Predict on the test set
X_test = vectorizer.transform(test_df['processed_text'])
test_predictions = model.predict(X_test)

# Create submission DataFrame
submission_df = pd.DataFrame({'ID': test_df['ID'], 'LABEL': test_predictions})

# Save submission to a CSV file
submission_df.to_csv('submission.csv', index=False)