In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import langdetect

# Fetch the NLTK resources used below: the stop-word list, the tokenizer
# models and the WordNet database for lemmatization.
for nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(nltk_resource)

# Read the labelled training set and the unlabelled test set.
train_data = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

# Replace missing review text with the placeholder "THE" in both frames so
# every row holds a string.
for frame in (train_data, test_data):
    frame['TEXT'] = frame['TEXT'].fillna("THE")

# A training row without a label cannot be learned from; discard it.
train_data = train_data.dropna(subset=['LABEL'])

# Shared lemmatizer instance used by clean_text.
lemmatizer = WordNetLemmatizer()

# Define the updated clean_text function with lemmatization
# Compiled once: matches every character that is not an ASCII letter.
_NON_LETTER_RE = re.compile(r'[^a-zA-Z]')

# Lazily filled cache of the English stop-word set (see _english_stop_words).
_STOP_WORDS = None


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def clean_text(text):
    """Normalize a raw review into a space-joined string of lemmatized tokens.

    The text is lower-cased, every non-ASCII-letter character is replaced by
    a space, the result is tokenized with NLTK, English stop words are
    dropped and the remaining tokens are lemmatized.

    Args:
        text: The raw review. Anything that is not a ``str`` (for example a
            float NaN from a missing cell, or ``None``) is treated as empty.

    Returns:
        The cleaned text, or ``''`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''

    letters_only = _NON_LETTER_RE.sub(' ', text.lower())
    # The stop-word set is cached so it is not rebuilt for every row when
    # this function is used with Series.apply.
    stop_words = _english_stop_words()
    kept_tokens = [
        lemmatizer.lemmatize(token)
        for token in nltk.word_tokenize(letters_only)
        if token not in stop_words
    ]
    return ' '.join(kept_tokens)

# Store a cleaned copy of each review in a new 'CLEAN_TEXT' column, for the
# training and the test frame alike.
for frame in (train_data, test_data):
    frame['CLEAN_TEXT'] = frame['TEXT'].apply(clean_text)

# Detect the language of each review and filter out non-English reviews
def detect_language(text):
    """Return the language code langdetect assigns to ``text``.

    Detection is best-effort: if langdetect raises for this input, the
    sentinel ``'unknown'`` is returned instead of propagating the error.

    Args:
        text: The text whose language should be detected.

    Returns:
        The code returned by ``langdetect.detect`` (e.g. ``'en'``), or
        ``'unknown'`` when detection fails.
    """
    try:
        return langdetect.detect(text)
    except Exception:
        # Deliberately broad so one undetectable row cannot abort a whole
        # Series.apply, but not a bare ``except:`` -- KeyboardInterrupt and
        # SystemExit must still be able to stop a long-running detection.
        return 'unknown'

# langdetect is randomized by default, so the same text can receive different
# codes on different runs. Fix the seed before the first detection so the
# English-only training subset is reproducible.
langdetect.DetectorFactory.seed = 0

# Train only on reviews detected as English.
train_data['LANGUAGE'] = train_data['TEXT'].apply(detect_language)
train_data = train_data[train_data['LANGUAGE'] == 'en']

# Record the detected language of the test reviews but keep every row:
# dropping rows here would silently omit their IDs from the submission file
# written at the end of the script.
test_data['LANGUAGE'] = test_data['TEXT'].apply(detect_language)

# Hold out 20% of the training rows for validation; the fixed random_state
# makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    train_data['CLEAN_TEXT'],
    train_data['LABEL'],
    test_size=0.2,
    random_state=42,
)

# TF-IDF features feeding a logistic-regression classifier, fitted on the
# training part of the split.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('classifier', LogisticRegression(max_iter=100)),
])
pipeline.fit(X_train, y_train)

# Score the held-out validation rows and report the results.
y_pred = pipeline.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_val, y_pred))

# Predict labels for the test reviews and write them next to their IDs.
test_predictions = pipeline.predict(test_data['CLEAN_TEXT'])
submission = pd.DataFrame({
    'ID': test_data['ID'],
    'LABEL': test_predictions,
})
submission.to_csv('submission1.csv', index=False)

[nltk_data] Downloading package stopwords to /Users/deema/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/deema/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/deema/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Validation Accuracy: 0.9153

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.99      0.97      5500
           1       0.88      0.86      0.87      3895
           2       0.90      0.87      0.88      3762

    accuracy                           0.92     13157
   macro avg       0.91      0.90      0.91     13157
weighted avg       0.91      0.92      0.91     13157

