In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import make_scorer, f1_score, classification_report, accuracy_score
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler
from scipy.sparse import csr_matrix
import warnings 
from sklearn.naive_bayes import MultinomialNB
warnings.filterwarnings('ignore') 

In [2]:
# Load the datasets
# Training dataset to train the model.
train = pd.read_csv('train.csv')
# Test dataset to predict and evaluate the model's performance.
test = pd.read_csv('test.csv')


In [3]:

train_text = train['TEXT']
train_label = train['LABEL']
test_text = test['TEXT']


# Replace NaN values with an empty string
train_text = train_text.fillna('')
test_text = test_text.fillna('')



In [4]:
vectorizer = TfidfVectorizer()

In [5]:
# Step 4: Train a Classifier
trainer = vectorizer.fit_transform(train_text)

# Fit the classifier with the training data
classifier = LogisticRegression(C=50)
classifier.fit(trainer, train_label)  


# Make predictions on the test data
tester = vectorizer.transform(test_text)
trainer_predictions = classifier.predict(tester)

In [None]:
# Vectorizer
trainer = vectorizer.fit_transform(train_text)
print("Vectorizer Vocabulary Size:", len(vectorizer.vocabulary_))
print("Shape of transformed training data:", trainer.shape)

# Training Data
print("Number of samples in training data:", len(train_text))
print("Number of labels in training data:", len(train_label))

# Classifier
print("Classifier parameters:", classifier.get_params())

# Training Process
classifier.fit(trainer, train_label)
print("Classifier trained successfully.")

# Predictions
tester = vectorizer.transform(test_text)
print("Shape of transformed test data:", tester.shape)
trainer_predictions = classifier.predict(tester)
print("Predictions made on test data:", trainer_predictions)

Vectorizer Vocabulary Size: 278577
Shape of transformed training data: (70317, 278577)
Number of samples in training data: 70317
Number of labels in training data: 70317
Classifier parameters: {'C': 50, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


In [None]:
# Step 6: Make Predictions
trainer_predictions = classifier.predict(trainer)
trainer_accuracy = accuracy_score(train_label, trainer_predictions)
print("Training accuracy is: ", trainer_accuracy)

# Evaluate the model performance
f1 = f1_score(train_label, trainer_predictions, average='macro')
report = classification_report(train_label, trainer_predictions)

print("F1 Score:", f1)
print("Report:\n", report)

In [None]:
# Predictions
tester = vectorizer.transform(test_text)
print("Shape of transformed test data:", tester.shape)
trainer_predictions = classifier.predict(tester)
print("Predictions made on test data:", trainer_predictions)

submission = pd.DataFrame({'ID': test['ID'], 'LABEL': trainer_predictions})

# Save the submission DataFrame to a CSV file
submission.to_csv('submission.csv', index=False)
