In [1]:
import re
import pandas as pd
import nltk
import subprocess
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pzazo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pzazo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Preprocess the Dataset

In [2]:
regex_pattern = re.compile(r'[^a-zA-Z0-9\s]')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = regex_pattern.sub('', text)
    text = text.lower()
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(lemmatized_tokens)

In [3]:
df = pd.read_csv('bigdata2023classification/train.csv', delimiter=',')
df['Content'] = df['Content'].apply(preprocess_text)

# Plot WordClouds

In [4]:
    # import matplotlib.pyplot as plt
    # from wordcloud import WordCloud
    
    # plt.style.use('ggplot')

    # for category in df['Label'].unique():
    #     texts = df[df['Label'] == category]['Content'].tolist()

    #     aggregated_text = " ".join(texts)

    #     wordcloud = WordCloud(width=800, height=800, 
    #                           background_color='white',
    #                          colormap='viridis',
    #                          max_words=150,
    #                          contour_color='steelblue',
    #                          contour_width=3).generate(aggregated_text)

    #     plt.figure(figsize=(8, 8))
    #     plt.imshow(wordcloud, interpolation='bilinear')
    #     plt.title(f"Word Cloud for {category} Category", fontsize=14, fontweight='bold')
    #     plt.axis("off")
    #     plt.tight_layout(pad=0)
    #     plt.show()

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_validate, KFold
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

X = df['Content']
y = df['Label']

cv = KFold(n_splits=5, shuffle=True, random_state=42)

svm_bow_pipeline = Pipeline([
    ('vect', HashingVectorizer(n_features=2**14, alternate_sign=False)),
    ('clf', LinearSVC(dual=False))
])

rf_bow_pipeline = Pipeline([
    ('vect', HashingVectorizer(n_features=2**14, alternate_sign=False)),
    ('clf', RandomForestClassifier(n_estimators=100, max_depth=10, n_jobs=-1))
])

svm_svd_pipeline = Pipeline([
    ('vect', HashingVectorizer(n_features=2**14, alternate_sign=False)),
    ('svd', TruncatedSVD(n_components=50)),
    ('clf', LinearSVC(dual=False))
])

rf_svd_pipeline = Pipeline([
    ('vect', HashingVectorizer(n_features=2**14, alternate_sign=False)),
    ('svd', TruncatedSVD(n_components=50)),
    ('clf', RandomForestClassifier(n_estimators=100, max_depth=10, n_jobs=-1))
])

pipelines = {
    'SVM BOW': svm_bow_pipeline,
    'Random Forest BOW': rf_bow_pipeline,
    'SVM SVD': svm_svd_pipeline,
    'Random Forest SVD': rf_svd_pipeline
}

scoring = {'accuracy': 'accuracy',
           'precision': 'precision_macro',
           'recall': 'recall_macro'
}

for name, pipeline in pipelines.items():
    scores = cross_validate(pipeline, X, y, cv=cv, scoring=scoring, n_jobs=-1)
    print(f"\n{name}:")
    print("Accuracy: {:3f}".format(scores['test_accuracy'].mean()))
    print("Precision: {:3f}".format(scores['test_precision'].mean()))
    print("Recall: {:3f}".format(scores['test_recall'].mean()))



SVM BOW:
Accuracy: 0.964301
Precision: 0.962580
Recall: 0.959800

Random Forest BOW:
Accuracy: 0.754363
Precision: 0.853802
Recall: 0.676196

SVM SVD:
Accuracy: 0.893197
Precision: 0.886952
Recall: 0.872538

Random Forest SVD:
Accuracy: 0.879521
Precision: 0.879919
Recall: 0.849225
