In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [2]:
# Load the dataset
data = pd.read_csv("blogs_categories.csv")

In [3]:
# Data Exploration
print("Dataset shape:", data.shape)
print("Categories:", data['Labels'].unique())

Dataset shape: (19997, 3)
Categories: ['alt.atheism' 'comp.graphics' 'comp.os.ms-windows.misc'
 'comp.sys.ibm.pc.hardware' 'comp.sys.mac.hardware' 'comp.windows.x'
 'misc.forsale' 'rec.autos' 'rec.motorcycles' 'rec.sport.baseball'
 'rec.sport.hockey' 'sci.crypt' 'sci.electronics' 'sci.med' 'sci.space'
 'soc.religion.christian' 'talk.politics.guns' 'talk.politics.mideast'
 'talk.politics.misc' 'talk.religion.misc']


In [4]:
# Data Preprocessing
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    return text

data['Data'] = data['Data'].apply(preprocess_text)

In [5]:
# Feature Extraction using TF-IDF
X = data['Data']
y = data['Labels']

In [6]:
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # You can adjust max_features as needed
X_tfidf = tfidf_vectorizer.fit_transform(X)

In [7]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

In [8]:
# Naive Bayes Model
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

In [9]:
# Predictions
y_pred = nb_classifier.predict(X_test)

In [10]:
# Evaluation
print("\nAccuracy:", accuracy_score(y_test, y_pred))


Accuracy: 0.8225


In [11]:
print("Precision:", precision_score(y_test, y_pred, average='weighted'))

Precision: 0.8221386044126102


In [12]:
print("Recall:", recall_score(y_test, y_pred, average='weighted'))

Recall: 0.8225


In [13]:
print("F1 Score:", f1_score(y_test, y_pred, average='weighted'))

F1 Score: 0.8199348174631516


In [14]:
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
                          precision    recall  f1-score   support

             alt.atheism       0.63      0.73      0.68       173
           comp.graphics       0.73      0.84      0.78       179
 comp.os.ms-windows.misc       0.88      0.77      0.82       226
comp.sys.ibm.pc.hardware       0.85      0.80      0.82       204
   comp.sys.mac.hardware       0.86      0.86      0.86       205
          comp.windows.x       0.86      0.89      0.87       186
            misc.forsale       0.75      0.84      0.79       190
               rec.autos       0.84      0.90      0.87       203
         rec.motorcycles       0.94      0.88      0.91       218
      rec.sport.baseball       0.94      0.93      0.93       192
        rec.sport.hockey       0.94      0.96      0.95       203
               sci.crypt       0.90      0.85      0.87       200
         sci.electronics       0.91      0.80      0.85       227
                 sci.med       0.82      0.89      

In [15]:
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\KCL\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [16]:
# Sentiment Analysis
nltk.download('vader_lexicon')  # Download VADER lexicon
sid = SentimentIntensityAnalyzer()
sentiments = []
for post in data['Data']:
    sentiment_score = sid.polarity_scores(post)
    if sentiment_score['compound'] >= 0.05:
        sentiment = 'positive'
    elif sentiment_score['compound'] <= -0.05:
        sentiment = 'negative'
    else:
        sentiment = 'neutral'
    sentiments.append(sentiment)

data['Sentiment'] = sentiments

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\KCL\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [17]:
# Evaluate sentiment distribution across different categories
sentiment_category_distribution = data.groupby(['Labels', 'Sentiment']).size().unstack(fill_value=0)
print("\nSentiment Distribution Across Categories:")
print(sentiment_category_distribution)


Sentiment Distribution Across Categories:
Sentiment                 negative  neutral  positive
Labels                                               
alt.atheism                    390       16       594
comp.graphics                  122       58       820
comp.os.ms-windows.misc        196       55       749
comp.sys.ibm.pc.hardware       210       38       752
comp.sys.mac.hardware          250       59       691
comp.windows.x                 224       49       727
misc.forsale                   134       77       789
rec.autos                      318       44       638
rec.motorcycles                299       34       667
rec.sport.baseball             223       49       728
rec.sport.hockey               280       30       690
sci.crypt                      293       35       672
sci.electronics                181       44       775
sci.med                        338       40       622
sci.space                      275       30       695
soc.religion.christian         252     

In [18]:
# Evaluation and Discussion
print("\nEvaluation and Discussion:")


Evaluation and Discussion:
