In [16]:
# Installing sentiment analysis tool
!pip install vaderSentiment




In [30]:
# Importing core libraries for text mining and classification
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


In [31]:
# Ensuring stopwords are available
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [32]:
# Loading blog data
df = pd.read_csv("blogs.csv")

In [33]:
df.head()

Unnamed: 0,Data,Labels
0,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
1,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism
2,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,alt.atheism
3,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...,alt.atheism


In [34]:
len(df)

2000

In [35]:
print(df[['Labels', 'Data']].head())

        Labels                                               Data
0  alt.atheism  Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...
1  alt.atheism  Newsgroups: alt.atheism\nPath: cantaloupe.srv....
2  alt.atheism  Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...
3  alt.atheism  Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...
4  alt.atheism  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...


In [36]:
df.describe()

Unnamed: 0,Data,Labels
count,2000,2000
unique,2000,20
top,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,alt.atheism
freq,1,100


In [37]:
# Defining stopwords and stemmer
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# Cleaning logic: lowercase, strip punctuation, remove stopwords, apply stemming
def clean_text(text):
    # Remove headers if present
    _, _, body = text.partition('\n\n')
    text = body if body else text

    # Lowercase and remove non-alphabetic characters
    text = re.sub(r'[^a-z\s]', '', text.lower())

    # Tokenize and clean
    tokens = text.split()
    cleaned = [stemmer.stem(word) for word in tokens if word not in stop_words]

    return " ".join(cleaned)

# Apply cleaning
df['processed_data'] = df['Data'].apply(clean_text)


In [38]:
# Converting cleaned text into TF-IDF features
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['processed_data'])
y = df['Labels']

print("TF-IDF shape:", X.shape)

TF-IDF shape: (2000, 5000)


In [39]:
# Splitting data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Initializing and training Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train, y_train)

# Predicting test labels
y_pred = model.predict(X_test)

In [40]:
y_pred

array(['sci.crypt', 'sci.med', 'talk.politics.mideast',
       'soc.religion.christian', 'alt.atheism', 'sci.med',
       'comp.windows.x', 'rec.motorcycles', 'talk.politics.mideast',
       'comp.graphics', 'comp.sys.mac.hardware', 'misc.forsale',
       'talk.politics.misc', 'rec.sport.baseball', 'rec.autos',
       'alt.atheism', 'comp.sys.ibm.pc.hardware', 'comp.windows.x',
       'sci.med', 'talk.religion.misc', 'sci.electronics',
       'sci.electronics', 'rec.autos', 'rec.sport.baseball', 'sci.space',
       'comp.sys.ibm.pc.hardware', 'rec.autos', 'talk.politics.guns',
       'sci.space', 'talk.politics.misc', 'sci.electronics',
       'soc.religion.christian', 'comp.graphics', 'rec.sport.hockey',
       'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware',
       'sci.electronics', 'comp.os.ms-windows.misc', 'rec.sport.hockey',
       'rec.motorcycles', 'talk.religion.misc', 'rec.autos',
       'comp.sys.ibm.pc.hardware', 'talk.politics.mideast',
       'talk.politics.guns', 'r

In [41]:
# Create a DataFrame to compare actual and predicted labels
results_df = pd.DataFrame({
    'Actual': y_test,
    'Predicted': y_pred
})

# Display first few rows
print(results_df.head(10))

                      Actual               Predicted
1150               sci.crypt               sci.crypt
1309                 sci.med                 sci.med
1707   talk.politics.mideast   talk.politics.mideast
1524  soc.religion.christian  soc.religion.christian
1645      talk.politics.guns             alt.atheism
1305                 sci.med                 sci.med
534           comp.windows.x          comp.windows.x
842          rec.motorcycles         rec.motorcycles
1714   talk.politics.mideast   talk.politics.mideast
171            comp.graphics           comp.graphics


In [42]:
# Initializing sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Extracting sentiment from raw blog text
def get_sentiment(text):
    score = analyzer.polarity_scores(text)['compound']
    return 'positive' if score >= 0.05 else 'negative' if score <= -0.05 else 'neutral'

# Applying sentiment analysis
df['sentiment'] = df['Data'].apply(get_sentiment)

# Display sentiment distribution
print("Overall Sentiment:\n", df['sentiment'].value_counts())
print("\nSentiment by Category:\n", df.groupby('Labels')['sentiment'].value_counts().unstack().fillna(0))

Overall Sentiment:
 sentiment
positive    1359
negative     609
neutral       32
Name: count, dtype: int64

Sentiment by Category:
 sentiment                 negative  neutral  positive
Labels                                               
alt.atheism                   41.0      1.0      58.0
comp.graphics                 11.0      2.0      87.0
comp.os.ms-windows.misc       21.0      2.0      77.0
comp.sys.ibm.pc.hardware      18.0      1.0      81.0
comp.sys.mac.hardware         24.0      4.0      72.0
comp.windows.x                18.0      2.0      80.0
misc.forsale                   8.0      8.0      84.0
rec.autos                     27.0      0.0      73.0
rec.motorcycles               30.0      1.0      69.0
rec.sport.baseball            26.0      1.0      73.0
rec.sport.hockey              22.0      1.0      77.0
sci.crypt                     30.0      0.0      70.0
sci.electronics               19.0      3.0      78.0
sci.med                       30.0      1.0      69.0
sci.

In [43]:
# Accuracy and classification report
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.69

Classification Report:
                           precision    recall  f1-score   support

             alt.atheism       0.45      0.50      0.48        20
           comp.graphics       0.77      0.85      0.81        20
 comp.os.ms-windows.misc       0.57      0.60      0.59        20
comp.sys.ibm.pc.hardware       0.39      0.55      0.46        20
   comp.sys.mac.hardware       0.58      0.55      0.56        20
          comp.windows.x       0.81      0.65      0.72        20
            misc.forsale       0.87      0.65      0.74        20
               rec.autos       0.82      0.90      0.86        20
         rec.motorcycles       0.88      0.75      0.81        20
      rec.sport.baseball       0.84      0.80      0.82        20
        rec.sport.hockey       0.91      1.00      0.95        20
               sci.crypt       0.86      0.95      0.90        20
         sci.electronics       0.72      0.65      0.68        20
                 sci.med       0.83

## report
In this assignment, we performed text classification and sentiment analysis on the "blogs_categories.csv" dataset. The dataset contains blog posts labeled across 20 categories. We began by preprocessing the text data, including lowercasing, punctuation removal, stopword filtering, and stemming. Using TF-IDF vectorization, we extracted features and trained a Multinomial Naive Bayes classifier. The model achieved an accuracy of 69%, with strong performance in categories like rec.sport.hockey and sci.crypt, and lower scores in categories such as talk.religion.misc. For sentiment analysis, we applied the VADER sentiment analyzer to classify each blog post as positive, negative, or neutral. The majority of posts were positive, with political categories showing higher negative sentiment. Overall, the model demonstrated reliable classification performance, and the sentiment analysis provided useful insights into the emotional tone of different blog categories.