In [1]:
import pandas as pd
import numpy as np
import nltk 
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
plt.rcParams['figure.figsize'] = (10,5)
plt.rcParams['figure.dpi'] = 300
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from wordcloud import WordCloud
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

# Data Exploration and Preprocessing

In [2]:
df=pd.read_csv('blogs.csv')
df.head()

Unnamed: 0,Data,Labels
0,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
1,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism
2,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,alt.atheism
3,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...,alt.atheism


In [4]:
df['Labels'].nunique()

20

In [5]:
df.shape

(2000, 2)

In [6]:
df.isnull().sum()

Data      0
Labels    0
dtype: int64

In [7]:
df.duplicated().sum()

np.int64(0)

In [8]:
df.describe()

Unnamed: 0,Data,Labels
count,2000,2000
unique,2000,20
top,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
freq,1,100


In [9]:
lemma = WordNetLemmatizer()

In [10]:
df['Data'][5]

'Xref: cantaloupe.srv.cs.cmu.edu talk.abortion:120646 alt.atheism:53205 talk.religion.misc:83616\nPath: cantaloupe.srv.cs.cmu.edu!das-news.harvard.edu!noc.near.net!uunet!gatekeeper.us.oracle.com!barrnet.net!kyle.eitech.com!kyle.eitech.com!not-for-mail\nFrom: ekr@squick.eitech.com (Eric Rescorla)\nNewsgroups: talk.abortion,alt.atheism,talk.religion.misc\nSubject: Re: After 2000 years, can we say that Christian Morality is\nDate: 15 Apr 1993 17:11:15 -0700\nOrganization: EIT\nLines: 40\nMessage-ID: <1qktj3$bn9@squick.eitech.com>\nReferences: <1qjd3o$nlv@horus.ap.mchp.sni.de> <1qk1pp$6hj@kyle.eitech.com> <1qkn1t$59l@horus.ap.mchp.sni.de>\nNNTP-Posting-Host: squick.eitech.com\n\nIn article <1qkn1t$59l@horus.ap.mchp.sni.de> frank@D012S658.uucp (Frank O\'Dwyer) writes:\n>In article <1qk1pp$6hj@kyle.eitech.com> ekr@kyle.eitech.com (Eric Rescorla) writes:\n>|In article <1qjd3o$nlv@horus.ap.mchp.sni.de> frank@D012S658.uucp (Frank O\'Dwyer) writes:\n>|>In article <sandvik-140493230024@sandvik-ke

In [11]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vaish\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
 nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\vaish\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [16]:
stop_words = set(stopwords.words('english'))

In [17]:

# Custom extra words to remove
extra_stopwords = {
    'edu', 'com', 'org', 'net', 'srv', 'cs', 'cmu', 'nntp', 'apr', 'gmt', 
    'xref', 'path', 'article', 'newsgroups', 'subject', 'write', 'lines',
    're', 'host', 'posting', 'date'
}

In [18]:
all_stopwords = stop_words.union(extra_stopwords)


In [19]:
corpus = []

for i in range(len(df)):
    msg = re.sub('[^a-zA-Z]', ' ', df['Data'][i])  # keep only letters
    msg = msg.lower()
    words = msg.split()
    words = [
        lemma.lemmatize(w) for w in words 
        if w not in all_stopwords and len(w) > 2
    ]
    cleaned_msg = ' '.join(words)
    corpus.append(cleaned_msg)

df['cleaned_text'] = corpus

# Show cleaned sample
df[['Data', 'cleaned_text']].sample(5)

Unnamed: 0,Data,cleaned_text
1476,Newsgroups: sci.space\nPath: cantaloupe.srv.cs...,sci space cantaloupe rochester udel darwin sur...
984,Newsgroups: rec.sport.baseball\nPath: cantalou...,rec sport baseball cantaloupe crabapple ece eu...
1762,Xref: cantaloupe.srv.cs.cmu.edu soc.culture.ar...,cantaloupe soc culture arabic talk politics mi...
1496,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,cantaloupe magnesium club news sei ece europa ...
529,Path: cantaloupe.srv.cs.cmu.edu!crabapple.srv....,cantaloupe crabapple andrew andrew rob earhart...


In [None]:
corpus

# Naive Bayes Model for Text Classification

In [22]:
y=df['Labels']
y

0              alt.atheism
1              alt.atheism
2              alt.atheism
3              alt.atheism
4              alt.atheism
               ...        
1995    talk.religion.misc
1996    talk.religion.misc
1997    talk.religion.misc
1998    talk.religion.misc
1999    talk.religion.misc
Name: Labels, Length: 2000, dtype: object

In [23]:

tf = TfidfVectorizer()

x_data = tf.fit_transform(corpus)
x_data = x_data.toarray()
x_data

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(2000, 33977))

In [24]:
x_train,x_test,y_train,y_test=train_test_split(x_data,y,test_size=0.2,random_state=1)

In [25]:
nb = MultinomialNB()
nb.fit(x_train,y_train)
ypred = nb.predict(x_test)

print(f"Train Accuracy: {nb.score(x_train,y_train)}\nTest Accuracy: {nb.score(x_test,y_test)}")

Train Accuracy: 0.986875
Test Accuracy: 0.84


# Sentiment Analysis

In [26]:
positive_words = ["good", "great", "amazing", "love", "best", "happy", "excellent"]
negative_words = ["bad", "worst", "poor", "hate", "slow", "problem", "issue"]


In [27]:
def sentiment(text):
    words = text.split()
    score = 0
    for w in words:
        if w in positive_words:
            score += 1
        elif w in negative_words:
            score -= 1
    if score > 0:
        return "Positive"
    elif score < 0:
        return "Negative"
    else:
        return "Neutral"

In [28]:
df['Sentiment'] = df['cleaned_text'].apply(sentiment)

In [29]:
print("\nSentiment counts:")
print(df['Sentiment'].value_counts())


Sentiment counts:
Sentiment
Neutral     1098
Positive     535
Negative     367
Name: count, dtype: int64


# Conclusion:


This notebook implements a text classification pipeline using Natural Language Processing (NLP)
and the Multinomial Naive Bayes algorithm. The steps included:
1. Data loading and exploration of blog posts and their labels.
2. Text preprocessing involving removing stopwords, lemmatization, and cleaning unwanted characters.
3. Feature extraction using TF-IDF Vectorization to convert text into numerical form.
4. Model training using the Multinomial Naive Bayes classifier.
5. Evaluation of the model's performance with metrics like accuracy and classification report.
6. Visualization using word clouds to interpret the common words in different classes.
The notebook demonstrates how NLP techniques and Naive Bayes can be applied to classify text data effectively.
