In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
import nltk
import re

In [2]:
# Ensure necessary NLTK data is downloaded
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tharu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tharu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tharu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
# 1. Load Dataset
data = pd.read_csv("D:\\ExcelR Assignments\\Assignment 19\\blogs.csv")
data

Unnamed: 0,Data,Labels
0,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
1,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism
2,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,alt.atheism
3,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...,alt.atheism
...,...,...
1995,Xref: cantaloupe.srv.cs.cmu.edu talk.abortion:...,talk.religion.misc
1996,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc
1997,Xref: cantaloupe.srv.cs.cmu.edu talk.origins:4...,talk.religion.misc
1998,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc


In [7]:
# 2. Data Preprocessing
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

In [9]:
# 3. Feature Extraction
def extract_features(data, column):
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(data[column])
    return X, vectorizer

In [11]:
# 4. Train-Test Split
def split_data(features, labels, test_size=0.2, random_state=42):
    return train_test_split(features, labels, test_size=test_size, random_state=random_state)

In [13]:
# 5. Train Naive Bayes Model
def train_model(X_train, y_train):
    model = MultinomialNB()
    model.fit(X_train, y_train)
    return model

In [15]:
# 6. Evaluate Model
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')
    print("Classification Report:\n", classification_report(y_test, y_pred))
    return accuracy, precision, recall, f1

In [17]:
# 7. Sentiment Analysis
def perform_sentiment_analysis(data, text_column):
    sentiments = []
    for text in data[text_column]:
        blob = TextBlob(text)
        polarity = blob.sentiment.polarity
        if polarity > 0:
            sentiments.append('Positive')
        elif polarity < 0:
            sentiments.append('Negative')
        else:
            sentiments.append('Neutral')
    data['Sentiment'] = sentiments
    return data

In [19]:
# Main Execution
def main():
    # Load the dataset
    data = pd.read_csv("D:\\ExcelR Assignments\\Assignment 19\\blogs.csv")

    # Preprocess the text data
    data['Cleaned_Data'] = data['Data'].apply(preprocess_text)

    # Extract features and labels
    X, vectorizer = extract_features(data, 'Cleaned_Data')
    y = data['Labels']

    # Split the data
    X_train, X_test, y_train, y_test = split_data(X, y)

    # Train the model
    model = train_model(X_train, y_train)

    # Evaluate the model
    accuracy, precision, recall, f1 = evaluate_model(model, X_test, y_test)
    print(f"Accuracy: {accuracy}\nPrecision: {precision}\nRecall: {recall}\nF1-Score: {f1}")

    # Perform sentiment analysis
    data = perform_sentiment_analysis(data, 'Data')

    # Analyze sentiment distribution
    sentiment_distribution = data.groupby(['Labels', 'Sentiment']).size().unstack(fill_value=0)
    print("Sentiment Distribution:\n", sentiment_distribution)

    # Save the results
    data.to_csv('blogs_with_sentiments.csv', index=False)

if __name__ == "__main__":
    main()

Classification Report:
                           precision    recall  f1-score   support

             alt.atheism       0.55      0.94      0.69        18
           comp.graphics       0.80      0.89      0.84        18
 comp.os.ms-windows.misc       0.87      0.91      0.89        22
comp.sys.ibm.pc.hardware       0.88      0.84      0.86        25
   comp.sys.mac.hardware       0.79      0.90      0.84        21
          comp.windows.x       1.00      0.76      0.86        25
            misc.forsale       1.00      0.67      0.80        18
               rec.autos       0.94      0.94      0.94        18
         rec.motorcycles       0.83      0.94      0.88        16
      rec.sport.baseball       0.74      0.94      0.83        18
        rec.sport.hockey       0.88      1.00      0.94        15
               sci.crypt       0.86      1.00      0.93        19
         sci.electronics       0.76      0.81      0.79        16
                 sci.med       0.94      0.88      

Conclusion:

In [None]:
'''
Naive Bayes Classification:
The Multinomial Naive Bayes model successfully categorized the blog posts based on their text content.
The model's evaluation metrics (accuracy, precision, recall, and F1-score) indicate its effectiveness in handling this classification task.
The classification report provided detailed insights into the model's performance across all categories, highlighting areas of strength 
and potential improvement.

Sentiment Analysis:
Sentiment analysis categorized the blog posts into positive, negative, and neutral sentiments.
The distribution of sentiments across different blog categories revealed trends, which could provide insights into the tone and emotional content 
of the posts in each category.
These findings can be leveraged to understand audience engagement and thematic sentiment trends.
'''