In [26]:
# Step 1: Data Exploration and Preprocessing

# Importing necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Load the dataset
df = pd.read_csv('blogs.csv')
df

Unnamed: 0,Data,Labels
0,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
1,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism
2,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,alt.atheism
3,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...,alt.atheism
...,...,...
1995,Xref: cantaloupe.srv.cs.cmu.edu talk.abortion:...,talk.religion.misc
1996,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc
1997,Xref: cantaloupe.srv.cs.cmu.edu talk.origins:4...,talk.religion.misc
1998,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Data    2000 non-null   object
 1   Labels  2000 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB


In [3]:
# Check for missing values
missing_values = df.isnull().sum()
print(f"Missing values:\n{missing_values}")

Missing values:
Data      0
Labels    0
dtype: int64


In [4]:
# Display the distribution of categories
category_distribution = df['Labels'].value_counts()
print(f"Category distribution:\n{category_distribution}")

Category distribution:
alt.atheism                 100
comp.graphics               100
talk.politics.misc          100
talk.politics.mideast       100
talk.politics.guns          100
soc.religion.christian      100
sci.space                   100
sci.med                     100
sci.electronics             100
sci.crypt                   100
rec.sport.hockey            100
rec.sport.baseball          100
rec.motorcycles             100
rec.autos                   100
misc.forsale                100
comp.windows.x              100
comp.sys.mac.hardware       100
comp.sys.ibm.pc.hardware    100
comp.os.ms-windows.misc     100
talk.religion.misc          100
Name: Labels, dtype: int64


In [6]:
# Importing necessary libraries
import string
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as sklearn_stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# Preprocessing function without using NLTK
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Tokenizing manually using simple split
    tokens = text.split()
    
    # Removing stopwords using scikit-learn's stopwords
    tokens = [word for word in tokens if word not in sklearn_stopwords]
    
    # Join tokens back into a single string
    processed_text = ' '.join(tokens)
    
    return processed_text

# Applying the preprocessing to the 'Data' column
df['Processed_Text'] = df['Data'].apply(preprocess_text)

# Display the first few rows of the processed data
print(df[['Data', 'Processed_Text']].head())

                                                Data  \
0  Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...   
1  Newsgroups: alt.atheism\nPath: cantaloupe.srv....   
2  Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...   
3  Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...   
4  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...   

                                      Processed_Text  
0  path cantaloupesrvcscmuedumagnesiumclubcccmued...  
1  newsgroups altatheism path cantaloupesrvcscmue...  
2  path cantaloupesrvcscmuedudasnewsharvardedunoc...  
3  path cantaloupesrvcscmuedumagnesiumclubcccmued...  
4  xref cantaloupesrvcscmuedu altatheism53485 tal...  


In [12]:
# Preprocess the text using TF-IDF on the 'Processed_Text' column
tfidf_vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)
X_tfidf = tfidf_vectorizer.fit_transform(df['Processed_Text'])

# Output the shape of the TF-IDF matrix
print(X_tfidf.shape)

(2000, 56258)


In [13]:
# Step 2: Naive Bayes Model for Text Classification

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test

(<1600x56258 sparse matrix of type '<class 'numpy.float64'>'
 	with 202655 stored elements in Compressed Sparse Row format>,
 <400x56258 sparse matrix of type '<class 'numpy.float64'>'
 	with 45839 stored elements in Compressed Sparse Row format>,
 968          rec.sport.baseball
 240     comp.os.ms-windows.misc
 819             rec.motorcycles
 692                misc.forsale
 420       comp.sys.mac.hardware
                  ...           
 1130                  sci.crypt
 1294            sci.electronics
 860             rec.motorcycles
 1459                  sci.space
 1126                  sci.crypt
 Name: Labels, Length: 1600, dtype: object,
 1860          talk.politics.misc
 353     comp.sys.ibm.pc.hardware
 1333                     sci.med
 905           rec.sport.baseball
 1289             sci.electronics
                   ...           
 965           rec.sport.baseball
 1284             sci.electronics
 1739       talk.politics.mideast
 261      comp.os.ms-windows.misc
 535 

In [14]:
print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")

Training set size: (1600, 56258)
Test set size: (400, 56258)


In [15]:
# Initialize the Naive Bayes classifier
nb_classifier = MultinomialNB()

# Train the model
nb_classifier.fit(X_train, y_train)

In [16]:
# Make predictions on the test set
y_pred = nb_classifier.predict(X_test)
print(y_pred)

['talk.politics.misc' 'comp.sys.ibm.pc.hardware' 'sci.med'
 'rec.sport.baseball' 'sci.electronics' 'sci.electronics'
 'rec.sport.baseball' 'talk.politics.mideast' 'alt.atheism' 'sci.med'
 'alt.atheism' 'sci.electronics' 'sci.crypt' 'rec.sport.baseball'
 'comp.sys.ibm.pc.hardware' 'comp.os.ms-windows.misc' 'rec.autos'
 'comp.graphics' 'talk.politics.guns' 'talk.politics.misc'
 'comp.sys.mac.hardware' 'alt.atheism' 'alt.atheism' 'rec.sport.hockey'
 'alt.atheism' 'sci.crypt' 'sci.crypt' 'rec.sport.baseball' 'rec.autos'
 'alt.atheism' 'rec.sport.baseball' 'rec.sport.hockey' 'comp.windows.x'
 'rec.sport.baseball' 'rec.sport.hockey' 'comp.sys.mac.hardware' 'sci.med'
 'sci.electronics' 'rec.sport.hockey' 'comp.os.ms-windows.misc'
 'sci.electronics' 'soc.religion.christian' 'comp.os.ms-windows.misc'
 'rec.sport.baseball' 'soc.religion.christian' 'rec.motorcycles'
 'comp.windows.x' 'rec.motorcycles' 'alt.atheism' 'talk.politics.misc'
 'soc.religion.christian' 'comp.os.ms-windows.misc' 'sci.spac

In [18]:
!pip install textblob

Collecting textblob
  Downloading textblob-0.18.0.post0-py3-none-any.whl.metadata (4.5 kB)
Downloading textblob-0.18.0.post0-py3-none-any.whl (626 kB)
   ---------------------------------------- 0.0/626.3 kB ? eta -:--:--
   --------------------------------------- 626.3/626.3 kB 11.7 MB/s eta 0:00:00
Installing collected packages: textblob
Successfully installed textblob-0.18.0.post0


In [19]:
# Step 3: Sentiment Analysis

from textblob import TextBlob
def get_sentiment(text):
    analysis = TextBlob(text)
    if analysis.sentiment.polarity > 0:
        return 'positive'
    elif analysis.sentiment.polarity < 0:
        return 'negative'
    else:
        return 'neutral'

df['Sentiment'] = df['Data'].apply(get_sentiment)
df['Sentiment']

0       positive
1       negative
2       positive
3       positive
4       positive
          ...   
1995    positive
1996    positive
1997    positive
1998    positive
1999    positive
Name: Sentiment, Length: 2000, dtype: object

In [21]:
# Display the distribution of sentiments
sentiment_distribution = df['Sentiment'].value_counts()
print(f"Sentiment distribution:\n{sentiment_distribution}")

Sentiment distribution:
positive    1543
negative     457
Name: Sentiment, dtype: int64


In [22]:
# Examine the distribution of sentiments across different categories
sentiment_category_distribution = df.groupby('Labels')['Sentiment'].value_counts().unstack().fillna(0)
print(f"Sentiment distribution across categories:\n{sentiment_category_distribution}")

Sentiment distribution across categories:
Sentiment                 negative  positive
Labels                                      
alt.atheism                     23        77
comp.graphics                   24        76
comp.os.ms-windows.misc         22        78
comp.sys.ibm.pc.hardware        20        80
comp.sys.mac.hardware           24        76
comp.windows.x                  27        73
misc.forsale                    16        84
rec.autos                       17        83
rec.motorcycles                 26        74
rec.sport.baseball              29        71
rec.sport.hockey                34        66
sci.crypt                       19        81
sci.electronics                 19        81
sci.med                         29        71
sci.space                       27        73
soc.religion.christian          13        87
talk.politics.guns              30        70
talk.politics.mideast           22        78
talk.politics.misc              22        78
talk.religion

In [29]:
# Step 4: Evaluation

# Evaluate the model's performance
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.8175


In [27]:
# Calculate precision, recall, and F1-score
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")

Precision: 0.8392316509380678
Recall: 0.8175
F1-score: 0.8088662063675969


In [30]:
# Output the results
print("Classification Report:")
print(classification_rep)

Classification Report:
                          precision    recall  f1-score   support

             alt.atheism       0.52      0.89      0.65        18
           comp.graphics       0.65      0.83      0.73        18
 comp.os.ms-windows.misc       0.95      0.82      0.88        22
comp.sys.ibm.pc.hardware       0.95      0.76      0.84        25
   comp.sys.mac.hardware       0.83      0.95      0.89        21
          comp.windows.x       1.00      0.84      0.91        25
            misc.forsale       0.91      0.56      0.69        18
               rec.autos       0.84      0.89      0.86        18
         rec.motorcycles       0.88      0.88      0.88        16
      rec.sport.baseball       0.74      0.94      0.83        18
        rec.sport.hockey       0.88      1.00      0.94        15
               sci.crypt       0.90      1.00      0.95        19
         sci.electronics       0.59      0.81      0.68        16
                 sci.med       0.94      0.88      0

In [31]:
# Reflect on the sentiment analysis results
print(f"Sentiment distribution:\n{sentiment_distribution}")
print(f"Sentiment distribution across categories:\n{sentiment_category_distribution}")

# Discuss implications
print("The sentiment analysis reveals the general sentiment expressed in the blog posts. We observe that...")

Sentiment distribution:
positive    1543
negative     457
Name: Sentiment, dtype: int64
Sentiment distribution across categories:
Sentiment                 negative  positive
Labels                                      
alt.atheism                     23        77
comp.graphics                   24        76
comp.os.ms-windows.misc         22        78
comp.sys.ibm.pc.hardware        20        80
comp.sys.mac.hardware           24        76
comp.windows.x                  27        73
misc.forsale                    16        84
rec.autos                       17        83
rec.motorcycles                 26        74
rec.sport.baseball              29        71
rec.sport.hockey                34        66
sci.crypt                       19        81
sci.electronics                 19        81
sci.med                         29        71
sci.space                       27        73
soc.religion.christian          13        87
talk.politics.guns              30        70
talk.politics.m