# NATURAL LANGUAGE PROCESSING (NLP)

In [2]:
#TEXT CLASSIFICATION USING NAIVE BAYES AND SENTIMENT ANALYSIS ON BLOG POSTS

In [3]:
# LOAD DATASET:
import pandas as pd
data=pd.read_csv("/content/blogs.csv")

In [4]:
data

Unnamed: 0,Data,Labels
0,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
1,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism
2,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,alt.atheism
3,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...,alt.atheism
...,...,...
1995,Xref: cantaloupe.srv.cs.cmu.edu talk.abortion:...,talk.religion.misc
1996,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc
1997,Xref: cantaloupe.srv.cs.cmu.edu talk.origins:4...,talk.religion.misc
1998,Xref: cantaloupe.srv.cs.cmu.edu talk.religion....,talk.religion.misc


In [5]:
data.describe()
# BOTH ARE OBJECT COLUMNS
# THERE IS NO NULL VALUES
# DATA HAVING UNIQUE SENTENCES
# LABELS HAVING 20 MEANS FOR EACH LABELS THERE IS 100 STATEMENTS


Unnamed: 0,Data,Labels
count,2000,2000
unique,2000,20
top,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
freq,1,100


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Data    2000 non-null   object
 1   Labels  2000 non-null   object
dtypes: object(2)
memory usage: 31.4+ KB


In [7]:
data['Labels'].value_counts()

Unnamed: 0_level_0,count
Labels,Unnamed: 1_level_1
alt.atheism,100
comp.graphics,100
talk.politics.misc,100
talk.politics.mideast,100
talk.politics.guns,100
soc.religion.christian,100
sci.space,100
sci.med,100
sci.electronics,100
sci.crypt,100


In [8]:
# Download necessary NLTK data
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## Convert data into consistence foramt

In [9]:
# Remove punctuation and convert to lowercase to get the data consistence
import string
data['Cleaned_Data'] = data['Data'].str.translate(str.maketrans('', '', string.punctuation)).str.lower()

## Tokenization

In [10]:
# Tokenize the text
from nltk.tokenize import word_tokenize
data['Cleaned_Data'] = data['Cleaned_Data'].apply(word_tokenize)

## Stopwords hadling

In [11]:
# Remove stopwords
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
data['Cleaned_Data'] = data['Cleaned_Data'].apply(lambda tokens: [word for word in tokens if word not in stop_words])

In [12]:
# Join tokens back into a string
data['Cleaned_Data'] = data['Cleaned_Data'].apply(' '.join)

In [13]:
# Display the first few rows of the cleaned dataset
print(data[['Cleaned_Data', 'Labels']].head())

                                        Cleaned_Data       Labels
0  path cantaloupesrvcscmuedumagnesiumclubcccmued...  alt.atheism
1  newsgroups altatheism path cantaloupesrvcscmue...  alt.atheism
2  path cantaloupesrvcscmuedudasnewsharvardedunoc...  alt.atheism
3  path cantaloupesrvcscmuedumagnesiumclubcccmued...  alt.atheism
4  xref cantaloupesrvcscmuedu altatheism53485 tal...  alt.atheism


## Feature Extraction  

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

In [16]:
# Fit and transform the cleaned text data
X = vectorizer.fit_transform(data['Cleaned_Data'])
# Display the shape of the resulting feature matrix
X.shape

(2000, 56432)

# 2. Naive Bayes Model for Text Classification

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

In [18]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, data['Labels'], test_size=0.2, random_state=42)

In [19]:
# Initialize the Naive Bayes classifier
nb_classifier = MultinomialNB()

In [20]:
# Train the model on the training set
nb_classifier.fit(X_train, y_train)


In [21]:
# Make predictions on the test set
y_pred = nb_classifier.predict(X_test)

In [22]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)


In [23]:
print(accuracy)
#The Naive Bayes classifier achieved an accuracy of 82% on the test set.
#Overall, the model performs well, but there is room for improvement in certain categories.

0.82


In [24]:
print(report)

                          precision    recall  f1-score   support

             alt.atheism       0.52      0.89      0.65        18
           comp.graphics       0.62      0.83      0.71        18
 comp.os.ms-windows.misc       0.95      0.86      0.90        22
comp.sys.ibm.pc.hardware       0.95      0.76      0.84        25
   comp.sys.mac.hardware       0.87      0.95      0.91        21
          comp.windows.x       1.00      0.80      0.89        25
            misc.forsale       0.92      0.61      0.73        18
               rec.autos       0.89      0.89      0.89        18
         rec.motorcycles       0.88      0.88      0.88        16
      rec.sport.baseball       0.80      0.89      0.84        18
        rec.sport.hockey       0.83      1.00      0.91        15
               sci.crypt       0.82      0.95      0.88        19
         sci.electronics       0.68      0.81      0.74        16
                 sci.med       0.94      0.88      0.91        17
         

# Optimization Using GridSearch

In [25]:
from sklearn.model_selection import GridSearchCV

In [26]:
# Define the parameter grid
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 2.0, 5.0]
}

In [27]:
# Initialize the GridSearchCV with the Naive Bayes classifier
grid_search = GridSearchCV(MultinomialNB(), param_grid, cv=5, scoring='accuracy')

In [28]:
# Fit the model to the training data
grid_search.fit(X_train, y_train)

In [29]:
# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

In [30]:
print(best_params)
print(best_score)

{'alpha': 0.5}
0.8393749999999999


In [31]:
# Train the model with the best parameters
optimized_nb_classifier = MultinomialNB(**best_params)
optimized_nb_classifier.fit(X_train, y_train)

In [32]:
# Make predictions on the test set
y_optimized_pred = optimized_nb_classifier.predict(X_test)

In [33]:
# Evaluate the optimized model
optimized_accuracy = accuracy_score(y_test, y_optimized_pred)
optimized_report = classification_report(y_test, y_optimized_pred)

In [34]:
print(f'accuracy = {optimized_accuracy}')
print(optimized_report)
# it's  dosen't gives us the better accuracy than the origanl model

accuracy = 0.8125
                          precision    recall  f1-score   support

             alt.atheism       0.53      0.89      0.67        18
           comp.graphics       0.73      0.89      0.80        18
 comp.os.ms-windows.misc       0.95      0.82      0.88        22
comp.sys.ibm.pc.hardware       0.83      0.76      0.79        25
   comp.sys.mac.hardware       0.82      0.86      0.84        21
          comp.windows.x       0.91      0.80      0.85        25
            misc.forsale       1.00      0.61      0.76        18
               rec.autos       0.84      0.89      0.86        18
         rec.motorcycles       0.88      0.88      0.88        16
      rec.sport.baseball       0.79      0.83      0.81        18
        rec.sport.hockey       0.83      1.00      0.91        15
               sci.crypt       0.82      0.95      0.88        19
         sci.electronics       0.72      0.81      0.76        16
                 sci.med       0.88      0.88      0.88  

## Optimization Using Randomized Search

In [35]:
from sklearn.model_selection import RandomizedSearchCV

In [36]:
# Define the parameter grid for RandomizedSearchCV
param_grid = {
    'alpha': [0.0, 0.1, 0.5, 1.0, 1.5, 2.0],
    'fit_prior': [True, False]
}


In [37]:
# Initialize the Naive Bayes classifier
nb_classifier = MultinomialNB()

In [38]:
# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=nb_classifier, param_distributions=param_grid, n_iter=20, cv=5, n_jobs=-1, random_state=42)

In [39]:
# Fit RandomizedSearchCV to the training data
random_search.fit(X_train, y_train)



In [40]:
# Get the best estimator
best_nb_classifier = random_search.best_estimator_

In [41]:
# Evaluate the optimized model
accuracy_opt = best_nb_classifier.score(X_test, y_test)

In [42]:
print(random_search.best_params_)
print(accuracy_opt)

{'fit_prior': False, 'alpha': 2.0}
0.835


In [43]:
# Train the model with the best parameters
optimized_rs_classifier = MultinomialNB(fit_prior = False, alpha = 2.0)
optimized_rs_classifier.fit(X_train, y_train)

In [44]:
# Make predictions on the test set
y_rs_pred = optimized_rs_classifier.predict(X_test)

In [45]:
# Evaluate the optimized model
rs_accuracy = accuracy_score(y_test, y_optimized_pred)

In [46]:
rs_accuracy

0.8125

# 3. Sentiment Analysis

In [47]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import plotly.express as px

In [48]:
# Download the VADER lexicon
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [49]:
# Initialize the VADER sentiment analyzer
sid = SentimentIntensityAnalyzer()

In [50]:
# Function to categorize sentiment
def categorize_sentiment(score):
    if score >= 0.05:
        return 'Positive'
    elif score <= -0.05:
        return 'Negative'
    else:
        return 'Neutral'

In [51]:
# Perform sentiment analysis on the 'Data' column
data['Sentiment'] = data['Cleaned_Data'].apply(lambda x: sid.polarity_scores(x)['compound']).apply(categorize_sentiment)

In [52]:
# Examine the distribution of sentiments
sentiment_distribution = data['Sentiment'].value_counts(normalize=True) * 100

In [53]:
# Display the sentiment distribution
sentiment_distribution
# we can see that we got more positive comments and less negative comments.

Unnamed: 0_level_0,proportion
Sentiment,Unnamed: 1_level_1
Positive,67.8
Negative,29.05
Neutral,3.15


In [54]:
# Plot the distribution of sentiments
fig_sentiment_dist = px.bar(sentiment_distribution, x=sentiment_distribution.index, y=sentiment_distribution.values, labels={'x': 'Sentiment', 'y': 'Percentage'}, title='Distribution of Sentiments')

# Show the plot
fig_sentiment_dist.show()