In [None]:
import pandas as pd
import csv

file_path = '/content/blogs.csv'

valid_lines = []
invalid_lines = []

# Read the file line by line and filter valid lines
with open(file_path, 'r', encoding='utf-8') as file:
    reader = csv.reader(file)
    for i, line in enumerate(reader):
        if len(line) == 2:  # Assuming there should be exactly 2 fields per line
            valid_lines.append(line)
        else:
            invalid_lines.append((i, line))

# Print invalid lines for debugging
print(f"Found {len(invalid_lines)} invalid lines.")
for line_num, line in invalid_lines:
    print(f"Line {line_num}: {line}")

# Create a DataFrame from valid lines
df = pd.DataFrame(valid_lines, columns=['Data', 'Labels'])

# Display the first few rows of the dataset
print(df.head())

# Check for any missing values
print(df.isnull().sum())

# Get basic statistics about the dataset
print(df.describe())



Found 1 invalid lines.
Line 1832: ['Xref: cantaloupe.srv.cs.cmu.edu alt.politics.libertarian:6652 talk.politics.misc:179033 alt.politics.usa.misc:2843\nNewsgroups: alt.politics.libertarian,talk.politics.misc,alt.politics.usa.misc\nPath: cantaloupe.srv.cs.cmu.edu!rochester!udel!darwin.sura.net!haven.umd.edu!uunet!infonode!jima.b17d.ingr.com!jwalbea\nFrom: jwalbea@jima.b17d.ingr.com (Jim Albea)\nSubject: Re: We\'re from the government and we\'re here to help you\nMessage-ID: <1993Apr23.213057.5207@infonode.ingr.com>\nSender: usenet@infonode.ingr.com (Usenet Administrator)\nOrganization: Intergraph Corporation, Huntsville, AL.\nReferences: <93096.28448.J056600@LMSC5.IS.LMSC.LOCKHEED.COM> <1993Apr8.200326.27560@infonode.ingr.com> <1993Apr18.192508.12442@isc-br.isc-br.com>\nDate: Fri, 23 Apr 1993 21:30:57 GMT\nLines: 95\n\nIn article <1993Apr18.192508.12442@isc-br.isc-br.com>, steveh@thor.isc-br.com (Steve Hendricks) writes:\n|> In article <1993Apr8.200326.27560@infonode.ingr.com> albeaj@ji

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# Download stopwords
nltk.download('stopwords')
nltk.download('punkt')

# Preprocessing function
def preprocess_text(text):
    # Remove punctuation and numbers
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    # Convert to lowercase
    text = text.lower()
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

# Apply the preprocessing function to the 'Data' column
df['Processed_Data'] = df['Data'].apply(preprocess_text)

# Display the first few rows of the processed data

print(df.head())


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                                Data       Labels  \
0                                               Data       Labels   
1  Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...  alt.atheism   
2  Newsgroups: alt.atheism\nPath: cantaloupe.srv....  alt.atheism   
3  Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...  alt.atheism   
4  Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...  alt.atheism   

                                      Processed_Data  
0                                               data  
1  path cantaloupesrvcscmuedumagnesiumclubcccmued...  
2  newsgroups altatheism path cantaloupesrvcscmue...  
3  path cantaloupesrvcscmuedudasnewsharvardedunoc...  
4  path cantaloupesrvcscmuedumagnesiumclubcccmued...  


In [None]:
# Initialize the TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform the data
X = tfidf_vectorizer.fit_transform(df['Processed_Data'])

# Extract the labels
y = df['Labels']


In [None]:
from sklearn.model_selection import train_test_split

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Initialize the Naive Bayes classifier
nb_classifier = MultinomialNB()

# Train the model
nb_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = nb_classifier.predict(X_test)


In [None]:
from textblob import TextBlob

# Function to get sentiment
def get_sentiment(text):
    analysis = TextBlob(text)
    if analysis.sentiment.polarity > 0:
        return 'positive'
    elif analysis.sentiment.polarity == 0:
        return 'neutral'
    else:
        return 'negative'

# Apply the sentiment function to the 'Data' column
df['Sentiment'] = df['Data'].apply(get_sentiment)

# Display the first few rows with sentiment
print(df.head())


                                                Data       Labels  \
0                                               Data       Labels   
1  Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...  alt.atheism   
2  Newsgroups: alt.atheism\nPath: cantaloupe.srv....  alt.atheism   
3  Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...  alt.atheism   
4  Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...  alt.atheism   

                                      Processed_Data Sentiment  
0                                               data   neutral  
1  path cantaloupesrvcscmuedumagnesiumclubcccmued...  positive  
2  newsgroups altatheism path cantaloupesrvcscmue...  negative  
3  path cantaloupesrvcscmuedudasnewsharvardedunoc...  positive  
4  path cantaloupesrvcscmuedumagnesiumclubcccmued...  positive  


In [None]:
# Group by category and sentiment
sentiment_distribution = df.groupby(['Labels', 'Sentiment']).size().unstack().fillna(0)

# Display the sentiment distribution
print(sentiment_distribution)


Sentiment                 negative  neutral  positive
Labels                                               
Labels                         0.0      1.0       0.0
alt.atheism                   23.0      0.0      77.0
comp.graphics                 24.0      0.0      76.0
comp.os.ms-windows.misc       22.0      0.0      78.0
comp.sys.ibm.pc.hardware      20.0      0.0      80.0
comp.sys.mac.hardware         24.0      0.0      76.0
comp.windows.x                27.0      0.0      73.0
misc.forsale                  16.0      0.0      84.0
rec.autos                     17.0      0.0      83.0
rec.motorcycles               26.0      0.0      74.0
rec.sport.baseball            29.0      0.0      71.0
rec.sport.hockey              34.0      0.0      66.0
sci.crypt                     19.0      0.0      81.0
sci.electronics               19.0      0.0      81.0
sci.med                       29.0      0.0      71.0
sci.space                     27.0      0.0      73.0
soc.religion.christian      

In [None]:
# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Display the evaluation metrics
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1}')


Accuracy: 0.8746594005449592
Precision: 0.8805225216338421
Recall: 0.8746594005449592
F1-Score: 0.8712085523393069


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Summarize the sentiment analysis results
sentiment_summary = df.groupby('Sentiment').size()

# Display the sentiment summary
print(sentiment_summary)


Sentiment
negative     428
neutral        1
positive    1403
dtype: int64


The output suggests sentiment skew towards positive responses, with a minimal occurrence of neutral sentiment and a notable number of negative sentiments.