In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords

In [2]:
# Download stopwords if not already downloaded
nltk.download('stopwords')

# Load the dataset
data=pd.read_csv("Twitter_Data.csv")

# Preprocess the text (including stopword removal)
stop_words = set(stopwords.words('english'))  # Customize stopwords if needed

def remove_stopwords(text):
    if isinstance(text, str):
        words = text.split()
        filtered_words = [word for word in words if word not in stop_words]
        return ' '.join(filtered_words).strip()
    else:
        # Handle non-string values here (e.g., return empty string)
        return ''

data['clean_text'] = data['clean_text'].apply(remove_stopwords)


[nltk_data] Downloading package stopwords to C:\Users\Zeel
[nltk_data]     soni\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
data = data.dropna(subset=['category']) 
#data = data.dropna(subset=['clean_text']) 

In [4]:
data

Unnamed: 0,clean_text,category
0,modi promised “minimum government maximum gove...,-1.0
1,talk nonsense continue drama vote modi,0.0
2,say vote modi welcome bjp told rahul main camp...,1.0
3,asking supporters prefix chowkidar names modi ...,1.0
4,answer among powerful world leader today trump...,1.0
...,...,...
162975,456 crores paid neerav modi recovered congress...,-1.0
162976,dear rss terrorist payal gawar modi killing 10...,-1.0
162977,cover interaction forum left,0.0
162978,big project came india modi dream project happ...,0.0


In [5]:
data.sample(10)

Unnamed: 0,clean_text,category
2082,dont want blame power merely 5560 years indepe...,-1.0
67035,modi violates moral code conduct sleeping post...,0.0
146467,conversation cab driver amazed see innocence w...,0.0
8717,condition finances govt upa 2019 rahul gandhi ...,-1.0
16506,even today press conference said modi govt alr...,0.0
17742,subramanian swamy well said neither modi jetle...,0.0
24359,yeah let’ start nit picking agree point congre...,1.0
123733,economics elections modi wins buy mutual funds...,1.0
140680,pliss modi charismatic lets pace,1.0
45354,meanwhile mamta banerjee preparing give dharna...,-1.0


In [6]:
# Create a CountVectorizer object
vectorizer1 = CountVectorizer(binary=True)
vectorizer2=CountVectorizer(binary=False)
# Fit and transform the text data
X1 = vectorizer1.fit_transform(data['clean_text'])
X2=  vectorizer2.fit_transform(data['clean_text'])
y = data['category']

In [7]:
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y, test_size=0.2, random_state=42)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y, test_size=0.2, random_state=42)

In [8]:
model1=BernoulliNB()
model1.fit(X_train1,y_train1)

In [9]:
model2=MultinomialNB()
model2.fit(X_train2,y_train2)

In [10]:
prediction1=model1.predict(X_test1)
prediction2=model2.predict(X_test2)

In [11]:
accuracy_score(y_test1,prediction1) #countVectorizer binary=True

0.7568031906734162

In [12]:
accuracy_score(y_test2,prediction2) #countVectorizer binary=False

0.7246816996471851

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
model=make_pipeline(TfidfVectorizer(), MultinomialNB())

In [14]:
xtrain,xtest,ytrain,ytest=train_test_split(data['clean_text'],data['category'],test_size=0.25,random_state=12)

In [15]:
model.fit(xtrain,ytrain)

In [16]:
prediction3=model.predict(xtest)

In [17]:
accuracy_score(ytest,prediction3) #Tfidf vectorizer

0.5765266051443158

Conclusion- For the twitter sentiment analysis Count Vectorizer where binary=True, Bernoulli model should be used as it give more accuracy over Multinomial model and TFIDF vectorizer

In [18]:
# Handling user input with BernoulliNB model
user_input = input("Enter the tweet: ")

# Preprocess the user input
user_input_cleaned = remove_stopwords(user_input)

# Transform the user input using vectorizer1
user_input_transformed = vectorizer1.transform([user_input_cleaned])

# Predict the category using model1
user_prediction = model1.predict(user_input_transformed)
print(f"The predicted category is: {user_prediction[0]}")

Enter the tweet:  Talk nonsense 


The predicted category is: 0.0


In [19]:
# Handling user input with BernoulliNB model
user_input = input("Enter the tweet: ")

# Preprocess the user input
user_input_cleaned = remove_stopwords(user_input)

# Transform the user input using vectorizer1
user_input_transformed = vectorizer1.transform([user_input_cleaned])

# Predict the category using model1
user_prediction = model1.predict(user_input_transformed)
print(f"The predicted category is: {user_prediction[0]}")

Enter the tweet:  this is good tweet


The predicted category is: 1.0
