In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import re
import nltk
import string,re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score

In [None]:
# Where the raw tweet export lives and the encoding it is read with.
TWEETS_CSV_PATH = "/content/demonetization-tweets_data.csv"
TWEETS_CSV_ENCODING = 'ISO-8859-1'

# Load the tweets into `df`, then show the first rows as the cell output.
df = pd.read_csv(TWEETS_CSV_PATH, encoding=TWEETS_CSV_ENCODING)
df.head()

Unnamed: 0,text,favorited,favoriteCount,replyToSN,created,truncated,replyToSID,statusSource,screenName,retweetCount,isRetweet,retweeted
0,RT @rssurjewala: Critical question: Was PayTM ...,False,0,,11/23/2016 18:40,False,,"<a href=""http://twitter.com/download/android"" ...",HASHTAGFARZIWAL,331,True,False
1,"RT @roshankar: Former FinSec, RBI Dy Governor,...",False,0,,11/23/2016 18:40,False,,"<a href=""http://twitter.com/download/android"" ...",rahulja13034944,12,True,False
2,RT @satishacharya: Reddy Wedding! @mail_today ...,False,0,,11/23/2016 18:39,False,,"<a href=""http://cpimharyana.com"" rel=""nofollow...",CPIMBadli,120,True,False
3,RT @gauravcsawant: Rs 40 lakh looted from a ba...,False,0,,11/23/2016 18:38,False,,"<a href=""http://twitter.com/download/android"" ...",bhodia1,637,True,False
4,RT @sumitbhati2002: Many opposition leaders ar...,False,0,,11/23/2016 18:38,False,,"<a href=""http://twitter.com/download/android"" ...",sumitbhati2002,1,True,False


In [None]:
# Quick sanity check: (number of rows, number of columns) of the loaded frame.
df.shape

(7470, 12)

In [None]:
# Fetch the NLTK 'punkt' / 'punkt_tab' tokenizer resources.
# NOTE(review): presumably these back the word_tokenize call in clean_text — confirm.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Fetch the NLTK stop-word corpus; clean_text reads stopwords.words('english').
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Matches a leading retweet header such as "RT @someuser: ".
_RETWEET_PREFIX = re.compile(r'^\s*RT\s+@\w+\s*:?\s*')
# Matches a whole http/https link up to the next whitespace.
_URL = re.compile(r'https?://\S*')
# Translation table that deletes every ASCII punctuation character.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
# Lazily filled cache so the stop-word list is read only once per kernel.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the English stop words as a frozenset, loading them on first use."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def clean_text(tweets):
    """Clean one raw tweet for sentiment scoring.

    Steps, in order:
      1. Remove a leading ``RT @user:`` header *only if present*. The previous
         version unconditionally dropped the first four tokens, which also
         deleted real words from tweets that are not retweets.
      2. Remove whole ``http(s)://...`` links. The previous version only
         deleted the literal text ``https`` and left the rest of the link
         behind as a junk token.
      3. Tokenize with ``word_tokenize`` and re-join with single spaces.
      4. Delete any leftover literal ``https`` (kept from the original, e.g.
         for link fragments that lack ``://``) and all ASCII punctuation.
      5. Drop English stop words (case-insensitive match).

    Parameters
    ----------
    tweets : str
        The raw tweet text. Any non-string value (for example a missing
        cell) yields an empty string instead of raising inside the tokenizer.

    Returns
    -------
    str
        The remaining words joined by single spaces; original casing is kept.
    """
    if not isinstance(tweets, str):
        return ""

    text = _RETWEET_PREFIX.sub('', tweets)
    text = _URL.sub('', text)
    text = " ".join(word_tokenize(text))
    text = re.sub('https', '', text)
    text = text.translate(_PUNCT_TABLE)

    # Set membership instead of rebuilding and scanning the stop-word list
    # once per word, as the original comprehension did.
    stop_words = _english_stopwords()
    kept = [word for word in text.split() if word.lower() not in stop_words]
    return " ".join(kept)

In [None]:
# Run every raw tweet through clean_text and store the result as a new column,
# then preview the first few cleaned tweets.
cleaned_column = df['text'].map(clean_text)
df['cleaned_text'] = cleaned_column
df['cleaned_text'].head()

0    Critical question PayTM informed Demonetizatio...
1    Former FinSec RBI Dy Governor CBDT Chair Harva...
2    Reddy Wedding mailtoday cartoon demonetization...
3    Rs 40 lakh looted bank Kishtwar J amp K Third ...
4    Many opposition leaders narendramodi Demonetiz...
Name: cleaned_text, dtype: object

In [None]:
def _normalize_feature(text):
    """Normalise one cleaned tweet into lowercase, whitespace-separated words.

    Steps, in order: replace every non-word character with a space, drop
    single letters surrounded by whitespace, drop a single letter at the very
    start of the string, collapse runs of whitespace, drop a leading ``b``
    token, and lowercase the result.
    """
    text = re.sub(r'\W', ' ', str(text))
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    # Fix: the pattern used to be r'\^[a-zA-Z]\s+', i.e. a *literal* caret.
    # No caret can survive the \W replacement above, so the step never fired.
    # Anchoring with an unescaped ^ removes a single leading letter as intended.
    text = re.sub(r'^[a-zA-Z]\s+', '', text)
    # The re.I flag previously passed here had no effect on \s+ and is dropped.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'^b\s+', '', text)
    return text.lower()


features = df['cleaned_text']
# Iterate over the values directly: the old `features[i]` lookup was
# label-based and only worked while the frame kept a default 0..n-1 index.
processed_features = [_normalize_feature(value) for value in features]

In [None]:
# Preview the first five normalised tweets.
processed_features[:5]

['critical question paytm informed demonetization edict pm clearly fishy requires full disclosure amp',
 'former finsec rbi dy governor cbdt chair harvard professor lambaste demonetization aam aadmi listen th',
 'reddy wedding mailtoday cartoon demonetization reddywedding tcou7glnrq31f',
 'rs 40 lakh looted bank kishtwar amp third incident since demonetization terrorists',
 'many opposition leaders narendramodi demonetization respect decision support oppositio']

In [None]:
# Store the normalised tweets (one list entry per row of df) as a new column.
df['processed_text'] = processed_features

In [None]:
def generate_polarity(text):
    """Return TextBlob's ``sentiment`` value for ``text``.

    Despite the function name, the whole sentiment result is returned, not
    just the polarity: the notebook later reads element 0 as the polarity and
    element 1 as the subjectivity.
    """
    return TextBlob(text).sentiment

In [None]:
# Score every normalised tweet and wrap the resulting Series in a one-column
# frame (the column keeps the Series name, 'processed_text'), then preview it.
sentiment = df['processed_text'].apply(generate_polarity).to_frame()
sentiment.head()

Unnamed: 0,processed_text
0,"(0.15, 0.5777777777777778)"
1,"(0.0, 0.0)"
2,"(0.0, 0.0)"
3,"(0.0, 0.0)"
4,"(0.5, 0.5)"


In [None]:
# Each entry of sentiment['processed_text'] is a two-element sentiment result:
# element 0 goes to 'polarity', element 1 to 'subjectivity'.
sentiment_pairs = sentiment['processed_text']
sentiment['polarity'] = sentiment_pairs.map(lambda pair: pair[0])
sentiment['subjectivity'] = sentiment_pairs.map(lambda pair: pair[1])

In [None]:
def _encode_polarity(score):
    """Label a polarity score: 'positive' above zero, 'negative' below zero, otherwise 'neutral'."""
    if score > 0:
        return 'positive'
    if score < 0:
        return 'negative'
    return 'neutral'


# Copy the numeric scores onto the main frame and derive a three-way label.
df['polarity'] = sentiment['polarity']
df['subjectivity'] = sentiment['subjectivity']
df['polarity_encoded'] = df['polarity'].map(_encode_polarity)

In [None]:
# Show how many tweets fall into each sentiment label.
df['polarity_encoded'].value_counts()

neutral     3720
positive    2648
negative    1102
Name: polarity_encoded, dtype: int64

In [None]:
# idxmax()/idxmin() return index *labels*, so the rows are fetched with the
# label-based .loc. The previous .iloc lookup is position-based and only gave
# the right row while df kept its default 0..n-1 index.
most_positive = df.loc[df['polarity'].idxmax(), 'processed_text']
most_negative = df.loc[df['polarity'].idxmin(), 'processed_text']
most_subjective = df.loc[df['subjectivity'].idxmax(), 'processed_text']
most_objective = df.loc[df['subjectivity'].idxmin(), 'processed_text']

print("The most positive tweet:", most_positive)
print("The most negative tweet:", most_negative)

print("The most subjective tweet:", most_subjective)
print("The most objective tweet:", most_objective)

The most positive tweet: one greatest computer scientists dr vijay bhatkar views demonetization decision hon pm narendramodi h
The most negative tweet: pathetic journalism media thought get stds atms another attempt malign demonetization tco
The most subjective tweet: demonetization harbhajansingh gives hilarious shagun suggestion struggling wedding season
The most objective tweet: former finsec rbi dy governor cbdt chair harvard professor lambaste demonetization aam aadmi listen th


In [None]:
# Keep only what the classifier needs: the normalised text and its label.
MODEL_COLUMNS = ['processed_text', 'polarity_encoded']
tweets = df[MODEL_COLUMNS]
tweets.head()

Unnamed: 0,processed_text,polarity_encoded
0,critical question paytm informed demonetizatio...,positive
1,former finsec rbi dy governor cbdt chair harva...,neutral
2,reddy wedding mailtoday cartoon demonetization...,neutral
3,rs 40 lakh looted bank kishtwar amp third inci...,neutral
4,many opposition leaders narendramodi demonetiz...,positive


In [None]:
# A single TweetTokenizer with default options, shared by every tokenize() call.
_tweet_tokenizer = TweetTokenizer()


def tokenize(text):
    """Split ``text`` into a list of tokens using the default TweetTokenizer."""
    return _tweet_tokenizer.tokenize(text)


# Word-level, unigram count vectorizer that delegates tokenisation to tokenize().
vectorizer = CountVectorizer(
    analyzer='word',
    tokenizer=tokenize,
    lowercase=True,
    ngram_range=(1, 1),
)

In [None]:
# Learn the vocabulary from all processed tweets and build the count matrix.
# NOTE(review): `count` is not referenced again in the visible cells, and
# `vectorizer` is rebound to a TfidfVectorizer further down — confirm whether
# this cell is still needed.
count= vectorizer.fit_transform(tweets['processed_text'])



In [None]:
# Inputs are the normalised tweets; targets are their sentiment labels.
X = tweets['processed_text'].values
y = tweets['polarity_encoded'].values

# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split repeatable between runs.
split_parts = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=100,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# TF-IDF features with max_features=1000. The vectorizer is fitted on the
# training split only and then reused, unchanged, to transform the test split.
vectorizer = TfidfVectorizer(max_features=1000)
X_train_idf = vectorizer.fit_transform(X_train)
X_test_idf = vectorizer.transform(X_test)

# One row per vocabulary term with its IDF weight; show the largest weights.
df_idf = pd.DataFrame(
    {"idf_weights": vectorizer.idf_},
    index=vectorizer.get_feature_names_out(),
)
df_idf.sort_values(by="idf_weights", ascending=False).head()

Unnamed: 0,idf_weights
ysrcp,7.770407
u092c,7.482725
oppn,7.364942
lakhs,7.364942
lets,7.364942


In [None]:
# Train a multinomial Naive Bayes classifier (default settings) on the TF-IDF
# features of the training split.
mnb = MultinomialNB()
mnb.fit(X_train_idf, y_train)

In [None]:
# Predict labels for the held-out tweets and report accuracy as a one-row table.
pred_mnb = mnb.predict(X_test_idf)
acc = accuracy_score(y_test, pred_mnb)
results = pd.DataFrame({
    'Model': ['Multinomial Naive Bayes'],
    'Accuracy': [acc],
})

print(results)

                     Model  Accuracy
0  Multinomial Naive Bayes    0.8639
