<h2> Importing Libraries </h2>

In [None]:
# Data handling, regex, plotting and NLP dependencies for the notebook.
import pandas as pd
import re 
import os
import numpy as np
import seaborn as sns
from tqdm.notebook import tqdm
import nltk
# Download the NLTK "words" corpus that is read into `words` below.
nltk.download('words')
import text2emotion as te
# Vocabulary from the NLTK "words" corpus, held as a set.
# NOTE(review): `words` is not referenced in the cells visible here -- confirm it is still needed.
words = set(nltk.corpus.words.words())

In [None]:
# Import only the name that is used: a wildcard import from transformers pulls
# a very large number of names into the notebook namespace and can silently
# shadow names defined in earlier cells.
from transformers import pipeline

# Sentiment-analysis pipeline with the library's default model (no model is
# named here). Later cells compare its labels against "POSITIVE"/"NEGATIVE".
sentiment = pipeline('sentiment-analysis')

<h2> Loading Data Files </h2>

In [None]:
# Read the second tweet export and discard its "Unnamed: 0" column
# (presumably a row index written out when the CSV was saved -- confirm).
data1 = pd.read_csv("../Data/farmbill2.csv").drop(columns=["Unnamed: 0"])
data1.head()

In [None]:
# Read the first tweet export and discard its "Unnamed: 0" column
# (presumably a row index written out when the CSV was saved -- confirm).
data = pd.read_csv("../Data/farmbill.csv").drop(columns=["Unnamed: 0"])
data.head()

In [None]:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported equivalent. Row order (data1 first, then data) and the
# original index labels are kept exactly as append produced them.
data = pd.concat([data1, data])

In [None]:
# Renumber the rows 0..n-1; the previous index labels are kept as an
# "index" column.
data = data.reset_index()
data.head()

In [None]:
# Remove the "index" column from the frame.
data = data.drop(columns=['index'])

In [None]:
# Parse the Date column into pandas datetimes, then show the resulting dtype.
parsed_dates = pd.to_datetime(data['Date'])
data['Date'] = parsed_dates
data['Date'].dtype

<h2> Data Cleaning </h2>

In [None]:
# Raw text of the tweet at index label 0.
data.loc[0, 'Tweet']

In [None]:
def clean_txt(input_txt, pattern):
    """Normalise one tweet into lowercase, space-separated word tokens.

    Steps, in order: remove hashtags, remove every match of ``pattern``,
    remove URLs, lowercase, strip punctuation, collapse whitespace, remove
    digits, and drop tokens of two characters or fewer.

    Parameters
    ----------
    input_txt : str
        Raw tweet text. Any non-string value (e.g. the NaN pandas uses for a
        missing cell) yields an empty string instead of raising.
    pattern : str
        Regular expression whose matches are deleted, e.g. r"@[\w]*" for
        @mentions.

    Returns
    -------
    str
        The cleaned text; may be empty.
    """
    if not isinstance(input_txt, str):
        return ''

    # Hashtags go first, tag text included.
    text = re.sub(r'#[\w]*', '', input_txt)

    # Delete all matches in a single pass. Re-using each matched string as a
    # new regex (findall + sub per match) let a short match such as "@ab", or
    # a bare "@", eat part of a longer handle and leave its tail in the text.
    text = re.sub(pattern, '', text)

    # Remove links while "://" is still present so http:// links are caught
    # as well as https:// ones.
    text = re.sub(r'https?://\S+', '', text, flags=re.IGNORECASE)

    text = re.sub(r'[^\w\s]', '', text.lower())
    text = re.sub(r'\s+', ' ', text)
    # Kept from the original: drops any remaining token starting with "https".
    text = re.sub(r'https[\w]*', '', text)
    text = ''.join(ch for ch in text if not ch.isdigit())
    return ' '.join(token for token in text.split() if len(token) > 2)
# Sanity check on the first tweet. The raw string avoids the invalid "\w"
# escape that a normal string literal triggers (a SyntaxWarning on recent
# Python versions).
clean_txt(data['Tweet'][0], r"@[\w]*")

In [None]:
# Clean every tweet, stripping @mentions. Series.apply replaces np.vectorize,
# which is a plain Python loop underneath and raises on zero-length input
# unless otypes is set. The raw string avoids the invalid "\w" escape of a
# normal string literal.
data['Clean Tweet'] = data['Tweet'].apply(clean_txt, args=(r"@[\w]*",))
data.head()

In [None]:
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()

# Stem every token of every cleaned tweet and join the stems back into one
# string per tweet. The previous loop assigned the stem to its loop variable
# only, so the token lists were never changed and the column was written back
# unstemmed.
# NOTE: every later cell that reads 'Clean Tweet' (word clouds, sentiment
# labels) now receives stemmed text.
all_tokens = [
    ' '.join(stemmer.stem(token) for token in tweet.split())
    for tweet in data['Clean Tweet']
]
data['Clean Tweet'] = all_tokens

In [None]:
data.head()

<h2> Data Analysis </h2>

In [None]:
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

# Horizontal count plot of the ten most frequent tweet sources.
sns.set()
fig, ax = plt.subplots(figsize=(11.7, 8.27))
top_sources = data['Tweet Source'].value_counts().iloc[:10].index
g = sns.countplot(y="Tweet Source", data=data, order=top_sources, ax=ax)
plt.show()

In [None]:
# Horizontal count plot of locations ranked 2nd to 11th by frequency.
# NOTE(review): the most frequent value is skipped by the [1:11] slice --
# presumably a blank/placeholder location; confirm against the data.
sns.set()
fig, ax = plt.subplots(figsize=(11.7, 8.27))
top_locations = data['Location'].value_counts().iloc[1:11].index
g = sns.countplot(y="Location", data=data, order=top_locations, ax=ax)
plt.show()

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# One long string made of every cleaned tweet, separated by spaces.
wcloud = ' '.join(data['Clean Tweet'])
wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(wcloud)

# Render the cloud without axes and write it to disk.
plt.figure(figsize=(15, 8))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.savefig("../Images/wcloud.png")
plt.show()

<h2> Getting Sentiment Scores </h2>

In [None]:
def get_scores(text):
    """Return the 'label' of the first result the `sentiment` pipeline gives for `text`."""
    first_result = sentiment(text)[0]
    return first_result['label']

In [None]:
# Cleaned text of the tweet at index label 0.
data.loc[0, 'Clean Tweet']

In [None]:
# Run the sentiment pipeline on each cleaned tweet, one at a time and in row
# order, then store the labels in the 'SS' column.
ss = list(map(get_scores, data['Clean Tweet']))
data['SS'] = ss
data.head()

<h2> Analysis Based on the Sentiment Label </h2>

In [None]:
# Number of tweets per sentiment label.
sns.set()
fig, ax = plt.subplots(figsize=(11.7, 8.27))
sns.countplot(y=data['SS'], ax=ax)
plt.show()

In [None]:
# Cleaned tweets split by label, in row order; rows with any other label go
# into neither list.
is_positive = data['SS'] == "POSITIVE"
is_negative = data['SS'] == "NEGATIVE"
pos = data.loc[is_positive, 'Clean Tweet'].tolist()
neg = data.loc[is_negative, 'Clean Tweet'].tolist()

In [None]:
# Word cloud built from the POSITIVE-labelled cleaned tweets.
poscloud = ' '.join(pos)
wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(poscloud)

plt.figure(figsize=(15, 8))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.savefig("../Images/decpcloud.png")
plt.show()

In [None]:
# Word cloud built from the NEGATIVE-labelled cleaned tweets.
negcloud = ' '.join(neg)
wordcloud = WordCloud(
    width=800,
    height=500,
    random_state=21,
    max_font_size=110,
).generate(negcloud)

plt.figure(figsize=(15, 8))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.savefig("../Images/decncloud.png")
plt.show()

In [None]:
# Tweets per calendar day, split by sentiment label.
sns.set()
fig, ax = plt.subplots()
fig.set_size_inches(11.7, 8.27)
# Pass the dates as x=...: seaborn >= 0.12 treats the first positional
# argument as `data`, and countplot then rejects `hue` without x/y.
g = sns.countplot(x=data['Date'].dt.date, hue=data['SS'], ax=ax)
# Rotate the tick labels BEFORE saving; previously the PNG was written first,
# so the saved image still had the unrotated labels.
plt.xticks(rotation=90)
plt.savefig("../Images/SAdec.png")
plt.show()

In [None]:
def hashtag_extract(x):
    """Extract the hashtags of each tweet in `x`, skipping the excluded ones.

    `x` is an iterable of tweet strings. The result has one list per tweet,
    holding that tweet's hashtag texts (without '#', original casing) whose
    lowercase form is not in `not_consider`.
    """
    not_consider=['farmbills','farmersbill','farmersbill2020','farmers','farmerprotest','farmerprotestd','farmersprotests','farmersprotest','indianfarmersrevolution2020']
    # One inner list per tweet; r"#(\w+)" never yields an empty tag, so no
    # separate empty-string filter is needed.
    return [
        [tag for tag in re.findall(r"#(\w+)", tweet) if tag.lower() not in not_consider]
        for tweet in x
    ]


In [None]:
# Hashtags from tweets labelled POSITIVE (one list per tweet).
HT_regular = hashtag_extract(data.loc[data['SS'] == "POSITIVE", 'Tweet'])

# Hashtags from tweets labelled NEGATIVE (one list per tweet).
HT_negative = hashtag_extract(data.loc[data['SS'] == "NEGATIVE", 'Tweet'])
pos_h=[]
# Flatten the per-tweet lists into one list each. sum(lists, []) builds a new
# list at every step (quadratic in the number of tweets); a comprehension
# gives the same order in linear time.
HT_regular = [tag for tags in HT_regular for tag in tags]
HT_negative = [tag for tags in HT_negative for tag in tags]

In [None]:
# Frequency of each hashtag found in POSITIVE tweets.
a = nltk.FreqDist(HT_regular)
d = pd.DataFrame({
    'Hashtag': list(a.keys()),
    'Count': list(a.values()),
})
# Keep the ten most frequent hashtags.
d = d.nlargest(10, "Count")

plt.figure(figsize=(16, 5))
ax = sns.barplot(x="Hashtag", y="Count", data=d)
ax.set_ylabel('Count')
plt.xticks(rotation=90)
plt.savefig("../Images/phashtags.png")
plt.show()

In [None]:
# Frequency of each hashtag found in NEGATIVE tweets.
b = nltk.FreqDist(HT_negative)
e = pd.DataFrame({
    'Hashtag': list(b.keys()),
    'Count': list(b.values()),
})
# Keep the ten most frequent hashtags.
e = e.nlargest(10, "Count")

plt.figure(figsize=(16, 5))
ax = sns.barplot(x="Hashtag", y="Count", data=e)
ax.set_ylabel('Count')
plt.xticks(rotation=90)
plt.savefig("../Images/nhashtags.png")
plt.show()