In [32]:
import numpy as np
import pandas as pd
import spacy
import re
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as snb
from wordcloud import WordCloud, STOPWORDS
import math
from scipy.stats import chi2_contingency, chisquare, chi2
import nltk  
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.probability import FreqDist
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.model_selection import GridSearchCV
import tensorflow as tf
import pycld2 as cld2
from textblob import TextBlob
from textblob import Word

In [2]:
def clean_tweet_text_regex(tweet):
    tweet = tweet.lower() # to lower case
    tweet = re.sub(r"@[A-Za-z0-9]+","", tweet) # Remove @Mentions
    tweet = re.sub(r"#","", tweet) # remove # \ART[\s]+
    tweet = re.sub(r"\ART[\s]+","", tweet) # Remove RT in start
    tweet = re.sub(r"https?:\/\/\S+","", tweet) # Remove hyperlink
    tweet = re.sub("(.)\\1{2,}","\\1", tweet) # Remove more than one characters.
    tweet = re.sub(r"[^A-Za-z0-9\s]+", "",str(tweet)) #remove special characters "!"
    
    return tweet

In [12]:
def detect_lang(tweet):
    try:
        isReliable, textBytesFound, details = cld2.detect(tweet)
        return details[0][0]
    except:
        return "not found"

In [5]:
joe_df = pd.read_csv('hashtag_joebiden.csv', lineterminator='\n')

In [13]:
joe_df["tweetNew"] = joe_df["tweet"].apply(clean_tweet_text_regex)

In [16]:
train_langs = joe_df["tweetNew"].apply(detect_lang)

In [17]:
train_langs.value_counts()

ENGLISH       526887
Unknown       118878
SPANISH        32578
FRENCH         22488
GERMAN         19271
               ...  
FIJIAN             1
NYANJA             1
VIETNAMESE         1
SUNDANESE          1
FAROESE            1
Name: tweetNew, Length: 100, dtype: int64

In [18]:
joe_df["Lang"] = train_langs

In [25]:
joe_df_ng = joe_df.loc[joe_df['Lang'] == "ENGLISH"]

In [26]:
joe_df_ng

Unnamed: 0,created_at,tweet_id,tweet,likes,retweet_count,source,user_id,user_name,user_screen_name,user_description,...,lat,long,city,country,continent,state,state_code,collected_at,tweetNew,Lang
2,2020-10-15 00:00:20,1.316529e+18,@IslandGirlPRV @BradBeauregardJ @MeidasTouch T...,0.0,0.0,Twitter Web App,3.494182e+09,Flag Waver,Flag_Wavers,,...,46.304036,-109.171431,,United States of America,North America,Montana,MT,2020-10-21 00:00:01.035654566,this is how biden made his trumpisnotamerica \n,ENGLISH
3,2020-10-15 00:00:21,1.316529e+18,@chrislongview Watching and setting dvr. Let’s...,0.0,0.0,Twitter for iPhone,8.242596e+17,Michelle Ferg,MichelleFerg4,,...,,,,,,,,2020-10-21 00:00:01.553481849,watching and setting dvr lets give him bonus ...,ENGLISH
4,2020-10-15 00:00:22,1.316529e+18,#censorship #HunterBiden #Biden #BidenEmails #...,1.0,0.0,Twitter Web App,1.032807e+18,the Gold State,theegoldstate,A Silicon Valley #independent #News #Media #St...,...,36.701463,-118.755997,,United States of America,North America,California,CA,2020-10-21 00:00:02.071309132,censorship hunterbiden biden bidenemails biden...,ENGLISH
5,2020-10-15 00:00:23,1.316529e+18,"""IS THIS WRONG??!!"" Cory Booker's BRILLIANT Fi...",0.0,0.0,Twitter Web App,3.057279e+08,Dose of Dissonance,Dose_Dissonance,YOUTUBE CHANNEL: https://t.co/EFnKm5gnvV MERCH...,...,,,,,,,,2020-10-21 00:00:02.589136415,is this wrong cory bookers brilliant final que...,ENGLISH
6,2020-10-15 00:00:25,1.316529e+18,"In 2020, #NYPost is being #censorship #CENSORE...",0.0,0.0,Twitter for iPhone,1.994033e+07,Change Illinois | Biden will increase taxes by...,changeillinois,"Illinois, home of Lincoln and Reagan, used to ...",...,41.875562,-87.624421,Chicago,United States of America,North America,Illinois,IL,2020-10-21 00:00:03.106963698,in 2020 nypost is being censorship censored by...,ENGLISH
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
776874,2020-11-08 23:59:06,1.325589e+18,@SamRamani2 #Biden camps announcement on Idlib...,1.0,0.0,Twitter Web App,8.169274e+17,TurkishFacts4u,TurkishFacts4U,ᴄᴏᴜɴᴛᴇʀɪɴɢ ᴘᴇʀᴄᴇᴘᴛɪᴏɴ ᴍᴀɴᴀɢᴇᴍᴇɴᴛ ᴄᴀᴍᴘᴀɪɢɴꜱ ᴀɢᴀ...,...,,,,,,,,2020-11-09 18:32:45.811602,biden camps announcement on idlib is merely a...,ENGLISH
776876,2020-11-08 23:59:16,1.325589e+18,"Mr. #Biden, tear down that wall (with #Mexico)...",1.0,0.0,Twitter for iPhone,3.772296e+08,Tim Welch,TimFWelch,transportation nerd; data geek.\nsenior lectur...,...,-36.852095,174.763180,Auckland,New Zealand,Oceania,Auckland,AUK,2020-11-09 18:32:45.760347,mr biden tear down that wall with mexico\nwhat...,ENGLISH
776877,2020-11-08 23:59:32,1.325589e+18,NYT: #BeratAlbayrak’s departure may also signa...,3.0,0.0,Twitter for iPhone,2.646308e+07,Cagil M. Kasapoglu,CagilKasapoglu,Journalist | BBC WS | @bbcturkce | Eyes on 🌍 |...,...,,,,,,,,2020-11-09 18:32:45.595167,nyt beratalbayraks departure may also signal a...,ENGLISH
776878,2020-11-08 23:59:33,1.325589e+18,@staceyabrams Thank you for all your support a...,2.0,1.0,Twitter for Android,2.252249e+08,"#BidenHarris2020 Amen, God Bless America",tabup23,,...,,,,,,,,2020-11-09 18:32:45.880153,thank you for all your support and hard work ...,ENGLISH


In [28]:
to_remove = r'\d+|http?\S+|[^A-Za-z0-9]+'
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

# Function to preprocess tweet 
def clean_tweet(tweet, stem=False, lemmatize=False):
        
    filtered_tweet = []
    words = word_tokenize(tweet) 

    # Remove stopwords and stem
    for word in words:
        if not word in stop_words:
            if stem:
                filtered_tweet.append(ps.stem(word))
            elif lemmatize:
                filtered_tweet.append(Word(word).lemmatize())
            else:
                filtered_tweet.append(word)
            
    return filtered_tweet

In [29]:
joe_df_ng['tweetNew'] = joe_df_ng.tweetNew.apply(lambda x: clean_tweet(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  joe_df_ng['tweetNew'] = joe_df_ng.tweetNew.apply(lambda x: clean_tweet(x))


In [30]:
def sentiment_analysis(df):
    
    # Determine polarity and subjectivity
    df['Polarity'] = df['tweetNew'].apply(lambda x: TextBlob(' '.join(x)).sentiment.polarity)
    df['Subjectivity'] = df['tweetNew'].apply(lambda x: TextBlob(' '.join(x)).sentiment.subjectivity)
    
    # Classify overall sentiment
    df.loc[df.Polarity > 0,'Sentiment'] = 1
    df.loc[df.Polarity == 0,'Sentiment'] = 0
    df.loc[df.Polarity < 0,'Sentiment'] = -1
    
    return df

In [33]:

joe_tweet_senti = sentiment_analysis(joe_df_ng)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Polarity'] = df['tweetNew'].apply(lambda x: TextBlob(' '.join(x)).sentiment.polarity)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Subjectivity'] = df['tweetNew'].apply(lambda x: TextBlob(' '.join(x)).sentiment.subjectivity)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = i

In [34]:
joe_tweet_senti

Unnamed: 0,created_at,tweet_id,tweet,likes,retweet_count,source,user_id,user_name,user_screen_name,user_description,...,country,continent,state,state_code,collected_at,tweetNew,Lang,Polarity,Subjectivity,Sentiment
2,2020-10-15 00:00:20,1.316529e+18,@IslandGirlPRV @BradBeauregardJ @MeidasTouch T...,0.0,0.0,Twitter Web App,3.494182e+09,Flag Waver,Flag_Wavers,,...,United States of America,North America,Montana,MT,2020-10-21 00:00:01.035654566,"[biden, made, trumpisnotamerica]",ENGLISH,0.000000,0.000000,0.0
3,2020-10-15 00:00:21,1.316529e+18,@chrislongview Watching and setting dvr. Let’s...,0.0,0.0,Twitter for iPhone,8.242596e+17,Michelle Ferg,MichelleFerg4,,...,,,,,2020-10-21 00:00:01.553481849,"[watching, setting, dvr, lets, give, bonus, ra...",ENGLISH,0.000000,0.000000,0.0
4,2020-10-15 00:00:22,1.316529e+18,#censorship #HunterBiden #Biden #BidenEmails #...,1.0,0.0,Twitter Web App,1.032807e+18,the Gold State,theegoldstate,A Silicon Valley #independent #News #Media #St...,...,United States of America,North America,California,CA,2020-10-21 00:00:02.071309132,"[censorship, hunterbiden, biden, bidenemails, ...",ENGLISH,0.000000,0.000000,0.0
5,2020-10-15 00:00:23,1.316529e+18,"""IS THIS WRONG??!!"" Cory Booker's BRILLIANT Fi...",0.0,0.0,Twitter Web App,3.057279e+08,Dose of Dissonance,Dose_Dissonance,YOUTUBE CHANNEL: https://t.co/EFnKm5gnvV MERCH...,...,,,,,2020-10-21 00:00:02.589136415,"[wrong, cory, bookers, brilliant, final, quest...",ENGLISH,0.133333,0.966667,1.0
6,2020-10-15 00:00:25,1.316529e+18,"In 2020, #NYPost is being #censorship #CENSORE...",0.0,0.0,Twitter for iPhone,1.994033e+07,Change Illinois | Biden will increase taxes by...,changeillinois,"Illinois, home of Lincoln and Reagan, used to ...",...,United States of America,North America,Illinois,IL,2020-10-21 00:00:03.106963698,"[2020, nypost, censorship, censored, twitter, ...",ENGLISH,-0.148810,0.678571,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
776874,2020-11-08 23:59:06,1.325589e+18,@SamRamani2 #Biden camps announcement on Idlib...,1.0,0.0,Twitter Web App,8.169274e+17,TurkishFacts4u,TurkishFacts4U,ᴄᴏᴜɴᴛᴇʀɪɴɢ ᴘᴇʀᴄᴇᴘᴛɪᴏɴ ᴍᴀɴᴀɢᴇᴍᴇɴᴛ ᴄᴀᴍᴘᴀɪɢɴꜱ ᴀɢᴀ...,...,,,,,2020-11-09 18:32:45.811602,"[biden, camps, announcement, idlib, merely, ai...",ENGLISH,-0.500000,0.500000,-1.0
776876,2020-11-08 23:59:16,1.325589e+18,"Mr. #Biden, tear down that wall (with #Mexico)...",1.0,0.0,Twitter for iPhone,3.772296e+08,Tim Welch,TimFWelch,transportation nerd; data geek.\nsenior lectur...,...,New Zealand,Oceania,Auckland,AUK,2020-11-09 18:32:45.760347,"[mr, biden, tear, wall, mexico, whats, never, ...",ENGLISH,0.311111,0.611111,1.0
776877,2020-11-08 23:59:32,1.325589e+18,NYT: #BeratAlbayrak’s departure may also signa...,3.0,0.0,Twitter for iPhone,2.646308e+07,Cagil M. Kasapoglu,CagilKasapoglu,Journalist | BBC WS | @bbcturkce | Eyes on 🌍 |...,...,,,,,2020-11-09 18:32:45.595167,"[nyt, beratalbayraks, departure, may, also, si...",ENGLISH,0.000000,0.000000,0.0
776878,2020-11-08 23:59:33,1.325589e+18,@staceyabrams Thank you for all your support a...,2.0,1.0,Twitter for Android,2.252249e+08,"#BidenHarris2020 Amen, God Bless America",tabup23,,...,,,,,2020-11-09 18:32:45.880153,"[thank, support, hard, work, biden, harris, ti...",ENGLISH,-0.291667,0.541667,-1.0
