In [1]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import StandardScaler

# data processing/manipulation
pd.options.mode.chained_assignment = None
import re

# data visualization
import matplotlib.pyplot as plt
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# stopwords, tokenizer, stemmer
import nltk  
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.probability import FreqDist

# spell correction, lemmatization
from textblob import TextBlob
from textblob import Word

# sklearn
from sklearn.model_selection import train_test_split
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory


In [2]:
import tensorflow as tf
# Report how many GPU devices TensorFlow can see.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  1


In [3]:
# Load both raw tweet exports with '\n' forced as the line terminator.
trump_df, biden_df = (
    pd.read_csv(csv_name, lineterminator='\n')
    for csv_name in ('hashtag_donaldtrump.csv', 'hashtag_joebiden.csv')
)

In [4]:
# Alternate country spellings mapped to the canonical name used for filtering.
d = {"United States of America": "United States"}


def prepare_us_tweets(df):
    """Return a cleaned copy of a raw tweet frame restricted to US tweets.

    Steps, in order: drop the identifier/user-profile columns, rename the
    engagement columns, normalise the country name via ``d``, keep only rows
    whose country is "United States", then drop every row that still has a
    missing value in any column.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw tweet frame as loaded from the CSV export.

    Returns
    -------
    pandas.DataFrame
        New frame; the input is not modified.
    """
    # Remove unneeded columns. errors='ignore' keeps the cell re-runnable
    # after the columns have already been dropped.
    df = df.drop(columns=['tweet_id', 'user_id', 'user_name', 'user_screen_name',
                          'user_description', 'user_join_date', 'collected_at'],
                 errors='ignore')

    # Renaming columns
    df = df.rename(columns={"likes": "Likes", "retweet_count": "Retweets",
                            "state": "State", "user_followers_count": "Followers"})

    # Update United States country name for consistency.
    # Assign the result back instead of calling replace(inplace=True) on the
    # column selection: that is chained assignment, which pandas may apply to
    # a temporary copy and leave the frame unchanged.
    df['country'] = df['country'].replace(d)

    df = df.loc[df['country'] == "United States"]

    # Drop null rows
    return df.dropna()


trump_df = prepare_us_tweets(trump_df)
biden_df = prepare_us_tweets(biden_df)

In [5]:
# Everything matched by this pattern is replaced by a space: runs of digits,
# links starting with http/https, and runs of characters that are not ASCII
# letters or digits. (The previous "http?" made the "p" optional rather than
# the "s", so it also swallowed any token beginning with "htt".)
to_remove = r'\d+|https?\S+|[^A-Za-z0-9]+'
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

# Function to preprocess tweet 
def clean_tweet(tweet, stem=False, lemmatize=False):
    """Turn a raw tweet into a list of lowercase, stopword-free tokens.

    Parameters
    ----------
    tweet : str
        Raw tweet text. Any non-string value (e.g. a missing value) yields
        an empty token list instead of raising.
    stem : bool, default False
        Apply the Porter stemmer to every kept token.
    lemmatize : bool, default False
        Lemmatize every kept token with TextBlob's ``Word``. Ignored when
        ``stem`` is also true (stemming takes precedence).

    Returns
    -------
    list of str
        Tokens remaining after lowercasing, pattern removal, tokenisation
        and English stopword removal.
    """
    if not isinstance(tweet, str):
        return []

    # Make all text lowercase
    tweet = tweet.lower()

    # Remove links, special characters, punctuation, numbers, etc.
    tweet = re.sub(to_remove, ' ', tweet)

    # Pick the per-token normaliser once instead of re-testing the flags for
    # every word.
    if stem:
        normalise = ps.stem
    elif lemmatize:
        def normalise(token):
            return Word(token).lemmatize()
    else:
        def normalise(token):
            return token

    # Remove stopwords and normalise what is left
    return [normalise(token) for token in word_tokenize(tweet)
            if token not in stop_words]

In [6]:
# Tokenise the Biden tweets into a new 'tweetNew' column.
# The matching trump_df step is left disabled below, so this cell does not
# give trump_df a 'tweetNew' column.
#trump_df['tweetNew'] = trump_df.tweet.apply(lambda x: clean_tweet(x))
biden_df['tweetNew'] = biden_df['tweet'].apply(clean_tweet)

In [12]:
def sentiment_analysis(df):
    """Score every tokenised tweet with TextBlob and label its sentiment.

    Adds three columns to ``df`` in place and returns the same frame:

    * ``Polarity`` / ``Subjectivity`` - TextBlob's sentiment scores for the
      space-joined tokens in ``tweetNew``.
    * ``Sentiment`` - 1.0 when polarity is positive, 0.0 when it is exactly
      zero, -1.0 when it is negative.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``tweetNew`` column holding a list of tokens per row.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the three columns added.
    """
    # Run TextBlob once per tweet and reuse the result for both scores,
    # instead of analysing the same text twice.
    scores = [TextBlob(' '.join(tokens)).sentiment for tokens in df['tweetNew']]

    # Determine polarity and subjectivity (assigned positionally as floats)
    df['Polarity'] = np.array([score.polarity for score in scores], dtype=float)
    df['Subjectivity'] = np.array([score.subjectivity for score in scores], dtype=float)

    # Classify overall sentiment: the sign of the polarity is the label
    df['Sentiment'] = np.sign(df['Polarity'])

    return df

In [13]:
# Score the Biden tweets. sentiment_analysis adds its columns to the frame it
# is given and returns that same frame, so joe_tweet_senti and biden_df refer
# to the same object.
joe_tweet_senti = sentiment_analysis(biden_df)

In [14]:
# Display the scored Biden tweets (tokens, Polarity, Subjectivity, Sentiment).
joe_tweet_senti

Unnamed: 0,created_at,tweet,Likes,Retweets,source,Followers,user_location,lat,long,city,country,continent,State,state_code,tweetNew,Polarity,Subjectivity,Sentiment
6,2020-10-15 00:00:25,"In 2020, #NYPost is being #censorship #CENSORE...",0.0,0.0,Twitter for iPhone,1397.0,"Chicago, Illinois",41.875562,-87.624421,Chicago,United States,North America,Illinois,IL,"[nypost, censorship, censored, twitter, manipu...",-0.148810,0.678571,-1.0
17,2020-10-15 00:01:23,"Comments on this? ""Do Democrats Understand how...",0.0,0.0,Twitter Web App,83.0,"Tampa, Florida",27.947760,-82.458444,Tampa,United States,North America,Florida,FL,"[comments, democrats, understand, ruthless, ch...",-1.000000,1.000000,-1.0
22,2020-10-15 00:01:47,Twitter is doing everything they can to help D...,1.0,0.0,Twitter for iPhone,2.0,"Hollywood, FL",34.098003,-118.329523,Los Angeles,United States,North America,California,CA,"[twitter, everything, help, democrats, win, el...",0.175000,0.522222,1.0
25,2020-10-15 00:01:57,@RealJamesWoods #BidenCrimeFamily #JoeBiden #H...,0.0,0.0,Twitter for Android,29.0,"Los Angeles, CA",34.053691,-118.242766,Los Angeles,United States,North America,California,CA,"[realjameswoods, bidencrimefamily, joebiden, h...",0.000000,0.000000,0.0
29,2020-10-15 00:02:06,Come on @ABC PLEASE DO THE RIGHT THING. Move t...,0.0,0.0,Twitter Web App,166.0,"New York, NY",40.712728,-74.006015,New York,United States,North America,New York,NY,"[come, abc, please, right, thing, move, biden,...",0.078571,0.178571,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
776836,2020-11-08 23:55:24,"#Biden 🗽🇺🇸👍🏽 | Images 📷 @ Santa Maria, CA. | ...",1.0,0.0,Twitter for iPhone,8881.0,LOS ANGELES,34.053691,-118.242766,Los Angeles,United States,North America,California,CA,"[biden, images, santa, maria, ca, wethepeopleh...",0.000000,0.000000,0.0
776845,2020-11-08 23:56:15,Will #criticalRaceTheory become ubiquitous in ...,0.0,0.0,Twitter Web App,12606.0,"Philadelphia, PA",39.952724,-75.163526,Philadelphia,United States,North America,Pennsylvania,PA,"[criticalracetheory, become, ubiquitous, biden...",0.000000,0.000000,0.0
776847,2020-11-08 23:56:21,You moving near #Biden 🤔 https://t.co/1F6i1YIJ2P,0.0,0.0,Twitter for iPhone,803.0,Philadelphia PA,39.952724,-75.163526,Philadelphia,United States,North America,Pennsylvania,PA,"[moving, near, biden]",0.100000,0.400000,1.0
776861,2020-11-08 23:58:09,#election #2020Elections #trump #biden https:/...,0.0,0.0,Twitter for iPhone,1092.0,"New York, USA",40.712728,-74.006015,New York,United States,North America,New York,NY,"[election, elections, trump, biden]",0.000000,0.000000,0.0


In [11]:
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
%matplotlib inline

In [12]:
# trump_df only carries 'tweetNew' and 'Sentiment' if the cleaning and scoring
# steps were also run on it. Build whichever is missing here so this cell
# works on a fresh kernel instead of relying on leftover kernel state.
if 'tweetNew' not in trump_df.columns:
    trump_df['tweetNew'] = trump_df['tweet'].apply(clean_tweet)
if 'Sentiment' not in trump_df.columns:
    trump_df = sentiment_analysis(trump_df)

# The classifier ends in a single sigmoid unit and is trained with binary
# cross-entropy, which needs 0/1 targets. Sentiment takes the values -1/0/1,
# so keep only the positive and negative tweets and encode them as
# 1 (positive) / 0 (negative).
trump_labelled = trump_df.loc[trump_df['Sentiment'] != 0]
X = trump_labelled['tweetNew']
y = (trump_labelled['Sentiment'] > 0).astype('int32')

In [13]:
# Length every id sequence is brought to by pad_sequences.
max_len = 500

# Fit a tokenizer (num_words=2000) on the token lists, then map each tweet
# to a sequence of integer ids.
tok = Tokenizer(num_words=2000)
tok.fit_on_texts(X)
sequences = tok.texts_to_sequences(X)

sequences_matrix = sequence.pad_sequences(
    sequences,
    maxlen=max_len,
)

In [14]:
# Hold out 30% of the padded sequences for testing; the seed is fixed.
X_train, X_test, Y_train, Y_test = train_test_split(
    sequences_matrix,
    y,
    test_size=0.3,
    random_state=2,
)

In [15]:
def tensorflow_based_model():
    """Build the (uncompiled) LSTM tweet classifier.

    Layer stack, in order: input of ``max_len`` token ids -> Embedding(2000, 50)
    -> LSTM(64) -> Dense(256, 'FC1') -> ReLU -> Dropout(0.5)
    -> Dense(1, 'out_layer') -> sigmoid.

    Reads the module-level ``max_len`` for the input length.

    Returns
    -------
    Model
        Keras functional model mapping the 'inputs' tensor to a single
        sigmoid output.
    """
    token_ids = Input(name='inputs', shape=[max_len])

    # Token ids -> embeddings -> one LSTM summary vector per tweet
    embedded = Embedding(2000, 50, input_length=max_len)(token_ids)
    encoded = LSTM(64)(embedded)

    # Fully connected block with ReLU and dropout
    hidden = Dense(256, name='FC1')(encoded)
    hidden = Activation('relu')(hidden)
    hidden = Dropout(0.5)(hidden)

    # Single output unit squashed by a sigmoid (positive vs. negative tweet)
    score = Dense(1, name='out_layer')(hidden)
    probability = Activation('sigmoid')(score)

    return Model(inputs=token_ids, outputs=probability)

In [16]:
# Build the LSTM classifier and configure it for training.
model = tensorflow_based_model()
model.compile(
    optimizer=RMSprop(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [17]:
# Train inside a '/GPU:0' device scope; 10% of the training data is held out
# for validation each epoch and the History object is kept in `history`.
with tf.device('/GPU:0'):
    history = model.fit(
        X_train,
        Y_train,
        batch_size=80,
        epochs=6,
        validation_split=0.1,
    )

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [18]:
# Evaluate the trained model on the held-out tweet split inside a '/GPU:0'
# device scope. The return value of evaluate() is not stored.
with tf.device('/GPU:0'):
    model.evaluate(X_test, Y_test)



In [21]:
# Load final_df.csv; when parsing numbers ',' is treated as the thousands
# separator and '.' as the decimal point.
data = pd.read_csv("final_df.csv", thousands=',', decimal='.')

In [25]:
# Display the loaded table.
data

Unnamed: 0,Rep2020,Rep2016,"Less than a high school diploma, 2015-19","High school diploma only, 2015-19","Some college or associate's degree, 2015-19","Bachelor's degree or higher, 2015-19","Percent of adults with less than a high school diploma, 2015-19","Percent of adults with a high school diploma only, 2015-19","Percent of adults completing some college or associate's degree, 2015-19","Percent of adults with a bachelor's degree or higher, 2015-19",...,CI90UB517P_2019,MEDHHINC_2019,CI90LBINC_2019,CI90UBINC_2019,Civilian_labor_force_2019,Employed_2019,Unemployed_2019,Unemployment_rate_2019,Median_Household_Income_2019,Med_HH_Income_Percent_of_State_Total_2019
1001,1,1,4291,12551,10596,9929,11.5,33.6,28.4,26.6,...,19.4,58233,52517,63949,26172,25458,714,2.7,58233,112.5
1003,1,1,13893,41797,47274,48148,9.2,27.7,31.3,31.9,...,17.2,59871,54593,65149,97328,94675,2653,2.7,59871,115.6
1005,1,1,4812,6396,4676,2080,26.8,35.6,26.0,11.6,...,49.0,35972,31822,40122,8537,8213,324,3.8,35972,69.5
1007,1,1,3386,7256,3848,1678,20.9,44.9,23.8,10.4,...,32.7,47918,42291,53545,8685,8419,266,3.1,47918,92.6
1009,1,1,7763,13299,13519,5210,19.5,33.4,34.0,13.1,...,25.8,52902,46777,59027,25331,24655,676,2.7,52902,102.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56037,1,1,2017,9239,10415,6291,7.2,33.0,37.2,22.5,...,11.1,80639,73437,87841,21274,20446,828,3.9,80639,121.9
56039,0,0,834,2577,4037,9875,4.8,14.9,23.3,57.0,...,6.7,98837,86531,111143,15575,15151,424,2.7,98837,149.4
56041,1,1,941,5383,4562,2078,7.3,41.5,35.2,16.0,...,11.1,70756,63191,78321,9035,8682,353,3.9,70756,107.0
56043,1,1,568,1650,2031,1297,10.2,29.8,36.6,23.4,...,17.4,55122,50050,60194,3941,3786,155,3.9,55122,83.3


In [26]:

# Prepare features and target for the Rep2020 classifier.
data.index.name = None

# Drop every column that contains at least one missing value. The name is
# rebound instead of mutating in place.
data = data.dropna(axis='columns')

X = data.drop(columns=['Rep2020'])
Y = data["Rep2020"]

# Fixed seed so the 75/25 split, and therefore the reported metrics, are
# reproducible across runs (the tweet split earlier is seeded the same way).
x_in, x_out, y_in, y_out = train_test_split(X, Y, test_size=0.25, random_state=2)

# Fit the scaler on the training part only, then apply it to both parts.
sc = StandardScaler()
x_in = sc.fit_transform(x_in)
x_out = sc.transform(x_out)

In [38]:
# Set up layers
# Take the input width from the training matrix instead of hard-coding 50, so
# the model still builds when the number of surviving feature columns changes.
n_features = x_in.shape[1]
inputs = Input(shape=(n_features,))
# NOTE(review): the hidden layer has a single sigmoid unit feeding another
# single sigmoid unit - confirm this bottleneck is intentional.
x = Dense(1, activation='sigmoid')(inputs)
predictions = Dense(1, activation='sigmoid')(x)

# Set up model
model = Model(inputs=inputs, outputs=predictions)
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train for 50 epochs inside a '/GPU:0' device scope
with tf.device('/GPU:0'):
    model.fit(x_in,y_in,epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [39]:
# Evaluate the model on the held-out (scaled) split inside a '/GPU:0' device
# scope. The return value of evaluate() is not stored.
with tf.device('/GPU:0'):
    model.evaluate(x_out, y_out)

