In [None]:
"""
The sentiment analysis code was taken from https://www.geeksforgeeks.org/twitter-sentiment-analysis-using-python/
and modified to analyze our pre-collected Twitter data.
"""

import re 
import tweepy 
from tweepy import OAuthHandler 
from textblob import TextBlob 
import json
import pandas as pd

  
# Matches, in order: @mentions, any single character that is not an ASCII
# letter, digit, space or tab, and URLs of the form scheme://... .
# Written as a raw string so the regex escapes (\w, \/, \S) are not parsed as
# invalid Python string escapes, which current Python versions warn about.
_TWEET_NOISE_RE = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)")


def clean_tweet(tweet):
    '''
    Utility function to clean tweet text by removing @mentions, links and
    special characters using a simple regex.

    Every match of the noise pattern is replaced by a single space, then the
    result is split on whitespace and re-joined with single spaces, so the
    returned string has no leading, trailing or repeated whitespace.

    Parameters:
        tweet -- the raw tweet text (str).

    Returns:
        The cleaned text (str); empty if nothing but noise was present.
    '''
    return ' '.join(_TWEET_NOISE_RE.sub(" ", tweet).split())

def get_tweet_sentiment(tweet):
    '''
    Label a tweet as 'positive', 'neutral' or 'negative'.

    The text is first run through clean_tweet and then scored with TextBlob;
    the sign of the resulting polarity selects the label.
    '''
    # score the cleaned text once and branch on the sign of the polarity
    polarity = TextBlob(clean_tweet(tweet)).sentiment.polarity
    if polarity > 0:
        return 'positive'
    if polarity == 0:
        return 'neutral'
    return 'negative'

def get_sentiment_tweets(filename):
    '''
    Load pre-collected tweets from a file and tag the kept ones with sentiment.

    Each non-blank line of the file is parsed as one JSON object. Objects
    without a 'text' field are reported on stdout and skipped. A tweet is kept
    when its 'lang' is 'en' and it either has a 'retweet_count' of 0 or its
    text has not already been kept from this file. Every kept tweet gains a
    'sentiment' key holding the label returned by get_tweet_sentiment.

    Parameters:
        filename -- path to a file with one JSON object per line.

    Returns:
        A list of the kept tweet dicts, in file order.

    Raises:
        OSError -- if the file cannot be opened.
        json.JSONDecodeError -- if a non-blank line is not valid JSON.
    '''
    # No tweepy call is made here, so the former `except tweepy.TweepError`
    # handler could never fire; file and JSON errors propagate to the caller
    # exactly as they did before.
    with open(filename, encoding='utf-8') as f:
        # skip blank lines, which json.loads would reject
        fetched_tweets = [json.loads(line) for line in f if line.strip()]

    tweets = []
    # texts already kept, held in a set so the duplicate check is O(1)
    seen_texts = set()

    for tweet in fetched_tweets:
        if 'text' not in tweet:
            print('no text field in:\n', tweet)
            continue

        # this sentiment analysis only works on english tweets
        if tweet.get('lang') != 'en':
            continue

        text = tweet['text']
        # also throw out repeated text unless the tweet has no retweets
        if tweet.get('retweet_count', 0) == 0 or text not in seen_texts:
            # score only the tweets that are actually kept
            tweet['sentiment'] = get_tweet_sentiment(text)
            tweets.append(tweet)
            seen_texts.add(text)

    return tweets

In [None]:
# preprocessing search api data

keyword = 'cspan' # change this and run for each of the keywords
# NOTE(review): assumes every keyword uses the same raw_data/<keyword>/<keyword>_search_*
# layout as cspan -- confirm before switching keywords

print(keyword + " SUMMARY:\n")

m = '02'
d = 20 # should be 15 for CNN, FoxNews, and MSNBC, but 20 for cspan and NPR
n_days = 20 # should be 25 for CNN, FoxNews, and MSNBC, but 20 for cspan and NPR

total_pos = 0
total_neg = 0
total = 0


def _pct_of_total(part, whole):
    '''Return part as a percentage of whole, or 0.0 when whole is 0.'''
    return part / whole * 100 if whole else 0.0


# the with-block closes the output file even if loading a raw file fails part-way
with open("sentiment_processed_data/search/" + keyword + "_sentiment.json", 'w') as outFile:
    for i in range(n_days):
        filename = "raw_data/" + keyword + "/" + keyword + "_search_2020-" + m + "-{:02d}".format(d) + '.json'
        d = d + 1
        # roll over from February 29th (2020 is a leap year) to March 1st
        if m == '02' and d == 30:
            m = '03'
            d = 1

        this_file_pos = 0
        this_file_neg = 0

        sentiment_tweets = get_sentiment_tweets(filename)

        # write one JSON object per line and tally the labels as we go
        for twt in sentiment_tweets:
            json.dump(twt, outFile)
            outFile.write('\n')

            if twt['sentiment'] == 'positive':
                this_file_pos += 1
            elif twt['sentiment'] == 'negative':
                this_file_neg += 1

        print("file:", filename)
        print("valid tweets:", len(sentiment_tweets))
        print("positive tweets:", this_file_pos)
        print("negative tweets:", this_file_neg)
        print("neutral tweets:", len(sentiment_tweets) - this_file_pos - this_file_neg)
        print("\n")

        total += len(sentiment_tweets)
        total_pos += this_file_pos
        total_neg += this_file_neg

total_neutral = total - total_pos - total_neg

# percentages fall back to 0.0 instead of raising when no tweet survived filtering
print("FINAL RESULTS")
print("valid tweets:", total)
print("positive tweets:", total_pos, "=", _pct_of_total(total_pos, total), '%')
print("negative tweets:", total_neg, "=", _pct_of_total(total_neg, total), '%')
print("neutral tweets:", total_neutral, "=", _pct_of_total(total_neutral, total), '%')

In [None]:
# test *_sentiment.json was written correctly to the file

# read the file back from the directory the search preprocessing cell writes to
with open('sentiment_processed_data/search/cspan_sentiment.json') as f: # change this filename for each of the keywords
    tweets = [json.loads(line) for line in f]

# Build every row first and construct the frame once: DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row is quadratic.
data = pd.DataFrame(
    [
        {
            'user': str(twt['user']['screen_name']),
            'sentiment': twt['sentiment'],
            'text': twt['text'],
        }
        for twt in tweets
    ],
    columns=['user', 'sentiment', 'text'],
)

print(len(data))
data.head()

In [None]:
# preprocess streaming data
print("STREAMING SUMMARY:\n")

files = ["streaming_2020-03-03-0930.json", "streaming_2020-03-03-1300.json", "streaming_2020-03-03-1700.json", "streaming_2020-03-03-2230.json"]
keywords = ['CNN', 'FoxNews', 'MSNBC', 'NPR', 'cspan']

# one list of matching tweets per keyword, in the same order as `keywords`
tweet_lists = [[] for _ in keywords]

for filename in files:

    sentiment_tweets = get_sentiment_tweets("raw_data/streaming/" + filename)

    for twt in sentiment_tweets:
        # case-sensitive substring match; a tweet naming several keywords
        # is added to each matching list
        for kw, matches in zip(keywords, tweet_lists):
            if kw in twt['text']:
                matches.append(twt)


# total number of keyword matches (a tweet counts once per keyword it matched)
total_tweets = sum(len(matches) for matches in tweet_lists)


def _pct(part, whole):
    '''Return part as a percentage of whole, or 0.0 when whole is 0.'''
    return part / whole * 100 if whole else 0.0


# now write all CNN to a file, all FoxNews to a different file, etc
for kw, matches in zip(keywords, tweet_lists):
    this_keyword_pos = 0
    this_keyword_neg = 0

    # the with-block closes the file even if serialising a tweet fails
    with open("sentiment_processed_data/streaming/" + kw + "_streaming_sentiment.json", 'w') as outFile:
        for twt in matches:
            json.dump(twt, outFile)
            outFile.write('\n')

            if twt['sentiment'] == 'positive':
                this_keyword_pos += 1
            elif twt['sentiment'] == 'negative':
                this_keyword_neg += 1

    n_matches = len(matches)
    this_keyword_neutral = n_matches - this_keyword_pos - this_keyword_neg

    # a keyword with no matching tweets reports 0.0% instead of dividing by zero
    print(kw)
    print("valid tweets:", n_matches, "("+str(_pct(n_matches, total_tweets))+"% of total)")
    print("positive tweets:", this_keyword_pos, "("+str(_pct(this_keyword_pos, n_matches))+"%)")
    print("negative tweets:", this_keyword_neg, "("+str(_pct(this_keyword_neg, n_matches))+"%)")
    print("neutral tweets:", this_keyword_neutral, "("+str(_pct(this_keyword_neutral, n_matches))+"%)")
    print("\n")