In [2]:
import os
import glob
import json
import pandas as pd

# directories that hold the sentiment-processed tweet files (search API and streaming API)
paths = [
    '\\sentiment_processed_data\\search',
    '\\sentiment_processed_data\\streaming',
]

# news-outlet keywords; each one appears in the names of the tweet files collected for it
keywords = ['CNN', 'FoxNews', 'MSNBC', 'NPR', 'cspan']

In [None]:
# For each keyword, read every matching tweet file and draw a random sample of tweets.
# The sampled tweets are written to <keyword>_sample.json (one JSON object per line).
# The screen name of every sampled user is collected and written to users_list.csv.

SAMPLE_FRACTION = .07  # share of the filtered tweets kept for each keyword
SAMPLE_SEED = 1        # fixed seed so every run draws the same sample

unique_users = set()

for word in keywords:

    print("\nPROCESSING", word)
    keyword_tweets = []

    for path in paths:
        # glob returns each match with `path` already prepended, so the match is
        # opened as-is; joining it onto `path` a second time breaks relative paths
        for filename in glob.glob(os.path.join(path, '*' + word + "*.json")):
            print("Processing file:", filename)

            with open(filename, 'r') as f:
                # one JSON-encoded tweet per line; blank lines are skipped
                tweets = [json.loads(line) for line in f if line.strip()]
            print("loaded", len(tweets), "tweets")

            # remove tweets where the user doesn't follow anyone or isn't followed by anyone
            keyword_tweets.extend(
                twt for twt in tweets
                if twt['user']['friends_count'] > 0 and twt['user']['followers_count'] > 0
            )

    # get sample: draw positions into keyword_tweets rather than the tweets themselves
    sample_idx = pd.Series(range(len(keyword_tweets)))
    sample_idx = sample_idx.sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED)

    # write sample to .json
    with open(word + "_sample.json", 'w') as outFile:
        for idx in sample_idx:
            sampled_tweet = keyword_tweets[idx]
            json.dump(sampled_tweet, outFile)
            outFile.write('\n')

            # keep track of unique users (set.add already ignores duplicates)
            unique_users.add(sampled_tweet['user']['screen_name'])


# len() returns an int, so it has to be converted before concatenating with "\n"
print("\n" + str(len(unique_users)))

# sorted so that users_list.csv is identical from run to run (set order is not stable)
with open("users_list.csv", 'w') as f:
    for user in sorted(unique_users):
        f.write(user)
        f.write('\n')

In [None]:
# sanity check: reload users_list.csv (it has no header row) and display the result

test_unique_users = pd.read_csv(
    "users_list.csv",
    names=['screen_name'],
)
test_unique_users

In [None]:
# test *_sample.json was written correctly to the file
# loads the CNN sample and shows a few fields of each tweet in a DataFrame

with open('CNN_sample.json') as f:
    tweets = [json.loads(line) for line in f]

# Collect all rows first and build the frame once: growing a DataFrame row by row
# is quadratic, and DataFrame.append no longer exists in pandas 2.x.
# In the Twitter user object, followers_count is how many accounts follow the user
# and friends_count is how many accounts the user follows.
records = [
    {
        'user': str(twt['user']['screen_name']),
        'sentiment': twt['sentiment'],
        'text': twt['text'],
        'followers': twt['user']['followers_count'],
        'following': twt['user']['friends_count'],
    }
    for twt in tweets
]
data = pd.DataFrame(records, columns=['user', 'sentiment', 'text', 'followers', 'following'])

print(len(data))
data.head()

In [9]:
# load the unique screen names written earlier (users_list.csv has no header row)
# and the table of the most-followed political/news accounts on Twitter

users = (
    pd.read_csv("users_list.csv", names=['screen_name'])
    .loc[:, 'screen_name']
    .tolist()
)
print(len(users))

# source: https://socialblade.com/twitter/top/500/followers
top_followed = pd.read_csv("top_followed.csv")
top_followed

2022


Unnamed: 0,rank,screen_name,millions of followers
0,1,BarackObama,115
1,9,realDonaldTrump,76
2,16,cnnbrk,57
3,22,CNN,47
4,24,nytimes,46
5,30,BBCBreaking,42
6,39,NASA,36
7,47,PMOIndia,33
8,61,POTUS,29
9,69,BBCWorld,27


In [12]:
# set up twitter api
# Credentials are read from environment variables so that real keys never have to
# be typed into (and saved with) the notebook. A variable that is not set falls
# back to an empty string, which is the placeholder value used previously.

import tweepy
from tweepy import OAuthHandler
from tweepy import API

C_KEY = os.environ.get('TWITTER_CONSUMER_KEY', '')
C_SECRET = os.environ.get('TWITTER_CONSUMER_SECRET', '')
A_TOKEN_KEY = os.environ.get('TWITTER_ACCESS_TOKEN', '')
A_TOKEN_SECRET = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', '')

auth = tweepy.OAuthHandler(C_KEY, C_SECRET)
auth.set_access_token(A_TOKEN_KEY, A_TOKEN_SECRET)
api = tweepy.API(auth, wait_on_rate_limit=True)

In [None]:
# use api to record for each user which of the top 20 accounts they follow
# Each row written to user_following_data.csv is the user's screen name followed
# by one 0/1 flag per top account, in the row order of top_followed.

# 'a+' appends, so rows written by an earlier run are kept; the with-block closes
# the file even if the loop is interrupted by an error
with open("user_following_data.csv", 'a+') as user_following_data_outfile:

    # enumerate supplies the progress counter; the inner loop variable `i` does not
    # exist yet when the first user is printed
    for user_idx, user in enumerate(users):
        print(user_idx, user)

        bit_str = ''

        for i in range(20):
            target = top_followed['screen_name'][i]
            try:
                # does user follow target?
                friendship_test = api.show_friendship(source_screen_name=user, target_screen_name=target)

                if friendship_test[0].following:
                    bit_str += '1,'
                else:
                    bit_str += '0,'

            except tweepy.TweepError:
                # a failed lookup is recorded as "not following"
                print("Failed to run the command on user", user, "and", target, ", Skipping...")
                bit_str += '0,'

        # bit_str[:-1] drops the trailing comma
        user_following_data_outfile.write(user + "," + bit_str[:-1] + "\n")
        print(bit_str)

In [44]:
# read the follower data into a dictionary
# maps each screen name to the rest of its row in user_following_data.csv
# (the comma-separated 0/1 flags); a later row for the same name replaces an earlier one

name_bitstr_dict = {}

with open("user_following_data.csv", 'r') as f:
    for line in f:
        # strip only the line terminator: slicing off the last character would
        # lose a flag when the final line of the file has no trailing newline
        line = line.rstrip('\n')
        if not line:
            continue  # ignore blank lines

        # split at the first comma only; a row without a comma raises ValueError
        screen_name, bit_str = line.split(",", 1)

        name_bitstr_dict[screen_name] = bit_str

len(name_bitstr_dict)

1977

In [None]:
# using the sampled tweets and user following data, write final preprocessed data to a csv
# Each output row is the tweet's sentiment followed by its author's 0/1 follow flags.

news_outlet = "cspan" # change this and run for each keyword

# The sample is opened first so that a missing sample file does not truncate an
# existing *_class_data.csv; both files are closed automatically, even on error.
with open(news_outlet + '_sample.json') as f:
    tweets = [json.loads(line) for line in f]

    with open(news_outlet + "_class_data.csv", 'w') as final_outfile:
        for twt in tweets:
            screen_name = twt['user']['screen_name']

            if screen_name in name_bitstr_dict:
                outputline = twt['sentiment'] + "," + name_bitstr_dict[screen_name]
                final_outfile.write(outputline + "\n")
            else:
                # tweets whose author has no follow data are left out of the csv
                print("no followee data for", screen_name)

In [72]:
# sanity check: reload *_class_data.csv with readable column names and display it
# (the file has no header row: first column is the sentiment, then one column per top account)

headers = ['sentiment']
headers.extend(top_followed['screen_name'][rank] for rank in range(20))
class_data = pd.read_csv(news_outlet + "_class_data.csv", names=headers)

class_data

Unnamed: 0,sentiment,BarackObama,realDonaldTrump,cnnbrk,CNN,nytimes,BBCBreaking,NASA,PMOIndia,POTUS,...,HillaryClinton,TheEconomist,Reuters,WhiteHouse,FoxNews,WSJ,TIME,NPR,MSNBC,cspan
0,neutral,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,neutral,1,0,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,1,0
2,negative,0,1,0,0,0,1,1,0,1,...,0,0,0,1,0,0,0,0,0,0
3,neutral,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
4,neutral,0,1,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82,neutral,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
83,neutral,0,1,1,1,0,1,0,0,1,...,0,1,0,1,1,1,0,1,0,1
84,neutral,0,1,0,0,0,1,0,0,1,...,1,0,0,1,0,0,0,0,0,0
85,negative,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
