
Connect to Google Drive.

In [0]:
# Mount Google Drive at /content/gdrive so that the later cell can open the
# Tweet sample stored under "/content/gdrive/My Drive/...".
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Access the data and determine the following for each Tweet:
*   Whether the Tweet is related to climate change
*   Whether the Tweet is sentimental or not

In [0]:
import requests, re, nltk
nltk.download('stopwords')
from collections import Counter
import operator

import json

# Tokens that carry no signal for the classification and are dropped from the
# per-Tweet word counts: NLTK's English stop words, followed by punctuation
# marks, a few short fragments ('ut', 'rt', "'re", "'s"), the empty string and
# the em dash (U+2014).
stop_words = list(nltk.corpus.stopwords.words('english'))
stop_words.extend([
    'ut', '\'re', '.', ',', '--', '\'s', '?', ')', '(', ':', '\'',
    '\"', '-', '}', '{', '&', '|', 'rt', '', u'\u2014',
])

#-------------------------------------------------------------------------------------------------------------------

# We also want to remove special characters, quotes, etc. from each word
def cleanWord(w):
    """Strip punctuation from a token and blank out purely numeric tokens.

    Parameters
    ----------
    w : str
        A single token (word) taken from a Tweet.

    Returns
    -------
    str
        ``w`` with every occurrence of the characters , " . ' & | : @ > * ; / =
        removed. If what remains consists only of digits and dots (or is
        empty), the empty string is returned instead.
    """
    # Raw strings (r'...') hand the backslashes to the regex engine untouched;
    # the previous non-raw literals relied on invalid escape sequences such as
    # '\.' which newer Python versions warn about.
    # Any character listed between the brackets [] is removed.
    wn = re.sub(r'[,".\'&|:@>*;/=]', "", w)
    # Get rid of tokens that are nothing but numbers.
    return re.sub(r'^[0-9.]*$', "", wn)
  
#-------------------------------------------------------------------------------------------------------------------
  
def TweetAnalyze(path="/content/gdrive/My Drive/ECE-545 - Fundamentals of Digital Archeology/sample_vannostrand.json",
                 debug=True):
    """Classify the Tweets stored in a JSON file with keyword heuristics and print totals.

    The file is expected to contain a JSON array of Tweet dictionaries. Only
    Tweets whose ``lang`` field equals ``'en'`` are analysed. For each of them
    the text (``extended_tweet.full_text`` when present, otherwise ``text``) is
    tokenised, cleaned with ``cleanWord``, lowercased, stemmed with NLTK's
    Snowball stemmer, and stripped of ``stop_words``. The remaining distinct
    stems are matched against the keyword lists below to decide whether the
    Tweet is about climate change, about a natural disaster, accepts or denies
    climate change, and whether its sentiment is positive or negative.

    Parameters
    ----------
    path : str, optional
        Location of the JSON file. Defaults to the sample file on the mounted
        Google Drive.
    debug : bool, optional
        When true (the default, matching the previous behaviour) the word
        counter of every analysed Tweet is printed.

    Returns
    -------
    None
        The six totals are printed rather than returned.

    Notes
    -----
    Tweets mentioning Trump / the president are handled specially: when they
    are also about climate change they are counted as acceptance with negative
    sentiment; otherwise they do not contribute to the belief or sentiment
    totals. The check previously compared against 'Trump' and 'president',
    which can never equal a lowercased, stemmed token, so this rule was
    silently inactive; it now uses ``presidentWords``.
    """
    # obtain .json file data; the context manager closes the file even if
    # json.load raises
    with open(path) as file:
        data = json.load(file)

    climateChangeCount = 0    # total number of Tweets specific to climate change
    naturalDisasterCount = 0  # total number of Tweets specific to natural disasters
    changeAccept = 0          # total number of Tweets specific to climate change acceptance
    changeDeny = 0            # total number of Tweets specific to climate change denial
    posSentCount = 0          # total number of Tweets with positive sentiment words
    negSentCount = 0          # total number of Tweets with negative sentiment words

    # - - - - - - - - - - - - - - - - - - - - - - - - - - - -

    # Sets of words to determine Tweet classification.
    # ALL WORDS ARE LOWERCASE and must be given AS STEMS, because they are
    # compared against stemmed tokens.
    climateWords = {'climat', 'chang', 'global', 'warm'}
    # 'earthquak' is what the Snowball stemmer is expected to produce for
    # "earthquake" (TODO confirm against NLTK); the unstemmed spelling is kept
    # as well so nothing that matched before stops matching.
    disastersWords = {'hurrican', 'earthquake', 'earthquak', 'tornado'}
    acceptWords = {'real', 'believ', 'deterior', 'grave', 'danger', 'ignor', 'warn'}
    denyWords = {'fake', 'hoax', 'deni', 'deny', 'doubt', 'scheme', 'scam'}
    posSentWords = {'hope', 'wish', 'best', 'pray', 'share', 'great', 'well', 'offer', 'resolv'}
    negSentWords = {'hate', 'stupid', 'offens', 'fear', 'fuck', 'crazi', 'shit', 'wast',
                    'disgrac', 'scold', 'anxieti'}

    # noticed most regarding Trump are mocking him for not believing in climate change
    presidentWords = {'trump', 'presid'}

    # The stemmer does not depend on the Tweet, so build it once instead of
    # once per Tweet.
    snowball = nltk.stem.SnowballStemmer('english')

    # -----------------------------------------------------------------------------------------------------

    # NOTE: each element of data is a dictionary describing one Tweet
    for dataDict in data:

        # Currently can only check Tweets in English. .get() skips records
        # that have no 'lang' field instead of raising KeyError.
        if dataDict.get('lang') != 'en':
            continue

        # Prefer the full text of an extended Tweet; the plain 'text' field is
        # cut off if the Tweet is too long.
        if 'extended_tweet' in dataDict:
            text = dataDict["extended_tweet"]["full_text"]
        else:
            text = dataDict["text"]

        # Extract the runs of word characters (letters, digits, underscore),
        # then clean, lowercase and stem every token.
        wds = [snowball.stem(cleanWord(token).lower())
               for token in re.findall(r'\w+', text)]

        # Count the stems, then drop the stop words (including the empty
        # string that cleanWord produces for purely numeric tokens).
        wf = Counter(wds)
        for k in stop_words:
            wf.pop(k, None)

        if debug:
            print(wf)

        # - - - - - - - - - - - - -

        # determine Tweet classification
        # Each distinct stem counts once, however often it occurs in the Tweet.
        stems = set(wf)

        climateFlag = bool(stems & climateWords)
        disasterFlag = bool(stems & disastersWords)
        presidentOverride = bool(stems & presidentWords)

        # Positive score -> acceptance / positive sentiment,
        # negative score -> denial / negative sentiment, zero -> undecided.
        beliefCheck = len(stems & acceptWords) - len(stems & denyWords)
        sentCheck = len(stems & posSentWords) - len(stems & negSentWords)

        # - - - - - - - - - - - - -

        # record classification
        if climateFlag:
            climateChangeCount += 1

        if disasterFlag:
            naturalDisasterCount += 1

        if not presidentOverride:  # if not a President Trump related Tweet

            # for Climate Change belief check
            if climateFlag:
                if beliefCheck > 0:
                    changeAccept += 1
                elif beliefCheck < 0:
                    changeDeny += 1

            # for sentiment check
            if sentCheck > 0:
                posSentCount += 1
            elif sentCheck < 0:
                negSentCount += 1

        elif climateFlag:  # Trump and Climate Change specific
            changeAccept += 1
            negSentCount += 1

    # -----------------------------------------------------------------------------------------------------

    # print final calculations
    print("Number of Climate Change Tweets: " + str(climateChangeCount))
    print("Number of Natural Disaster Tweets: " + str(naturalDisasterCount))
    print("Number of Climate Change Acceptance Tweets: " + str(changeAccept))
    print("Number of Climate Change Denial Tweets: " + str(changeDeny))
    print("Number of Sentamental Tweets: " + str(posSentCount))
    print("Number of Non-sentamental Disaster Tweets: " + str(negSentCount))

#-------------------------------------------------------------------------------------------------------------------
# Run the analysis: call the TweetAnalyze ("main") function, which prints the summary counts
TweetAnalyze()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Counter({'craigabriel': 1, 'tell': 1, 'https': 1, 'co': 1, 'hdayk3gnzw': 1})
Counter({'plant': 2, 'scobi': 1, 'new': 1, 'zealand': 1, 'pm': 1, 'jacindaardern': 1, 'join': 1, 'treesthatcount': 1, 'million': 1, 'tree': 1, 'mitig': 1, 'climat': 1, 'chang': 1})
Counter({'extinctionr': 1, 'everyon': 1, 'understand': 1, 'actual': 1, 'need': 1, 'someth': 1, 'much': 1, 'urgent': 1, 'situat': 1, 'amp': 1, 'onboa': 1})
Counter({'indivisible410': 1, 'file': 1, 'thing': 1, 'gonna': 1, 'sign': 1, 'senatorcardin': 1, 'chrisvanhollen': 1, 'dem': 1, 'say': 1})
Counter({'senwhitehous': 1, 'chamberofcarbon': 1, 'oil': 1, 'compani': 1, 'like': 1, 'paid': 1, 'accomplic': 1, 'support': 1, 'broad': 1, 'swath': 1, 'corpora': 1})
Counter({'michaelpolanyi': 1, 'obes': 1, 'undernutrit': 1, 'global': 1, 'warm': 1, 'share': 1, 'common': 1, 'caus': 1, 'power': 1, 'commerci': 1, 'interest': 1, 'pro