In [15]:
import json
import os
import re
from abc import ABC, abstractmethod
from collections.abc import Iterable

import pandas as pd
import numpy as np
import geopandas as gp
import fiona
from shapely.geometry import Polygon, MultiPolygon, Point, mapping
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import Utils
from Constants import Constants

In [16]:
class TwtConstants():
    """Settings for the annotated-tweet preprocessing in this notebook."""

    # Parquet file with the annotated phrases (whatever file was delivered).
    annotated_file = 'source_data/sah3_mf_all.parquet'
    # Columns kept from the annotated file. The attribute is called "rows"
    # for backward compatibility, but these are column names.
    annotated_file_rows = [
        'tweet_id',
        'user_id',
        'screen_name',
        'time',
        'text', #full text
        'Case',
        'Code',
        'ATTITUDE',
        'QUALITY',
        'retweet_count',
        'place'
    ]

    # Old Twitter API settings. Credentials must never live in the notebook:
    # they are read from environment variables and default to an empty string
    # when unset. NOTE(review): the key/token values that were previously
    # hardcoded here have been exposed and should be revoked.
    twitter_api_secret_key = os.environ.get('TWITTER_API_SECRET_KEY', '')
    twitter_api_key = os.environ.get('TWITTER_API_KEY', '')
    twitter_access_token = os.environ.get('TWITTER_ACCESS_TOKEN', '')
    twitter_access_token_secret = os.environ.get('TWITTER_ACCESS_TOKEN_SECRET', '')

    # Batch size for status lookups (presumably kept just under the API's
    # per-request limit -- TODO confirm).
    status_lookup_batch_size = 99

In [17]:
class TweetLocator():
    """Finds the county (GEOID) whose border best overlaps a tweet's polygon."""

    def __init__(self, border_file = Constants.static_county_data_output_file):
        # DataFrame with a GEOID column and a features column of shapely
        # shapes, or None when the border file could not be loaded.
        self.borders = self.get_borders(border_file)

    def get_borders(self, border_json):
        """Load county borders from a JSON file.

        The file is read into a DataFrame, transposed so each row is one
        GEOID, and the 'features' entry of each row is converted to a
        shapely shape.

        Returns a DataFrame with columns GEOID and features. Loading is
        best-effort: on any error the exception is printed and None is
        returned.
        """
        try:
            with open(border_json,'r') as f:
                border_dict = json.load(f)
            borders = pd.DataFrame(border_dict).T
            borders.index.name = "GEOID"
            shapes = borders.features.apply(self.geojson_to_shapely)
            return shapes.reset_index()
        except Exception as e:
            print(e)
            return None

    def geojson_to_shapely(self, features):
        """Convert a dict with 'type' and 'coordinates' into a shapely shape.

        'Polygon' becomes a Polygon; every other type is treated as a list
        of polygons and becomes a MultiPolygon. Only the first ring of each
        polygon is used, so interior rings (holes), if present, are ignored.
        """
        coords = features['coordinates']
        if features['type'] == 'Polygon':
            return Polygon(np.array(coords[0]))
        parts = [Polygon(np.array(c[0])) for c in coords]
        return MultiPolygon(parts)

    def county_from_polygon(self, polygon, minimum_overlap = .4):
        """Return the GEOID of the county covering most of `polygon`.

        The overlap of a county is the intersection area divided by the
        polygon's own area. Returns 0 when the polygon is None, no borders
        are loaded, the polygon has no area, no county intersects it, or the
        best overlap is below `minimum_overlap`. Any other error is printed
        and also yields 0.
        """
        # Without a polygon or borders there is nothing to match against.
        if polygon is None or self.borders is None:
            return 0
        try:
            polygon_area = polygon.area
            # A degenerate (zero-area) box would divide by zero below.
            if not polygon_area > 0:
                return 0
            county_shapes = self.borders.features
            touching = county_shapes[county_shapes.apply(lambda shape: shape.intersects(polygon))]
            if touching.shape[0] <= 0:
                return 0
            overlap = touching.apply(lambda shape: polygon.intersection(shape).area/polygon_area)
            if overlap.max() < minimum_overlap:
                return 0
            best_match = self.borders.loc[overlap.idxmax()]
            return int(best_match.GEOID)
        except Exception as e:
            print(e)
            return 0

In [14]:
def clean_cases(case_text):
    """Reduce a case label such as "case #xx" to just the number "xx".

    The first run of digits is returned as a string. When there are no
    digits, a message is printed and the string '0' is returned, so the
    result is always a str and the resulting column never mixes str and int
    values. Non-string input is converted with str() first.
    """
    found = re.search('[0-9]+', str(case_text))
    if found is None:
        print('bad case number', case_text)
        return '0'
    return found.group(0)

def place_coords(p):
    """Parse the bounding-box coordinates out of a tweet's 'place' string.

    Probably specific to the encoding used in the delivered data: the string
    looks like a Python dict repr, so single quotes are swapped for double
    quotes before it is parsed as JSON. If that fails, a regex pulls out
    whatever follows 'coordinates': and parses that instead.

    Returns the coordinates with single-element list wrappers removed, or
    None (after printing a message) when neither approach works.
    """
    place = None
    try:
        # Swap the quote characters, then parse as JSON.
        text = p.replace("'", '"')
        # These two look for a double quote followed by a literal backslash
        # and a brace; spelled with explicit escapes, same values as before.
        text = text.replace('"\\{', "'\\{")
        text = text.replace('"\\}', "'\\}")
        place = json.loads(text)['bounding_box']['coordinates']
    except Exception:
        try:
            # Fallback: take the text after 'coordinates': up to the last "},".
            coords = re.match(r".*'coordinates'\:\s*(.*)\}\s*\,", p).group(1)
            place = json.loads(coords)
        except Exception:
            print('lol')
            return None
    # Unwrap nested single-element lists; the type check stops on anything
    # that is not a list (a one-character string would loop forever).
    while isinstance(place, list) and len(place) == 1:
        place = place[0]
    return place

def read_annotated_tweets():
    """Load the annotated phrases and derive the basic per-phrase columns.

    Reads the parquet file named in TwtConstants, prints its columns, keeps
    only the configured columns and adds:
      case_id  - number taken from the Case text
      is_vivid - 1 where QUALITY is 'V', else 0
      for_sah  - 1 where ATTITUDE is 1, else 0
    Case, ATTITUDE and QUALITY are then dropped and the time column is
    passed through Utils.format_datetime_series.
    """
    raw = pd.read_parquet(TwtConstants.annotated_file)
    print(raw.columns)
    selected = raw.loc[:, TwtConstants.annotated_file_rows]
    selected = selected.assign(
        case_id=selected.Case.apply(clean_cases),
        is_vivid=(selected.QUALITY == 'V').astype('int'),
        for_sah=(selected.ATTITUDE == 1).astype('int'),
    )
    selected = selected.drop(columns=['Case', 'ATTITUDE', 'QUALITY'])
    selected['time'] = Utils.format_datetime_series(selected['time'])
    return selected

def extract_tweet_coords(annotated_tweets):
    """Turn each 'place' string into a shapely Polygon of its bounding box.

    The polygons are stored in a new 'geometry' column on the frame that was
    passed in; the returned frame is the same data without 'place'.
    """
    bounding_boxes = annotated_tweets.place.apply(place_coords)
    annotated_tweets.loc[:, 'geometry'] = bounding_boxes.apply(Polygon)
    return annotated_tweets.drop(columns='place')
    
def get_tweet_county_geoids(annotated_tweets):
    """Add a 'GEOID' column with the county matched to each row's geometry.

    Rows that could not be matched get 0. Prints the number of rows and the
    number of distinct case_ids that received a nonzero GEOID, then returns
    the same frame.
    """
    locator = TweetLocator()
    geoids = annotated_tweets.geometry.apply(locator.county_from_polygon)
    annotated_tweets.loc[:, 'GEOID'] = geoids
    located = annotated_tweets[annotated_tweets.GEOID != 0]
    total_rows = annotated_tweets.shape[0]
    located_cases = len(np.unique(located.case_id))
    print(total_rows, located_cases)
    return annotated_tweets

def phrases_to_tweets(annotated_tweets):
    """Collapse the per-phrase rows into one row per case_id.

    Every distinct Code in the input becomes a 0/1 column that says whether
    the case has at least one phrase with that code. All other columns are
    taken from the first row of the case; a column that differs within a
    case is printed as a quality check. The result is indexed by case_id
    and has no geometry or tweet_id column.
    """
    all_codes = list(np.unique(annotated_tweets.Code))
    phrases = annotated_tweets.drop('geometry', axis=1)
    records = []
    for case_id, group in phrases.groupby('case_id'):
        shared = group.drop(['Code'], axis=1)
        # Quality check: apart from Code, a case should have identical values.
        for column in shared.columns:
            if len(set(shared[column].values)) > 1:
                print(column, group[column])
        case_codes = list(np.unique(group.Code))
        record = shared.iloc[0].to_dict()
        record['case_id'] = case_id
        # One boolean (0/1) flag per code seen anywhere in the input.
        record.update({code: int(code in case_codes) for code in all_codes})
        records.append(record)

    per_case = pd.DataFrame(records).set_index('case_id')
    return per_case.drop(['tweet_id'], axis=1)

def seperate_date(annotated_df):
    """Split the 'time' column into 'month', 'day' and 'year' columns.

    The three columns are added to the frame that was passed in; the
    returned frame is the same data without 'time'.
    """
    for part in ('month', 'day', 'year'):
        annotated_df.loc[:, part] = annotated_df.time.apply(
            lambda stamp, part=part: getattr(pd.to_datetime(stamp), part)
        )
    return annotated_df.drop(columns=['time'])

def get_tweet_sentiments(tweet_string,analyzer):
    """Return the analyzer's compound sentiment score for a single tweet.

    Links are replaced by the word 'url' and every hashtag by 'hashtag '
    before the text is handed to analyzer.polarity_scores.
    """
    cleaned = tweet_string
    for pattern, replacement in ((r'http\S+', 'url'), (r'#(\w+)', 'hashtag ')):
        cleaned = re.sub(pattern, replacement, cleaned)
    return analyzer.polarity_scores(cleaned)['compound']

def add_sentiment(tweet_df):
    """Add a 'sentiment_score' column with a simple VADER score per tweet.

    One analyzer is created and reused for every row of the 'text' column.
    The column is added to the frame that was passed in, which is returned.
    """
    vader = SentimentIntensityAnalyzer()
    tweet_df.loc[:, 'sentiment_score'] = tweet_df.text.apply(get_tweet_sentiments, args=(vader,))
    return tweet_df

# Pipeline: turn the annotated phrases into one row per tweet, with its county
# GEOID, 0/1 code flags, date parts and sentiment score.
# Tweets whose county could not be found are kept, with a GEOID of 0.

phrases = (
    read_annotated_tweets()
    .pipe(extract_tweet_coords)
    .pipe(get_tweet_county_geoids)
)
tweets = (
    phrases
    .pipe(phrases_to_tweets)
    .pipe(seperate_date)
    .pipe(add_sentiment)
)
tweets.T

Index(['Unnamed: 0', 't_type', 'tweet_id', 'user_id', 'screen_name', 'time',
       'text', 'is_reply', 'reply_to_status_id', 'reply_to_user_id',
       'is_quote', 'quoted_tweet_id', 'quoted_status',
       'quoted_status_permalink', 'retweet_count', 'favorite_count', 'lang',
       'place', 'url_attached', 'link', 'SAH_attitude', 'day', 'month',
       'USER_ID', 'Case', 'Code', 'Text', 'Unnamed: 5', 'Unnamed: 6',
       'RELEVANCE', 'ATTITUDE', 'QUALITY', 'date'],
      dtype='object')
2427 1483


case_id,1000,1001,1003,1009,1011,1016,102,1021,1025,103,...,971,973,975,977,981,987,989,994,997,999
Authority,1,0,1,1,0,0,0,0,0,0,...,1,0,0,0,1,1,0,0,0,0
Betrayal,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Care,0,1,1,0,1,0,1,1,1,1,...,0,1,1,0,0,0,0,0,1,1
Degradation,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Fairness,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
Freedom,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GEOID,24033,0,36007,53053,36047,12099,44007,6075,25015,0,...,48029,6001,4013,0,29510,53033,17201,37183,6061,6025
Harm,0,0,0,0,0,1,0,0,0,1,...,0,0,1,1,0,0,0,0,0,0
Injustice,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Loyalty,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0


In [9]:
# Write the per-tweet table to the CSV path configured in Constants.
tweets.to_csv(Constants.tweet_output_file)

In [10]:
# Read the CSV back and display it transposed, as a check on what was written.
pd.read_csv(Constants.tweet_output_file).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1795,1796,1797,1798,1799,1800,1801,1802,1803,1804
case_id,1000,1001,1003,1009,1011,1016,102,1021,1025,103,...,971,973,975,977,981,987,989,994,997,999
Authority,1,0,1,1,0,0,0,0,0,0,...,1,0,0,0,1,1,0,0,0,0
Betrayal,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Care,0,1,1,0,1,0,1,1,1,1,...,0,1,1,0,0,0,0,0,1,1
Degradation,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Fairness,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
Freedom,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GEOID,24033,0,36007,53053,36047,12099,44007,6075,25015,0,...,48029,6001,4013,0,29510,53033,17201,37183,6061,6025
Harm,0,0,0,0,0,1,0,0,0,1,...,0,0,1,1,0,0,0,0,0,0
Injustice,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
