# Importing Modules

In [1]:
import os
import json
import time
import pickle
import tweepy as tw
import pandas as pd
import reverse_geocoder as rg

# Credentials

In [2]:
# Load the Twitter API credentials from a JSON file so that no secret is typed
# into the notebook itself.
with open("/src/twitter_credentials.json", mode='r') as credentials_file:
    credentials = json.loads(credentials_file.read())

In [3]:
# Build the OAuth handler from the consumer key/secret pair read from the
# credentials file, then create the API client that every later cell uses.
auth = tw.OAuthHandler(
    consumer_key=credentials['consumer_key'],
    consumer_secret=credentials['consumer_secret'],
)
api = tw.API(auth)

# Unique Geo Ids

In [2]:
# `path` holds the daily tweet CSV exports; `db_save_path` is where later cells
# store one pickled Place per geo id.
path = "/data/daily_world_en_csv"
db_save_path = "/data/unique_geo_ids"

# os.listdir returns every directory entry in arbitrary order. Keep only CSV
# files (a stray checkpoint folder or hidden file would make pd.read_csv fail)
# and sort them so that unique_ids, and therefore unique_ids[0], is the same on
# every run.
list_of_csvs = sorted(
    entry for entry in os.listdir(path) if entry.lower().endswith(".csv")
)
all_df = pd.concat([pd.read_csv(os.path.join(path, entry)) for entry in list_of_csvs])

# Distinct place ids, in order of first appearance across the concatenated files.
unique_ids = all_df['geo_place_id'].unique()

In [103]:
# Number of distinct geo_place_id values found across all of the daily CSVs.
unique_ids.shape

(16725,)

# Fetching Data

In [104]:
# Waiting times in seconds used when an API call fails: the squares of
# 2, 4, ..., 16, i.e. [4, 16, 36, 64, 100, 144, 196, 256].
time_delay = [(2 * step) ** 2 for step in range(1, 9)]

# Fetch one place as a sample (an example of a removed place id: '07d9d27e7d480000').
geo_obj = api.geo_id(unique_ids[0])

In [105]:
# Display the repr of the sample Place to see which fields the API returns.
geo_obj

Place(_api=<tweepy.api.API object at 0x000001FAED127700>, id='3b98b02fba3f9753', name='North Carolina', full_name='North Carolina, USA', country='United States', country_code='US', url='https://api.twitter.com/1.1/geo/id/3b98b02fba3f9753.json', place_type='admin', attributes={'162772:state_id': '37', '567718:targetable': '1', '189390:id': 'north-carolina', '162813:id': 'NC', 'geotagCount': '4002'}, bounding_box=BoundingBox(_api=<tweepy.api.API object at 0x000001FAED127700>, type='Polygon', coordinates=[[[-84.3219475, 33.752879], [-84.3219475, 36.588118], [-75.40012, 36.588118], [-75.40012, 33.752879], [-84.3219475, 33.752879]]]), centroid=[-78.57673379465703, 35.1704985], contained_within=[Place(_api=<tweepy.api.API object at 0x000001FAED127700>, id='96683cc9126741d1', name='United States', full_name='United States', country='United States', country_code='US', url='https://api.twitter.com/1.1/geo/id/96683cc9126741d1.json', place_type='country', attributes={}, bounding_box=BoundingBox(_

In [148]:
# Header of the exported table. The last two names line up with the two
# centroid values, which are appended to each row in that order.
columns = [
    'id',
    'name',
    'state',
    'place_type',
    'geo_tag_count',
    'longitude',
    'latitude',
]

In [107]:
# Preview, for the sample place, each field that feeds the exported table.
print(geo_obj.name)
# Text after the last comma of full_name (the whole string when there is no comma).
print(geo_obj.full_name.rpartition(",")[2].strip())
print(geo_obj.place_type)
print(geo_obj.attributes['geotagCount'])
# The centroid is a two-element list of coordinates.
print(geo_obj.centroid)

North Carolina
USA
admin
4002
[-78.57673379465703, 35.1704985]


In [None]:
# Download every place once and pickle it to "<db_save_path>/<geoID>.db".
#
# * Ids whose file already exists are skipped, so an interrupted run can simply
#   be re-executed (delete a file to force that id to be fetched again).
# * Only `Exception` is caught: a bare `except:` would also swallow
#   KeyboardInterrupt and make the cell impossible to stop during an API call.
# * An id that keeps failing is given up after `max_attempts` tries and recorded
#   in `failed_ids`, instead of being retried forever.
sleep_idx = 0
length_of_td = len(time_delay)
max_attempts = 2 * length_of_td  # two full passes over the waiting schedule
failed_ids = []

os.makedirs(db_save_path, exist_ok=True)

for geoID in unique_ids:
    db_file = f"{db_save_path}/{geoID}.db"
    if os.path.exists(db_file):
        continue

    place = None
    for attempt in range(1, max_attempts + 1):
        try:
            place = api.geo_id(geoID)
            sleep_idx = 0
            break
        except Exception as err:
            # Show what went wrong; the exception class differs between
            # rate limiting, missing ids and network problems.
            print(f"{geoID}: {type(err).__name__}: {err}")
            if attempt == max_attempts:
                break  # no point in sleeping after the final attempt
            print(f"sleep {time_delay[sleep_idx]} seconds")
            time.sleep(time_delay[sleep_idx])
            sleep_idx = (sleep_idx + 1) % length_of_td

    if place is None:
        failed_ids.append(geoID)
        sleep_idx = 0  # start the next id from the shortest wait
        continue

    with open(db_file, 'wb') as f:
        pickle.dump(place, f)

if failed_ids:
    print(f"gave up on {len(failed_ids)} ids: {failed_ids}")

### Inspecting each place_type to determine how to get the state

In [5]:
# Names of the pickled Place files, in sorted order.
saved_dbs = sorted(os.listdir(db_save_path))

In [37]:
# Collect one sample Place per place_type: an entry is stored only while the
# type has no (truthy) sample yet, so the earliest file in saved_dbs wins.
# NOTE: pickle.load can execute arbitrary code; only read files this notebook wrote.
place_types_and_place_objs = {}

for db_name in saved_dbs:
    with open(f"{db_save_path}/{db_name}", 'rb') as db_file:
        place_obj = pickle.load(db_file)
    kind = place_obj.place_type
    already_seen = place_types_and_place_objs.get(kind, False)
    if already_seen:
        continue
    place_types_and_place_objs[kind] = place_obj

In [39]:
# Print the sample Place of every place_type, each followed by an empty line.
for sample_place in place_types_and_place_objs.values():
    print(sample_place, end="\n\n")

Place(id='0000321b41466bc8', name='North Attleboro', full_name='North Attleboro, MA', country='United States', country_code='US', url='https://api.twitter.com/1.1/geo/id/0000321b41466bc8.json', place_type='city', attributes={'geotagCount': '41'}, bounding_box=BoundingBox(type='Polygon', coordinates=[[[-71.381728, 41.914734], [-71.381728, 42.0149913], [-71.268541, 42.0149913], [-71.268541, 41.914734], [-71.381728, 41.914734]]]), centroid=[-71.34153953038711, 41.96486265], contained_within=[Place(id='849efb7c9922523c', name='PROVIDENCE-NEW BEDFORD', full_name='PROVIDENCE-NEW BEDFORD', country='', country_code='', url='https://api.twitter.com/1.1/geo/id/849efb7c9922523c.json', place_type='admin', attributes={}, bounding_box=BoundingBox(type='Polygon', coordinates=[[[-71.895115, 41.146493], [-71.895115, 42.095723], [-70.815842, 42.095723], [-70.815842, 41.146493], [-71.895115, 41.146493]]]), centroid=[-71.60288232526187, 41.699487000000005])], polylines=[], geometry=None)

Place(id='0180a2

In [143]:
# Print every 'admin' place that has no truthy '162813:id' attribute (in the
# sample seen earlier that attribute appears to hold the state code, e.g. 'NC').
for db_name in saved_dbs:
    with open(f"{db_save_path}/{db_name}", 'rb') as db_file:
        place_obj = pickle.load(db_file)
    # Skip anything that is not an admin place, or that does carry the attribute.
    if place_obj.place_type != "admin" or place_obj.attributes.get('162813:id', False):
        continue
    print(place_obj)
    print()

Place(id='27485069891a7938', name='New York', full_name='New York, NY', country='United States', country_code='US', url='https://api.twitter.com/1.1/geo/id/27485069891a7938.json', place_type='admin', attributes={'189390:id': 'new-york-ny', '162772:pop100': '8008278', 'geotagCount': '924', '162772:place_id': '3651000'}, bounding_box=BoundingBox(type='Polygon', coordinates=[[[-74.255641, 40.495865], [-74.255641, 40.91533], [-73.699793, 40.91533], [-73.699793, 40.495865], [-74.255641, 40.495865]]]), centroid=[-73.86770300366848, 40.685776000000004], contained_within=[Place(id='94965b2c45386f87', name='New York', full_name='New York, USA', country='United States', country_code='US', url='https://api.twitter.com/1.1/geo/id/94965b2c45386f87.json', place_type='admin', attributes={}, bounding_box=BoundingBox(type='Polygon', coordinates=[[[-79.76259, 40.477383], [-79.76259, 45.015851], [-71.777492, 45.015851], [-71.777492, 40.477383], [-79.76259, 40.477383]]]), centroid=[-76.2146236146748, 42.7

In [83]:
# Reverse-geocode the sample 'poi' place. The centroid is reversed before the
# lookup; the result below reports lat/lon in the swapped order.
centroid = place_types_and_place_objs['poi'].centroid # replace with acronym
# The ['admin1'] subscript was applied to the coordinate list, which raises
# TypeError (list indices must be integers). It belongs on a result dict:
# rg.search(centroid[::-1])[0]['admin1'] gives the state name alone.
rg.search(centroid[::-1])

[{'lat': '40.74482',
  'lon': '-73.94875',
  'name': 'Long Island City',
  'admin1': 'New York',
  'admin2': 'Queens County',
  'cc': 'US'}]

In [100]:
# Same lookup, starting from a literal centroid.
centroid = [-73.96229110893742, 40.70995790682886]

# The pair is passed in reverse order; mode is left at its default (2).
results = rg.search(list(reversed(centroid)))

results

[{'lat': '40.74482',
  'lon': '-73.94875',
  'name': 'Long Island City',
  'admin1': 'New York',
  'admin2': 'Queens County',
  'cc': 'US'}]

# Getting Places and Handling States

In [161]:
# Lookup from a full state/territory name to its two-letter code. Besides the
# states it contains several territories, and the federal district under two
# spellings ('District of Columbia' and 'Washington, D.C.').
us_state_abbrev = dict([
    ('Alabama', 'AL'), ('Alaska', 'AK'), ('American Samoa', 'AS'),
    ('Arizona', 'AZ'), ('Arkansas', 'AR'), ('California', 'CA'),
    ('Colorado', 'CO'), ('Connecticut', 'CT'), ('Delaware', 'DE'),
    ('District of Columbia', 'DC'), ('Florida', 'FL'), ('Georgia', 'GA'),
    ('Guam', 'GU'), ('Hawaii', 'HI'), ('Idaho', 'ID'),
    ('Illinois', 'IL'), ('Indiana', 'IN'), ('Iowa', 'IA'),
    ('Kansas', 'KS'), ('Kentucky', 'KY'), ('Louisiana', 'LA'),
    ('Maine', 'ME'), ('Maryland', 'MD'), ('Massachusetts', 'MA'),
    ('Michigan', 'MI'), ('Minnesota', 'MN'), ('Mississippi', 'MS'),
    ('Missouri', 'MO'), ('Montana', 'MT'), ('Nebraska', 'NE'),
    ('Nevada', 'NV'), ('New Hampshire', 'NH'), ('New Jersey', 'NJ'),
    ('New Mexico', 'NM'), ('New York', 'NY'), ('North Carolina', 'NC'),
    ('North Dakota', 'ND'), ('Northern Mariana Islands', 'MP'), ('Ohio', 'OH'),
    ('Oklahoma', 'OK'), ('Oregon', 'OR'), ('Pennsylvania', 'PA'),
    ('Puerto Rico', 'PR'), ('Rhode Island', 'RI'), ('South Carolina', 'SC'),
    ('South Dakota', 'SD'), ('Tennessee', 'TN'), ('Texas', 'TX'),
    ('Utah', 'UT'), ('Vermont', 'VT'), ('Virgin Islands', 'VI'),
    ('Virginia', 'VA'), ('Washington', 'WA'), ('Washington, D.C.', 'DC'),
    ('West Virginia', 'WV'), ('Wisconsin', 'WI'), ('Wyoming', 'WY'),
])

In [144]:
# Accumulator for the output rows; a later cell appends one list per place to it.
location_array = []

In [145]:
def get_state(place_obj):
    """Return a short region code for a Twitter ``Place`` object.

    The rule depends on ``place_obj.place_type``:

    * ``city``: the text after the last comma of ``full_name``.
    * ``neighborhood``: the last two characters of the first parent's
      ``full_name``; ``""`` when there is no parent.
    * ``admin``: ``us_state_abbrev[place_obj.name]``.
    * ``poi``: the centroid is reverse-geocoded; the ``admin1`` name is mapped
      through ``us_state_abbrev``, and when it is not a known name the result's
      ``cc`` value is printed and returned instead. ``""`` when the place has
      no centroid.
    * ``country`` and any other type: ``""``.

    A place whose ``full_name`` is ``"[Place name removed]"`` always yields ``""``.

    Raises:
        KeyError: for an ``admin`` place whose name is not in ``us_state_abbrev``.
    """
    if place_obj.full_name == "[Place name removed]":
        return ""

    aplace_type = place_obj.place_type

    if aplace_type == 'city':
        return place_obj.full_name.split(",")[-1].strip()

    if aplace_type == 'neighborhood':
        # An empty or missing parent list would otherwise raise on [0].
        parents = getattr(place_obj, 'contained_within', None)
        if not parents:
            return ""
        return parents[0].full_name[-2:]

    if aplace_type == 'admin':
        return us_state_abbrev[place_obj.name]

    if aplace_type == 'poi':
        # The row-building cell already treats `centroid` as optional; do the same here.
        centroid = getattr(place_obj, 'centroid', None)
        if not centroid:
            return ""
        search_result = rg.search(centroid[::-1])[0]
        # Plain dictionary lookups replace the former catch-all `except Exception`,
        # so unrelated errors are no longer hidden.
        state = us_state_abbrev.get(search_result.get('admin1'))
        if state is not None:
            return state
        # NOTE(review): the fallback value can collide with a state code; the
        # recorded run prints 'CA' here, which is also the value stored for
        # 'California' in us_state_abbrev. Confirm this is acceptable downstream.
        print(search_result['cc'])
        return search_result['cc']

    # 'country' and unrecognised place types carry no state.
    return ""

In [146]:
# Build one output row per pickled Place. The list is re-created here so that
# running this cell again does not append a second copy of every row.
location_array = []

# Only the "<geoID>.db" files written by the download step, in sorted order so
# that the exported rows come out in the same order on every run.
list_of_dbs = sorted(
    entry for entry in os.listdir(db_save_path) if entry.endswith(".db")
)

for name in list_of_dbs:
    # NOTE: pickle.load can execute arbitrary code; only read files this notebook wrote.
    with open(f"{db_save_path}/{name}", 'rb') as f:
        place = pickle.load(f)

    state = get_state(place)

    # Fall back to two blanks when the centroid is missing or None, so the
    # row always has a value for both coordinate columns.
    centroid = getattr(place, 'centroid', None) or ['', '']

    location_array.append([place.id,
                           place.name,
                           state,
                           place.place_type,
                           place.attributes.get('geotagCount', ''),
                           *centroid])

PR
PR
CA
PR
GU
PR
PR
PR
PR
GU
PR
GU
MX
CA


In [210]:
# Turn the collected rows into a table and write it out without the index column.
# NOTE(review): this path is relative to the working directory, while the input
# directories used elsewhere in the notebook are absolute ("/data/..."); confirm
# that this is intended.
locations_df = pd.DataFrame(data=location_array, columns=columns)
locations_df.to_csv("data/unique_geo_ids_with_states.csv", index=False)