In [38]:
import pandas as pd
import json
import csv

USER_FIELDS_TO_INCLUDE = ('id_str', 'name', 'screen_name', 'location', 'description', 'followers_count', 
                          'friends_count', 'created_at', 'geo_enabled', 'verified', 'lang')
DOWNLOADED_FILENAME = 'downloaded_json_dataset/downloaded_training.json'
BASE_FILENAME = 'base_dataset/training.json'    # Original labelled dataset file to get tweet_city

base_tweets = dict()
with open(BASE_FILENAME) as f:
    for record in f:
        tweet_dict = json.loads(record.strip())
        base_tweets[tweet_dict['tweet_id']] = tweet_dict
        
base_tweets

{'375694836227383296': {'tweet_id': '375694836227383296',
  'user_id': '16886693',
  'user_city': 'city of london-enggla-gb',
  'user_city_latitude': '51.51279',
  'user_city_longitude': '-0.09184',
  'tweet_city': 'city of london-enggla-gb',
  'tweet_latitude': '51.54479793',
  'tweet_longitude': '-0.02286849'},
 '375694840652374016': {'tweet_id': '375694840652374016',
  'user_id': '42766623',
  'user_city': 'talcahuano-0681-cl',
  'user_city_latitude': '-36.71667',
  'user_city_longitude': '-73.11667',
  'tweet_city': 'talcahuano-0681-cl',
  'tweet_latitude': '-36.72015591',
  'tweet_longitude': '-73.10867071'},
 '375694840303845376': {'tweet_id': '375694840303845376',
  'user_id': '24326581',
  'user_city': 'charlotte-nc119-us',
  'user_city_latitude': '35.22709',
  'user_city_longitude': '-80.84313',
  'tweet_city': 'charlotte-nc119-us',
  'tweet_latitude': '35.23065763',
  'tweet_longitude': '-80.83622548'},
 '375694844662145024': {'tweet_id': '375694844662145024',
  'user_id': '6

In [35]:
num_written = 0
num_without_geo = 0

# Reformat dataset as csv
with open(DOWNLOADED_FILENAME) as fread, open(DOWNLOADED_FILENAME + '_reformatted.csv', 'w') as fwrite:
    entries = fread.readlines()
    num_total_records = len(entries) 
    
    for entry in entries:
        entry_dict = json.loads(entry)     
                
        # Skip record if no coordinates present
        if not entry_dict.get('geo'):
            num_without_geo += 1
            continue
        
        # Remove unnecessary fields and add relevant user fields as keys to the main dict
        user_dict = entry_dict.pop('user')
        user_dict = {'user_' + key: value for key, value in user_dict.items() if key in USER_FIELDS_TO_INCLUDE}
        entry_dict.update(user_dict)

        # Remove duplicate coordinates fields
        entry_dict.pop('coordinates')
        coordinates = entry_dict.pop('geo')

        entry_dict['coordinates_lat'] = coordinates['coordinates'][1]    # As per twitter api, first entry is longitude, second is latitude
        entry_dict['coordinates_long'] = coordinates['coordinates'][0]    # https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/geo-objects
        
        tweet_id = str(entry_dict['id'])
        entry_dict['tweet_city'] = base_tweets[tweet_id]['tweet_city']
        
        dictwriter = csv.DictWriter(fwrite, entry_dict.keys())
        
        if num_written == 0:
            dictwriter.writeheader()
        
        dictwriter.writerow(entry_dict)
        num_written += 1
        
        if num_written % 100 == 0:
            print('Processed {} records out of {}'.format(num_written, num_total_records))

print('Successfully wrote {} records, record without coordinates: {}'.format(num_written, num_without_geo))

Processed 100 records out of 78846
Processed 200 records out of 78846
Processed 300 records out of 78846
Processed 400 records out of 78846
Processed 500 records out of 78846
Processed 600 records out of 78846
Processed 700 records out of 78846
Processed 800 records out of 78846
Processed 900 records out of 78846
Processed 1000 records out of 78846
Processed 1100 records out of 78846
Processed 1200 records out of 78846
Processed 1300 records out of 78846
Processed 1400 records out of 78846
Processed 1500 records out of 78846
Processed 1600 records out of 78846
Processed 1700 records out of 78846
Processed 1800 records out of 78846
Processed 1900 records out of 78846
Processed 2000 records out of 78846
Processed 2100 records out of 78846
Processed 2200 records out of 78846
Processed 2300 records out of 78846
Processed 2400 records out of 78846
Processed 2500 records out of 78846
Processed 2600 records out of 78846
Processed 2700 records out of 78846
Processed 2800 records out of 78846
P

In [40]:
# Verify formatting

# tweets_df = pd.read_csv('formatted_csv_dataset/formatted_training.csv', sep=',')
# incorrect_tweets = tweets_df[tweets_df.apply(lambda t: t['tweet_city'] != base_tweets[str(t['id'])]['tweet_city'], axis=1)]

# incorrect_tweets

tweets_df[['id', 'tweet_city']]

Unnamed: 0,id,tweet_city
0,375695343696236544,moron-01-ar
1,375695570423513088,denpasar-02-id
2,375695876532228097,aydin-09-tr
3,375694979123146752,crawley-engp6-gb
4,375696291793477633,fayetteville-nc051-us
5,375696300005945344,coventry-engc7-gb
6,375695985529618433,johannesburg-06jhb-za
7,375695813483458560,frejus-b883-fr
8,375696010678665216,jamaica-ny081-us
9,375695876506660864,chicago-il031-us
