In [1]:
import re
import pandas as pd
import geonamescache

gc = geonamescache.GeonamesCache()

In [2]:
# Read in the data
data = pd.read_csv("../Data/ufo_data.csv")

In [3]:
# make sure we got it
data.head()

Unnamed: 0,Date/Time,City,State,Shape,Duration,Summary,Posted
0,2/28/21 21:50,Monterrey (Mexico),,Changing,3 minutes,"I was putting my clothes to dry, when i notice...",3/2/21
1,2/21/21 12:00,Littlehampton (UK/England),,Rectangle,several sconds,"Fast moving silver objects, visible through Bi...",3/2/21
2,2/20/21 11:00,Sydney,,Cigar,2 minutes,2 craft seen looked initially like a satellite...,3/2/21
3,2/19/21 19:34,PATNA,,Circle,2 seconds,There was a circle object having 8 red lights ...,3/2/21
4,2/18/21 18:00,,,Circle,Photo,First photos sent back from perseverance on ma...,3/2/21


In [4]:
# filter out NA city
data = data[data["State"].notna()]

In [5]:
# prep state data for filtering
states = pd.DataFrame(info for state, info in gc.get_us_states().items())
states.head()


Unnamed: 0,code,fips,geonameid,name
0,AK,2,5879092,Alaska
1,AL,1,4829764,Alabama
2,AR,5,4099753,Arkansas
3,AZ,4,5551752,Arizona
4,CA,6,5332921,California


In [6]:
# prep city data for filtering
cities = pd.DataFrame(info for cityid, info in gc.get_cities().items() if info["countrycode"] == "US")
cities.head()

Unnamed: 0,admin1code,alternatenames,countrycode,geonameid,latitude,longitude,name,population,timezone
0,VA,[],US,4046704,38.73289,-77.05803,Fort Hunt,16045,America/New_York
1,AL,"[Besemer, Bessemer, bei se mo, bes'semara, bes...",US,4048023,33.40178,-86.95444,Bessemer,26730,America/Chicago
2,KY,"[PAH, Padaka, Padjuka, Paducah, Paduka, Pekin,...",US,4048662,37.08339,-88.60005,Paducah,24864,America/Chicago
3,AL,"[BHM, Bermincham, Bermingkham, Birmingam, Birm...",US,4049979,33.52066,-86.80249,Birmingham,212461,America/Chicago
4,AL,"[Senter Pojnt, Sentr-Pojnt, sentara po'inta, s...",US,4054378,33.64566,-86.6836,Center Point,16655,America/Chicago


In [7]:
# break into US states and non (such as canadian provinces)
us_data = data[data["State"].isin(states["code"])].reset_index()
non_us_data = data[~data["State"].isin(states["code"])].reset_index()

In [8]:
# check what city entries have non-word characters to find patterns
weird_cities = us_data[us_data["City"].str.contains("[^a-zA-Z ]", na=False)]
cities_notes = us_data[us_data["City"].str.contains("[a-zA-Z- ] \(", na=False)]
print("number of strangeish entries (non-alpha):")
print(weird_cities.shape[0])
print("number of entries in format <city> (<something>)")
print(cities_notes.shape[0])

number of strangeish entries (non-alpha):
4926
number of entries in format <city> (<something>)
2203


4,926 is quite a few. Over half of these are just using parenthases for notes, so we can clean those.

In [9]:
def rm_notes(x):
    x = str(x)
    regexpr = re.compile("[a-zA-Z- ] \(")
    if regexpr.search(x):
        return x[0:x.find(" (")]
    else:
        return x

In [10]:
us_data["City"] = us_data["City"].apply(rm_notes)

In [11]:
# test if we've reduced that a little
cities_notes = us_data[us_data["City"].str.contains("[a-zA-Z- ] \(", na=False)]
print(cities_notes.shape[0])

0


There are a few where the exact coords are given. We can save these for later.

In [12]:
us_data_direct_coords = us_data[us_data["City"].str.contains("[0-9]+\.[0-9]+\s+[0-9]+\.[0-9]+", na=False)]

In [13]:
us_data["city_lower"] = us_data["City"].str.lower()
cities["city_lower"] = cities["name"].str.lower()
cities["State"] = cities["admin1code"]

In [14]:
us_data_matched_cities = pd.merge(us_data, cities[["latitude", "longitude", "city_lower", "State"]], on=["city_lower", "State"], how="inner")
# us_data[us_data["City"].str.lower().isin(cities["name"].str.lower())]

In [15]:
print("got {} out of initial {}".format(us_data_matched_cities.shape[0], us_data.shape[0]))

got 49956 out of initial 87784


Let's take a look at what didn't match

In [16]:
us_data_unmatched = us_data[~us_data["City"].str.lower().isin(cities["name"].str.lower())]

A lot of these are cities not recognized by the city list. Let's use a more comprehensive city list.

In [17]:
newcities = pd.read_csv("../Data/uscities.csv")
newcities.columns.tolist()

['city',
 'city_ascii',
 'state_id',
 'state_name',
 'county_fips',
 'county_name',
 'lat',
 'lng',
 'population',
 'density',
 'source',
 'military',
 'incorporated',
 'timezone',
 'ranking',
 'zips',
 'id']

In [18]:
newcities["city_lower"] = newcities["city_ascii"].str.lower()
newcities["State"] = newcities["state_id"]

us_data_matched_cities = pd.merge(us_data, newcities[["lat", "lng", "city_lower", "State"]], on=["city_lower", "State"], how="inner")
#us_data[us_data["City"].str.lower().isin(newcities["city_ascii"].str.lower())]
print("got {} out of initial {}".format(us_data_matched_cities.shape[0], us_data.shape[0]))

got 71805 out of initial 87784


Let's see again what didn't match.

In [19]:
us_data_unmatched = us_data[~us_data["City"].str.lower().isin(newcities["city_ascii"].str.lower())]

A lot of these are places rather than cities, with a lot probably being unincorporated townships, etc. Quite a few would appear to be along the lines of 'New York City', which doesn't match because it's not 'New York'. Let's get whatever we can by removing 'city' from these.

In [20]:
def rm_city(x):
    x = str(x)
    if "city" in x.lower():
        return x[0:x.lower().find(" city")]
    else:
        return x

In [21]:
us_data_unmatched["City"] = us_data_unmatched["City"].apply(rm_city)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [22]:
us_additional_matched_cities = pd.merge(us_data_unmatched, newcities[["lat", "lng", "city_lower", "State"]], on=["city_lower", "State"], how="inner")
#us_data_unmatched[us_data_unmatched["City"].str.lower().isin(newcities["city_ascii"].str.lower())]

In [23]:
us_additional_matched_cities.head()

Unnamed: 0,index,Date/Time,City,State,Shape,Duration,Summary,Posted,city_lower,lat,lng


Huh, nothing. Oh well.

In [24]:
# drop useless columns and reset the index
us_data_matched_cities.drop(columns=["Summary", "index"], inplace=True)
us_data_matched_cities.reset_index(drop=True, inplace=True)


This would appear to be the best it gets without significantly more in-depth work, so I'll call it here.

In [25]:
us_data_matched_cities.to_csv("../Data/ufo_data_us_cleaned_v1.csv", index=False)

In [27]:
# Get the final unmatched ones
us_data_unmatched = pd.merge(us_data, newcities[["lat", "lng", "city_lower", "State"]], on=["city_lower", "State"], how="outer", indicator="source")
us_data_unmatched = us_data_unmatched[us_data_unmatched.source.eq('left_only')].drop('source', axis=1)