# You can find the dataset [here](https://www.kaggle.com/NUFORC/ufo-sightings) (scrubbed one)


In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt       
%matplotlib inline 
import seaborn as sns
sns.set_style('whitegrid')
import warnings
warnings.filterwarnings('ignore')
import json
import datetime

In [None]:
# Load the scrubbed NUFORC export, silently skipping malformed rows.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines="skip"` is the supported replacement.
df = pd.read_csv('/content/drive/My Drive/ufo sightings/scrubbed.csv', on_bad_lines="skip")
df.head()

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,4/27/2004,29.8830556,-97.941111
1,10/10/1949 21:00,lackland afb,tx,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,12/16/2005,29.38421,-98.581082
2,10/10/1955 17:00,chester (uk/england),,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,1/21/2008,53.2,-2.916667
3,10/10/1956 21:00,edna,tx,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.9783333,-96.645833
4,10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.4180556,-157.803611


In [None]:
print(df.shape)

(80332, 11)


**Almost all the columns have the 'object' datatype, which is expected for the text fields. However, it is surprising to see that "latitude" is also object. Like longitude, it should be float.**

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80332 entries, 0 to 80331
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   datetime              80332 non-null  object 
 1   city                  80332 non-null  object 
 2   state                 74535 non-null  object 
 3   country               70662 non-null  object 
 4   shape                 78400 non-null  object 
 5   duration (seconds)    80332 non-null  object 
 6   duration (hours/min)  80332 non-null  object 
 7   comments              80317 non-null  object 
 8   date posted           80332 non-null  object 
 9   latitude              80332 non-null  object 
 10  longitude             80332 non-null  float64
dtypes: float64(1), object(10)
memory usage: 6.7+ MB


**Quite a few missing values as well.**

In [None]:
df.isnull().sum()

datetime                   0
city                       0
state                   5797
country                 9670
shape                   1932
duration (seconds)         0
duration (hours/min)       0
comments                  15
date posted                0
latitude                   0
longitude                  0
dtype: int64

**No duplicate records.**

In [None]:
df[df.duplicated()]

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude


**Lets dig in:**
- The largest number of sightings took place on the 4th of July, 2010
- Most sightings are in the U.S., with California the most common state
- The most common duration is 5 minutes
- Shape "light" is most commonly reported 

In [None]:
df.describe(include="all")

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude
count,80332,80332,74535,70662,78400,80332.0,80332,80317,80332,80332.0,80332.0
unique,69586,19900,67,5,29,706.0,8349,79997,317,23312.0,
top,7/4/2010 22:00,seattle,ca,us,light,300.0,5 minutes,Fireball,12/12/2009,47.6063889,
freq,36,525,9655,65114,16565,7070.0,4716,11,1510,481.0,
mean,,,,,,,,,,,-86.772885
std,,,,,,,,,,,39.697205
min,,,,,,,,,,,-176.658056
25%,,,,,,,,,,,-112.073333
50%,,,,,,,,,,,-87.903611
75%,,,,,,,,,,,-78.755


**Dropping duration (hours/min) since we have duration in seconds and it has only two null values. Also dropping comments.**

In [None]:
# Remove the textual duration and the comments columns from the frame.
df.drop(columns=["duration (hours/min)", "comments"], inplace=True)

**On investigating the data, it was found that some values are "?" and "??". We'll replace them with NaNs.**

In [None]:
# Count, per column, how many cells are exactly "?" and then exactly "??".
placeholder_reports = [("looking for ?", "?"), ("\nlooking for ??", "??")]

for position, (banner, token) in enumerate(placeholder_reports):
    if position:
        # Separator between the two reports.
        print("###############################")
    print(banner)
    for column_name in df.columns:
        print(column_name, ":", sum(df[column_name] == token))

looking for ?
datetime : 0
city : 1
state : 0
country : 0
shape : 0
duration (seconds) : 0
date posted : 0
latitude : 0
longitude  : 0
###############################

looking for ??
datetime : 0
city : 2
state : 0
country : 0
shape : 0
duration (seconds) : 0
date posted : 0
latitude : 0
longitude  : 0


In [None]:
# Turn both placeholder tokens into proper missing values in one pass.
df.replace(['?', '??'], np.nan, inplace=True)

**Checking Null values again:**

In [None]:
df.isnull().sum()

datetime                 0
city                     3
state                 5797
country               9670
shape                 1932
duration (seconds)       0
date posted              0
latitude                 0
longitude                0
dtype: int64

**Making index as id column. May help in identifying records later.**


In [None]:
# Promote the current row index to a regular column named "id".
df = df.reset_index().rename(columns={"index": "id"})
df.head()

Unnamed: 0,id,datetime,city,state,country,shape,duration (seconds),date posted,latitude,longitude
0,0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,4/27/2004,29.8830556,-97.941111
1,1,10/10/1949 21:00,lackland afb,tx,,light,7200,12/16/2005,29.38421,-98.581082
2,2,10/10/1955 17:00,chester (uk/england),,gb,circle,20,1/21/2008,53.2,-2.916667
3,3,10/10/1956 21:00,edna,tx,us,circle,20,1/17/2004,28.9783333,-96.645833
4,4,10/10/1960 20:00,kaneohe,hi,us,light,900,1/22/2004,21.4180556,-157.803611


# lets see the columns in detail<br>
- Processing the datetime column
- This is the time of sighting
- We will convert the time to a 12-hour clock and also create the following bins (by hour): midnight (0-6), morning (7-12), afternoon (13-18), evening/night (19-23); an hour value of 24 is also treated as midnight
<br><br>

**Categorising to midnight, morning, afternoon and evening/night:**

In [None]:
def process_timeofday(x):
    """Bucket a "<date> <HH:MM>" string into a coarse part of the day.

    The hour is read from the second whitespace-separated token.

    Returns:
        "midnight" for hours 0-6 and for 24, "morning" for 7-12,
        "afternoon" for 13-18, "evening/night" for 19-23, and None for
        any other hour value.
    """
    hour = int(x.split()[1].split(":")[0])

    if hour == 24 or 0 <= hour <= 6:
        return "midnight"
    if 6 < hour <= 12:
        return "morning"
    if 12 < hour <= 18:
        return "afternoon"
    if 18 < hour <= 23:
        return "evening/night"
    return None

In [None]:
# Label every sighting with its part-of-day bucket.
df["time_of_day"] = [process_timeofday(stamp) for stamp in df["datetime"]]
df.head()

Unnamed: 0,id,datetime,city,state,country,shape,duration (seconds),date posted,latitude,longitude,time_of_day
0,0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,4/27/2004,29.8830556,-97.941111,evening/night
1,1,10/10/1949 21:00,lackland afb,tx,,light,7200,12/16/2005,29.38421,-98.581082,evening/night
2,2,10/10/1955 17:00,chester (uk/england),,gb,circle,20,1/21/2008,53.2,-2.916667,afternoon
3,3,10/10/1956 21:00,edna,tx,us,circle,20,1/17/2004,28.9783333,-96.645833,evening/night
4,4,10/10/1960 20:00,kaneohe,hi,us,light,900,1/22/2004,21.4180556,-157.803611,evening/night


**Converting 24hr format to 12 hr format:**

In [None]:
def process_time(x):
    """Convert the time in a "<date> <HH:MM>" string to a 12-hour clock string.

    Returns:
        The original time token followed by " am" for hours 1-11,
        "12:<MM> am" for hour 0 or 24, "12:<MM> pm" for hour 12,
        "<H-12>:<MM> pm" for hours 13-23, and None for any other hour.
    """
    tim = x.split()[1]
    pieces = tim.split(":")
    hour_text, mins = pieces[0], pieces[1]
    hour = int(hour_text)

    if 0 < hour < 12:
        # Morning hours keep the token exactly as it was written.
        return tim + " am"
    if hour in (0, 24):
        return "12:" + str(mins) + " am"
    if hour == 12:
        return "12:" + str(mins) + " pm"
    if 12 < hour <= 23:
        return str(hour - 12) + ":" + str(mins) + " pm"
    return None

In [None]:
# Build the 12-hour time string, then break it into meridiem, hour and minute.
df["time"] = df["datetime"].apply(process_time)

# Each entry is split once into [clock, meridiem], e.g. ["8:30", "pm"].
clock_and_meridiem = [stamp.split() for stamp in df["time"]]
df['am/pm'] = [parts[1] for parts in clock_and_meridiem]
df['hr'] = [parts[0].split(":")[0] for parts in clock_and_meridiem]
df['min'] = [parts[0].split(":")[1] for parts in clock_and_meridiem]

In [None]:
df.head()

Unnamed: 0,id,datetime,city,state,country,shape,duration (seconds),date posted,latitude,longitude,time_of_day,time,am/pm,hr,min
0,0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,4/27/2004,29.8830556,-97.941111,evening/night,8:30 pm,pm,8,30
1,1,10/10/1949 21:00,lackland afb,tx,,light,7200,12/16/2005,29.38421,-98.581082,evening/night,9:00 pm,pm,9,0
2,2,10/10/1955 17:00,chester (uk/england),,gb,circle,20,1/21/2008,53.2,-2.916667,afternoon,5:00 pm,pm,5,0
3,3,10/10/1956 21:00,edna,tx,us,circle,20,1/17/2004,28.9783333,-96.645833,evening/night,9:00 pm,pm,9,0
4,4,10/10/1960 20:00,kaneohe,hi,us,light,900,1/22/2004,21.4180556,-157.803611,evening/night,8:00 pm,pm,8,0


In [None]:
# The combined "time" string has been split into am/pm, hr and min above.
df.drop(columns="time", inplace=True)

**Also lets make some features from the date column. We can make 4 columns from date: Year, Date, Month, Dayofweek**<br>isoweekday() function starts from monday=1 while weekday() starts from monday=0.<br>So we will use isoweekday(), hence mon = 1, tues = 2, .... , sun = 7

In [None]:
 df["year"] = df["datetime"].apply(lambda x: datetime.datetime.strptime(x.split()[0], "%m/%d/%Y").date().year)
df["date"] = df["datetime"].apply(lambda x: datetime.datetime.strptime(x.split()[0], "%m/%d/%Y").date().day)
df["month"] = df["datetime"].apply(lambda x: datetime.datetime.strptime(x.split()[0], "%m/%d/%Y").date().month)
df["dayofweek"] = df["datetime"].apply(lambda x: datetime.datetime.strptime(x.split()[0], "%m/%d/%Y").date().isoweekday()) 

#from date time column we will only keep the date part. We have already converted time to 12 hr clock
df["datetime"] = df["datetime"].apply(lambda x: x.split()[0])

In [None]:
df.head()

Unnamed: 0,id,datetime,city,state,country,shape,duration (seconds),date posted,latitude,longitude,time_of_day,am/pm,hr,min,year,date,month,dayofweek
0,0,10/10/1949,san marcos,tx,us,cylinder,2700,4/27/2004,29.8830556,-97.941111,evening/night,pm,8,30,1949,10,10,1
1,1,10/10/1949,lackland afb,tx,,light,7200,12/16/2005,29.38421,-98.581082,evening/night,pm,9,0,1949,10,10,1
2,2,10/10/1955,chester (uk/england),,gb,circle,20,1/21/2008,53.2,-2.916667,afternoon,pm,5,0,1955,10,10,1
3,3,10/10/1956,edna,tx,us,circle,20,1/17/2004,28.9783333,-96.645833,evening/night,pm,9,0,1956,10,10,3
4,4,10/10/1960,kaneohe,hi,us,light,900,1/22/2004,21.4180556,-157.803611,evening/night,pm,8,0,1960,10,10,1


**Note that: we have 2 different dates. One date is the date of sighting and the other date is the date it was reported to the authorities. Using this information we can make another feature i.e difference in time of sighting and reporting (in days). Lets call this feature "reported after" i.e "reported after how many days?"** 

In [None]:
# Absolute gap, in days, between the sighting date and the date it was posted.
date_format = "%m/%d/%Y"
reported_after = []
for sighted, posted in zip(df["datetime"], df["date posted"]):
    sighted_on = datetime.datetime.strptime(sighted, date_format).date()
    posted_on = datetime.datetime.strptime(posted, date_format).date()
    reported_after.append(abs(sighted_on - posted_on).days)

df["reported after (days)"] = reported_after

In [None]:
df.head()

Unnamed: 0,id,datetime,city,state,country,shape,duration (seconds),date posted,latitude,longitude,time_of_day,am/pm,hr,min,year,date,month,dayofweek,reported after (days)
0,0,10/10/1949,san marcos,tx,us,cylinder,2700,4/27/2004,29.8830556,-97.941111,evening/night,pm,8,30,1949,10,10,1,19923
1,1,10/10/1949,lackland afb,tx,,light,7200,12/16/2005,29.38421,-98.581082,evening/night,pm,9,0,1949,10,10,1,20521
2,2,10/10/1955,chester (uk/england),,gb,circle,20,1/21/2008,53.2,-2.916667,afternoon,pm,5,0,1955,10,10,1,19096
3,3,10/10/1956,edna,tx,us,circle,20,1/17/2004,28.9783333,-96.645833,evening/night,pm,9,0,1956,10,10,3,17265
4,4,10/10/1960,kaneohe,hi,us,light,900,1/22/2004,21.4180556,-157.803611,evening/night,pm,8,0,1960,10,10,1,15809


**I think we would not need datetime columns any more. Lets delete them.**

In [None]:
# Both raw date columns have been turned into derived features above.
df.drop(columns=["datetime", "date posted"], inplace=True)

In [None]:
df.head()

Unnamed: 0,id,city,state,country,shape,duration (seconds),latitude,longitude,time_of_day,am/pm,hr,min,year,date,month,dayofweek,reported after (days)
0,0,san marcos,tx,us,cylinder,2700,29.8830556,-97.941111,evening/night,pm,8,30,1949,10,10,1,19923
1,1,lackland afb,tx,,light,7200,29.38421,-98.581082,evening/night,pm,9,0,1949,10,10,1,20521
2,2,chester (uk/england),,gb,circle,20,53.2,-2.916667,afternoon,pm,5,0,1955,10,10,1,19096
3,3,edna,tx,us,circle,20,28.9783333,-96.645833,evening/night,pm,9,0,1956,10,10,3,17265
4,4,kaneohe,hi,us,light,900,21.4180556,-157.803611,evening/night,pm,8,0,1960,10,10,1,15809


- Processing latitude and longitude
- Latitude column seemed a little fishy because its datatype is object (it must be numeric)
- One more thing: the longitude column is not recognized by name because its header has a trailing space (see below). We'll fix that as well.

In [None]:
df.columns

Index(['id', 'city', 'state', 'country', 'shape', 'duration (seconds)',
       'latitude', 'longitude ', 'time_of_day', 'am/pm', 'hr', 'min', 'year',
       'date', 'month', 'dayofweek', 'reported after (days)'],
      dtype='object')

In [None]:
# The header carries a trailing space; rename it to the clean label.
df.rename({"longitude ": "longitude"}, axis="columns", inplace=True)

**We can see that latitude has type object while longitude has type float (rightly so). Let's see if we have any ambiguous values in latitude. There must be string values in latitude.** 


In [None]:
df[["latitude","longitude"]].dtypes

latitude      object
longitude    float64
dtype: object

**On investigating we found that a lot of values in latitude column were type string, even though their value was numerical.**<br><br> In order to make all the values type float in latitude column, we used astype but it showed error on one of the records where the latitude was alpha numeric. From there we realised that we not only have float values typed as string but we may also have alpha numeric latitude values. Latitudes cannot be alpha numeric hence there must be a typing mistake. Lets see what all values are alpha numeric. 


In [None]:
def split(word):
    """Return the alphabetic characters found in ``word``, in order.

    An empty list means the text contains no letters. Any letter counts
    (upper or lower case), so a value such as "33Q.2" is reported as well
    as "33q.2".
    """
    return [char for char in word if char.isalpha()]
        

# Walk the latitude column and show every row whose value contains letters.
for position, raw_latitude in enumerate(df["latitude"]):
    if len(split(str(raw_latitude))) == 0:
        continue
    print("at index "+str(position)+" we have a latitude that is alpha numeric")
    display(df[df.index==position])
    print("")

at index 43782 we have a latitude that is alpha numeric


Unnamed: 0,id,city,state,country,shape,duration (seconds),latitude,longitude,time_of_day,am/pm,hr,min,year,date,month,dayofweek,reported after (days)
43782,43782,mescalero indian reservation,nm,,rectangle,180,33q.200088,-105.624152,midnight,am,5,30,1974,22,5,3,13846





**Surprisingly, just one record is invalid. Other than this, all values are numerical but stored as strings.** Let's see the true latitude and longitude for this record, fix it, and convert this column to type float.


In [None]:
# Look up the coordinates of the place named in the invalid record via geopy's
# Nominatim client. This performs a network request.
from  geopy.geocoders import Nominatim
geolocator = Nominatim(user_agent="myGeocoder")
# timeout=None is passed through to geopy as-is.
# NOTE(review): geocode presumably returns None when nothing matches, which
# would make the attribute access below fail - confirm and guard if needed.
loc = geolocator.geocode("mescalero indian reservation",timeout=None)
print("latitude is: " ,loc.latitude,"\nlongtitude is: " ,loc.longitude)

latitude is:  33.1576919 
longtitude is:  -105.7743515


**So we have the correct latitude. As expected, it was a typing error. We replace the invalid value with 33.1475856 (note that this differs slightly from the geocoder output shown above, 33.1576919). Let's fix it:**

In [None]:
# Overwrite the single malformed latitude. A single .loc assignment is used
# instead of chained indexing (df[col][mask] = ...), which may write to a
# temporary copy and leave the frame unchanged.
# NOTE(review): the geocoder output recorded in this notebook shows 33.1576919,
# not 33.1475856 - confirm which value is intended.
bad_latitude = df["latitude"] == "33q.200088"
df.loc[bad_latitude, "latitude"] = 33.1475856
df[df["latitude"] == 33.1475856]

Unnamed: 0,id,city,state,country,shape,duration (seconds),latitude,longitude,time_of_day,am/pm,hr,min,year,date,month,dayofweek,reported after (days)
43782,43782,mescalero indian reservation,nm,,rectangle,180,33.1476,-105.624152,midnight,am,5,30,1974,22,5,3,13846


In [None]:
# With the malformed value repaired, the whole column can be cast to float.
df["latitude"] = df["latitude"].astype(float)
df.dtypes[["latitude", "longitude"]]

latitude     float64
longitude    float64
dtype: object

**lets process city and country columns:**<br>
- We'll remove the Null values in country and city columns based on latitudes and longitudes provided.
- We'll need to install reverse_geocoder for that.

In [None]:
# pip install reverse_geocoder

Collecting reverse_geocoder
[?25l  Downloading https://files.pythonhosted.org/packages/0b/0f/b7d5d4b36553731f11983e19e1813a1059ad0732c5162c01b3220c927d31/reverse_geocoder-1.5.1.tar.gz (2.2MB)
[K     |████████████████████████████████| 2.3MB 6.4MB/s 
Building wheels for collected packages: reverse-geocoder
  Building wheel for reverse-geocoder (setup.py) ... [?25l[?25hdone
  Created wheel for reverse-geocoder: filename=reverse_geocoder-1.5.1-cp37-none-any.whl size=2268089 sha256=5585d7f649478ecc9dcd4cc80e50c7067c3ff7585aca6b752bc8e379c3131915
  Stored in directory: /root/.cache/pip/wheels/47/05/50/b1350ff094ef91e082665b4a2f9ca551f8acea4aa55d796b26
Successfully built reverse-geocoder
Installing collected packages: reverse-geocoder
Successfully installed reverse-geocoder-1.5.1


In [None]:
import reverse_geocoder as rg

def isNaN(string):
    """Return the result of comparing ``string`` as unequal to itself.

    A float NaN is the usual value that is not equal to itself, so this is
    True for NaN and False for ordinary strings and numbers.
    """
    differs_from_itself = string != string
    return differs_from_itself

# Module-level result stores, keyed by the row's "id".
cntry = {}
cty = {}


def find_countriesncities(row):
    """Reverse-geocode a row's coordinates for whichever of country/city is NaN.

    Results are written into the module-level dicts ``cntry`` (the "cc"
    field of the first match) and ``cty`` (the "name" field of the first
    match), keyed by ``row["id"]``. Nothing is returned.
    """
    coordinates = (row["latitude"], row["longitude"])
    row_id = row["id"]

    if isNaN(row["country"]):
        cntry[row_id] = rg.search(coordinates)[0]["cc"]

    if isNaN(row["city"]):
        cty[row_id] = rg.search(coordinates)[0]["name"]

In [None]:
# df[ (df["country"].isnull()) | (df["city"].isnull()) ][["id","city","country","latitude","longitude"]].apply(find_countriesncities,axis=1)

In [None]:
# with open("/content/drive/My Drive/Colab Notebooks/ufo sightings/country.json", "w") as fp:
#   json.dump(cntry,fp)

# with open("/content/drive/My Drive/Colab Notebooks/ufo sightings/city.json", "w") as fp:
#   json.dump(cty,fp)

In [None]:
# Read the previously saved reverse-geocoding results back from disk.
country_json = "/content/drive/My Drive/ufo sightings/country.json"
city_json = "/content/drive/My Drive/ufo sightings/city.json"

with open(country_json, "r") as country_file:
    cntry = json.load(country_file)

with open(city_json, "r") as city_file:
    cty = json.load(city_file)

In [None]:
print(len(cntry))
print(len(cty))

9670
3


**Filling the cities and countries:**

In [None]:
# Fill country and city from the cached lookups in one vectorised pass each.
# This replaces the chained-indexing assignment (df[col][mask] = ...), which
# may write to a temporary copy, and avoids rescanning the whole frame once
# per cached id.
# JSON object keys are strings, so they are converted back to integer ids.
country_by_id = {int(idd): cnt.lower() for idd, cnt in cntry.items()}
city_by_id = {int(idd): cit.lower() for idd, cit in cty.items()}

# Rows with a cached value take it; every other row keeps its current value.
df["country"] = df["id"].map(country_by_id).fillna(df["country"])
df["city"] = df["id"].map(city_by_id).fillna(df["city"])

**And.. we have removed the NaN values in country and city columns**

In [None]:
df.isnull().sum()

id                          0
city                        0
state                    5797
country                     0
shape                    1932
duration (seconds)          0
latitude                    0
longitude                   0
time_of_day                 0
am/pm                       0
hr                          0
min                         0
year                        0
date                        0
month                       0
dayofweek                   0
reported after (days)       0
dtype: int64

**Also, we will drop the state column. We don't really need it. We could have just as easily filled the missing state values like we filled cities and countries, using latitudes and longitudes... but with cities, countries, latitudes and longitudes, we have pretty good information on the geography of the sightings. Hence we can delete the state column.**

In [None]:
# Remove the state column from the frame.
df.drop(columns="state", inplace=True)

In [None]:
df.shape

(80332, 16)

**Processing shape column:** 
- It has 1932 missing values. We already have an "unknown" label in this column, hence we will replace NaNs with "unknown"


In [None]:
df.isnull().sum()

id                          0
city                        0
country                     0
shape                    1932
duration (seconds)          0
latitude                    0
longitude                   0
time_of_day                 0
am/pm                       0
hr                          0
min                         0
year                        0
date                        0
month                       0
dayofweek                   0
reported after (days)       0
dtype: int64

In [None]:
# Missing shapes are filled with the literal label "unknown".
shape_filled = df["shape"].fillna("unknown")
df["shape"] = shape_filled

In [None]:
df["shape"].value_counts()

light        16565
triangle      7865
circle        7608
unknown       7516
fireball      6208
other         5649
sphere        5387
disk          5213
oval          3733
formation     2457
cigar         2057
changing      1962
flash         1328
rectangle     1297
cylinder      1283
diamond       1178
chevron        952
egg            759
teardrop       750
cone           316
cross          233
delta            7
crescent         2
round            2
changed          1
flare            1
pyramid          1
hexagon          1
dome             1
Name: shape, dtype: int64

**There is one more thing that I should point out.**<br>
We have city names that are written in different ways. On exploring this column, we found that for many city names the text inside the brackets is either different for the same city or irrelevant... or it is just the name of the country, which we have already captured. Due to this, the city column does not have unique labels: the same city appears under several different names.<br> **E.g., here we have grouped by country and city. We can see that Abu Dhabi, although it refers to the same city, is treated as a whole new label each time it is written in a slightly different way. Similarly, we can see multiple instances of Dubai, Sharjah and many more.**

In [None]:
# List the distinct (country, city) pairs; the group sizes are discarded.
pair_sizes = df[["country", "city"]].groupby(by=["country", "city"]).size()
df_grp = pair_sizes.reset_index().iloc[:, :2]
df_grp[:50]

Unnamed: 0,country,city
0,ae,abu dhabi (u. a. e.)
1,ae,abu dhabi (u.a.e.)
2,ae,abu dhabi (uae)
3,ae,abu dhabi (united arab emirates)
4,ae,ajman (united arab emirates)
5,ae,al warqaa 1 (u.a.r.)
6,ae,al-ain (uae)
7,ae,dubai
8,ae,dubai (u.a.r.)
9,ae,dubai (uae)


**To fix this we will remove the bracket parts of the cities:**

In [None]:
# Keep only the text before the first "(" and lower-case it. The strip()
# removes the space left in front of the bracket, so "dubai (uae)" becomes
# "dubai" rather than "dubai " and merges with the plain "dubai" label.
df["city"] = df["city"].apply(lambda x: x.split("(")[0].strip().lower())

In [None]:
df.head()

Unnamed: 0,id,city,country,shape,duration (seconds),latitude,longitude,time_of_day,am/pm,hr,min,year,date,month,dayofweek,reported after (days)
0,0,san marcos,us,cylinder,2700,29.883056,-97.941111,evening/night,pm,8,30,1949,10,10,1,19923
1,1,lackland afb,us,light,7200,29.38421,-98.581082,evening/night,pm,9,0,1949,10,10,1,20521
2,2,chester,gb,circle,20,53.2,-2.916667,afternoon,pm,5,0,1955,10,10,1,19096
3,3,edna,us,circle,20,28.978333,-96.645833,evening/night,pm,9,0,1956,10,10,3,17265
4,4,kaneohe,us,light,900,21.418056,-157.803611,evening/night,pm,8,0,1960,10,10,1,15809


**Last thing. There are 3 columns that have numerical values and must not have data type 'object'. These columns are duration, min and hr**

In [None]:
df.dtypes

id                         int64
city                      object
country                   object
shape                     object
duration (seconds)        object
latitude                 float64
longitude                float64
time_of_day               object
am/pm                     object
hr                        object
min                       object
year                       int64
date                       int64
month                      int64
dayofweek                  int64
reported after (days)      int64
dtype: object

**While converting to float, we encountered these 3 records in the duration column that were strings with a stray backtick (`) after the number. Let's fix them and convert their type:**


In [None]:
# Repair the three duration strings that end in a stray backtick. A single
# .loc assignment is used instead of chained indexing (df[col][mask] = ...),
# which may write to a temporary copy and leave the frame unchanged.
duration_fixes = {"0.5`": 0.5, "2`": 2, "8`": 8}
for bad_value, fixed_value in duration_fixes.items():
    df.loc[df["duration (seconds)"] == bad_value, "duration (seconds)"] = fixed_value

In [None]:
# Cast the three numeric-looking object columns to proper numeric dtypes.
target_dtypes = (
    ("duration (seconds)", "float"),
    ("min", "int32"),
    ("hr", "int32"),
)
for column_name, dtype_name in target_dtypes:
    df[column_name] = df[column_name].astype(dtype_name)

In [None]:
df.dtypes

id                         int64
city                      object
country                   object
shape                     object
duration (seconds)       float64
latitude                 float64
longitude                float64
time_of_day               object
am/pm                     object
hr                         int32
min                        int32
year                       int64
date                       int64
month                      int64
dayofweek                  int64
reported after (days)      int64
dtype: object

# Here we complete the pre-processing. Lets save the processed data. In the next part we will explore the data

In [None]:
# df.to_csv('/content/drive/My Drive/Colab Notebooks/ufo sightings/pre-processed.csv',index=False)