# Data Rationalisation

This notebook prepares and inserts data from a `.csv` source into an existing database table.

## Imports

In [72]:
import pandas as pd
from redshift_connector import connect

## Setup

In [73]:
def get_db_connection():
    return connect(host="c17-redshift-cluster.cdq12ms5gjyk.eu-west-2.redshift.amazonaws.com",
                   user="admin",
                   password="Password1",
                   database="dw_air_travel",
                   port=5439)

conn = get_db_connection()

In [74]:
def get_state_mapping(conn) -> dict:
    """Returns a mapping of state codes and DB IDs."""

    with conn.cursor() as cur:
        cur.execute("SELECT state_code, state_id FROM s_coach_dan.state;")
        data = cur.fetchall()

    mapping = {}
    for row in data:
        mapping[row[0]] = row[1]

    return mapping

def get_shape_mapping(conn) -> dict:
    """Returns a mapping of shape names and DB IDs."""

    with conn.cursor() as cur:
        cur.execute("SELECT ufo_shape_name, ufo_shape_id FROM ufo.ufo_shape;")
        data = cur.fetchall()

    mapping = {}
    for row in data:
        mapping[row[0]] = row[1]

    return mapping

## Data Sourcing

In [75]:
sightings = pd.read_csv("ufo_data.csv", sep="\t")

In [76]:
sightings.head(2)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,summary,city,state,date_time,shape,duration,stats,report_link,text,posted,city_latitude,city_longitude
0,0,0,My wife was driving southeast on a fairly popu...,Chester,VA,2019-12-12T18:43:00,light,"5 seconds, 0 minutes, 0 hours",Occurred : 12/12/2019 18:43 (Entered as : 12/...,http://www.nuforc.org/webreports/151/S151739.html,My wife was driving southeast on a fairly popu...,2019-12-22T00:00:00,37.343152,-77.408582
1,1,1,I think that I may caught a UFO on the NBC Nig...,Rocky Hill,CT,2019-03-22T18:30:00,circle,"5 seconds, 0 minutes, 0 hours",Occurred : 3/22/2019 18:30 (Entered as : 03/2...,http://www.nuforc.org/webreports/145/S145297.html,I think that I may caught a UFO on the NBC Nig...,2019-03-29T00:00:00,41.6648,-72.6393


## Data Rationalisation/Cleaning

In [77]:
sightings = sightings.drop(columns=["Unnamed: 0.1", "Unnamed: 0", "city", "city_latitude", "city_longitude", "report_link",
                                    "posted", "summary", "stats"])

In [78]:
state_mapping = get_state_mapping(conn)
shape_mapping = get_shape_mapping(conn)

In [79]:
sightings = sightings.dropna()

In [80]:
sightings["shape"] = sightings["shape"].map(shape_mapping)
sightings["state"] = sightings["state"].map(state_mapping)

In [82]:
sightings

Unnamed: 0,state,date_time,shape,duration,text
0,46.0,2019-12-12T18:43:00,1,"5 seconds, 0 minutes, 0 hours",My wife was driving southeast on a fairly popu...
1,7.0,2019-03-22T18:30:00,2,"5 seconds, 0 minutes, 0 hours",I think that I may caught a UFO on the NBC Nig...
3,,2019-04-17T02:00:00,4,"10 seconds, 0 minutes, 0 hours",I was driving towards the intersection of fall...
4,32.0,2009-03-15T18:00:00,5,"0 seconds, 2 minutes, 0 hours","In Peoria, Arizona, I saw a cigar shaped craft..."
5,60.0,2019-04-02T20:25:00,6,"0 seconds, 15 minutes, 0 hours","The object has flashing lights that are green,..."
...,...,...,...,...,...
88120,3.0,2019-10-02T20:00:00,11,"0 seconds, 3 minutes, 0 hours",4 lights in formation over Tempe appear while ...
88121,25.0,2019-10-02T20:00:00,1,"20 seconds, 0 minutes, 0 hours",2 bright star like lights in the NNW sky two b...
88122,9.0,2019-10-02T20:03:00,11,"20 seconds, 0 minutes, 0 hours",10/2/19 @ 8:03PM EST UFO SIGHTING in the 3428...
88123,33.0,2019-10-02T22:00:00,16,"0 seconds, 2 minutes, 0 hours","Witnessed an orange, slow moving light. Was lo..."


In [None]:
conn.close()